Diffstat (limited to 'src/backend/storage/lmgr')
-rw-r--r--  src/backend/storage/lmgr/.gitignore                 2
-rw-r--r--  src/backend/storage/lmgr/Makefile                  51
-rw-r--r--  src/backend/storage/lmgr/README                   739
-rw-r--r--  src/backend/storage/lmgr/README-SSI               646
-rw-r--r--  src/backend/storage/lmgr/README.barrier           197
-rw-r--r--  src/backend/storage/lmgr/condition_variable.c     364
-rw-r--r--  src/backend/storage/lmgr/deadlock.c              1177
-rw-r--r--  src/backend/storage/lmgr/generate-lwlocknames.pl   71
-rw-r--r--  src/backend/storage/lmgr/lmgr.c                  1196
-rw-r--r--  src/backend/storage/lmgr/lock.c                  4738
-rw-r--r--  src/backend/storage/lmgr/lwlock.c                1977
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.c             52
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.h             50
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.txt           55
-rw-r--r--  src/backend/storage/lmgr/predicate.c             5203
-rw-r--r--  src/backend/storage/lmgr/proc.c                  2012
-rw-r--r--  src/backend/storage/lmgr/s_lock.c                 377
-rw-r--r--  src/backend/storage/lmgr/spin.c                   180
18 files changed, 19087 insertions, 0 deletions
diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore
new file mode 100644
index 0000000..9355cae
--- /dev/null
+++ b/src/backend/storage/lmgr/.gitignore
@@ -0,0 +1,2 @@
+/lwlocknames.c
+/lwlocknames.h
diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile
new file mode 100644
index 0000000..829b792
--- /dev/null
+++ b/src/backend/storage/lmgr/Makefile
@@ -0,0 +1,51 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/lmgr
+#
+# IDENTIFICATION
+# src/backend/storage/lmgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/lmgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ condition_variable.o \
+ deadlock.o \
+ lmgr.o \
+ lock.o \
+ lwlock.o \
+ lwlocknames.o \
+ predicate.o \
+ proc.o \
+ s_lock.o \
+ spin.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+ifdef TAS
+TASPATH = $(top_builddir)/src/backend/port/tas.o
+endif
+
+s_lock_test: s_lock.c $(top_builddir)/src/port/libpgport.a
+ $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \
+ $(TASPATH) -L $(top_builddir)/src/port -lpgport -o s_lock_test
+
+# see notes in src/backend/parser/Makefile
+lwlocknames.c: lwlocknames.h
+ touch $@
+
+lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl
+ $(PERL) $(srcdir)/generate-lwlocknames.pl $<
+
+check: s_lock_test
+ ./s_lock_test
+
+clean distclean:
+ rm -f s_lock_test
+
+maintainer-clean: clean
+ rm -f lwlocknames.h lwlocknames.c
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README
new file mode 100644
index 0000000..c96cc7b
--- /dev/null
+++ b/src/backend/storage/lmgr/README
@@ -0,0 +1,739 @@
+src/backend/storage/lmgr/README
+
+Locking Overview
+================
+
+Postgres uses four types of interprocess locks:
+
+* Spinlocks. These are intended for *very* short-term locks. If a lock
+is to be held more than a few dozen instructions, or across any sort of
+kernel call (or even a call to a nontrivial subroutine), don't use a
+spinlock. Spinlocks are primarily used as infrastructure for lightweight
+locks. They are implemented using a hardware atomic-test-and-set
+instruction, if available. Waiting processes busy-loop until they can
+get the lock. There is no provision for deadlock detection, automatic
+release on error, or any other nicety. There is a timeout if the lock
+cannot be gotten after a minute or so (which is approximately forever in
+comparison to the intended lock hold time, so this is certainly an error
+condition).
+
+* Lightweight locks (LWLocks). These locks are typically used to
+interlock access to datastructures in shared memory. LWLocks support
+both exclusive and shared lock modes (for read/write and read-only
+access to a shared object). There is no provision for deadlock
+detection, but the LWLock manager will automatically release held
+LWLocks during elog() recovery, so it is safe to raise an error while
+holding LWLocks. Obtaining or releasing an LWLock is quite fast (a few
+dozen instructions) when there is no contention for the lock. When a
+process has to wait for an LWLock, it blocks on a SysV semaphore so as
+to not consume CPU time. Waiting processes will be granted the lock in
+arrival order. There is no timeout.
+
+* Regular locks (a/k/a heavyweight locks). The regular lock manager
+supports a variety of lock modes with table-driven semantics, and it has
+full deadlock detection and automatic release at transaction end.
+Regular locks should be used for all user-driven lock requests.
+
+* SIReadLock predicate locks. See separate README-SSI file for details.
+
+Acquisition of either a spinlock or a lightweight lock causes query
+cancel and die() interrupts to be held off until all such locks are
+released. No such restriction exists for regular locks, however. Also
+note that we can accept query cancel and die() interrupts while waiting
+for a regular lock, but we will not accept them while waiting for
+spinlocks or LW locks. It is therefore not a good idea to use LW locks
+when the wait time might exceed a few seconds.
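+
+To make the calling conventions concrete, here is a minimal sketch of how
+the two lighter-weight lock types are typically used.  The struct and
+function names are hypothetical; only SpinLockAcquire/SpinLockRelease and
+LWLockAcquire/LWLockRelease are real APIs (see storage/spin.h and
+storage/lwlock.h):
+
+	#include "postgres.h"
+	#include "storage/lwlock.h"
+	#include "storage/spin.h"
+
+	typedef struct MySharedCounter		/* hypothetical shared struct */
+	{
+		slock_t		mutex;				/* spinlock protecting "value" */
+		uint64		value;
+	} MySharedCounter;
+
+	static void
+	bump_counter(volatile MySharedCounter *cnt)
+	{
+		/* spinlock: a handful of instructions, no elog(), no kernel calls */
+		SpinLockAcquire(&cnt->mutex);
+		cnt->value++;
+		SpinLockRelease(&cnt->mutex);
+	}
+
+	static void
+	update_shared_state(LWLock *lock)
+	{
+		/* LWLock: may sleep; released automatically during elog() recovery */
+		LWLockAcquire(lock, LW_EXCLUSIVE);
+		/* ... examine or modify the protected shared data structure ... */
+		LWLockRelease(lock);
+	}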
+
+The rest of this README file discusses the regular lock manager in detail.
+
+
+Lock Data Structures
+--------------------
+
+Lock methods describe the overall locking behavior. Currently there are
+two lock methods: DEFAULT and USER.
+
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets. See src/include/storage/lock.h for more details.
+(Lock modes are also called lock types in some places in the code and
+documentation.)
+
+There are two main methods for recording locks in shared memory. The primary
+mechanism uses two main structures: the per-lockable-object LOCK struct, and
+the per-lock-and-requestor PROCLOCK struct. A LOCK object exists for each
+lockable object that currently has locks held or requested on it. A PROCLOCK
+struct exists for each backend that is holding or requesting lock(s) on each
+LOCK object.
+
+There is also a special "fast path" mechanism which backends may use to
+record a limited number of locks with very specific characteristics: they must
+use the DEFAULT lockmethod; they must represent a lock on a database relation
+(not a shared relation), they must be a "weak" lock which is unlikely to
+conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system
+must be able to quickly verify that no conflicting locks could possibly be
+present. See "Fast Path Locking", below, for more details.
+
+Each backend also maintains an unshared LOCALLOCK structure for each lockable
+object and lock mode that it is currently holding or requesting. The shared
+lock structures only allow a single lock grant to be made per lockable
+object/lock mode/backend. Internally to a backend, however, the same lock may
+be requested and perhaps released multiple times in a transaction, and it can
+also be held both transactionally and session-wide. The internal request
+counts are held in LOCALLOCK so that the shared data structures need not be
+accessed to alter them.
+
+---------------------------------------------------------------------------
+
+The lock manager's LOCK objects contain:
+
+tag -
+ The key fields that are used for hashing locks in the shared memory
+ lock hash table. The contents of the tag essentially define an
+ individual lockable object. See include/storage/lock.h for details
+ about the supported types of lockable objects. This is declared as
+ a separate struct to ensure that we always zero out the correct number
+ of bytes. It is critical that any alignment-padding bytes the compiler
+ might insert in the struct be zeroed out, else the hash computation
+ will be random. (Currently, we are careful to define struct LOCKTAG
+ so that there are no padding bytes.)
+
+grantMask -
+ This bitmask indicates what types of locks are currently held on the
+ given lockable object. It is used (against the lock table's conflict
+ table) to determine if a new lock request will conflict with existing
+ lock types held. Conflicts are determined by bitwise AND operations
+ between the grantMask and the conflict table entry for the requested
+ lock type. Bit i of grantMask is 1 if and only if granted[i] > 0.
+
+waitMask -
+ This bitmask shows the types of locks being waited for. Bit i of waitMask
+ is 1 if and only if requested[i] > granted[i].
+
+procLocks -
+ This is a shared memory queue of all the PROCLOCK structs associated with
+ the lock object. Note that both granted and waiting PROCLOCKs are in this
+ list (indeed, the same PROCLOCK might have some already-granted locks and
+ be waiting for more!).
+
+waitProcs -
+ This is a shared memory queue of all PGPROC structures corresponding to
+ backends that are waiting (sleeping) until another backend releases this
+ lock. The process structure holds the information needed to determine
+ if it should be woken up when the lock is released.
+
+nRequested -
+ Keeps a count of how many times acquisition of this lock has been
+ attempted. The count includes attempts by processes which were put
+ to sleep due to conflicts. It also counts the same backend twice
+ if, for example, a backend process first acquires a read and then
+ acquires a write. (But multiple acquisitions of the same lock/lock mode
+ within a backend are not multiply counted here; they are recorded
+ only in the backend's LOCALLOCK structure.)
+
+requested -
+ Keeps a count of how many locks of each type have been requested. Only
+ elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock
+ type defined constants. Summing the values of requested[] should come out
+ equal to nRequested.
+
+nGranted -
+ Keeps count of how many times this lock has been successfully acquired.
+ This count does not include attempts that are waiting due to conflicts.
+ Otherwise the counting rules are the same as for nRequested.
+
+granted -
+ Keeps count of how many locks of each type are currently held. Once again
+ only elements 1 through MAX_LOCKMODES-1 are used (0 is not). Also, like
+ requested[], summing the values of granted[] should total to the value
+ of nGranted.
+
+We should always have 0 <= nGranted <= nRequested, and
+0 <= granted[i] <= requested[i] for each i. When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
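+
+To make the use of grantMask concrete, the heart of conflict checking is a
+bitwise test of the requested mode's conflict mask against the modes already
+granted.  A simplified sketch (not the exact code in lock.c, whose
+LockCheckConflicts also subtracts the requester's own holds, and its lock
+group's, before testing, since a process never conflicts with itself):
+
+	/* Would granting "lockmode" conflict with the modes already granted? */
+	static bool
+	mode_conflicts_with_granted(const LockMethodData *lockMethodTable,
+								LOCKMODE lockmode, const LOCK *lock)
+	{
+		/*
+		 * conflictTab[lockmode] is a bitmask of modes that conflict with
+		 * "lockmode"; grantMask has bit i set iff granted[i] > 0.
+		 */
+		return (lockMethodTable->conflictTab[lockmode] & lock->grantMask) != 0;
+	}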
+
+---------------------------------------------------------------------------
+
+The lock manager's PROCLOCK objects contain:
+
+tag -
+ The key fields that are used for hashing entries in the shared memory
+ PROCLOCK hash table. This is declared as a separate struct to ensure that
+ we always zero out the correct number of bytes. It is critical that any
+ alignment-padding bytes the compiler might insert in the struct be zeroed
+ out, else the hash computation will be random. (Currently, we are careful
+ to define struct PROCLOCKTAG so that there are no padding bytes.)
+
+ tag.myLock
+ Pointer to the shared LOCK object this PROCLOCK is for.
+
+ tag.myProc
+ Pointer to the PGPROC of backend process that owns this PROCLOCK.
+
+ Note: it's OK to use pointers here because a PROCLOCK never outlives
+ either its lock or its proc. The tag is therefore unique for as long
+ as it needs to be, even though the same tag values might mean something
+ else at other times.
+
+holdMask -
+ A bitmask for the lock modes successfully acquired by this PROCLOCK.
+ This should be a subset of the LOCK object's grantMask, and also a
+ subset of the PGPROC object's heldLocks mask (if the PGPROC is
+ currently waiting for another lock mode on this lock).
+
+releaseMask -
+ A bitmask for the lock modes due to be released during LockReleaseAll.
+ This must be a subset of the holdMask. Note that it is modified without
+ taking the partition LWLock, and therefore it is unsafe for any
+ backend except the one owning the PROCLOCK to examine/change it.
+
+lockLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same LOCK.
+
+procLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same backend.
+
+---------------------------------------------------------------------------
+
+
+Lock Manager Internal Locking
+-----------------------------
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock. Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value. The partition's LWLock is considered to protect all the
+LOCK objects of that partition as well as their subsidiary PROCLOCKs.
+
+* The shared-memory hash tables for LOCKs and PROCLOCKs are organized
+so that different partitions use different hash chains, and thus there
+is no conflict in working with objects in different partitions. This
+is supported directly by dynahash.c's "partitioned table" mechanism
+for the LOCK table: we need only ensure that the partition number is
+taken from the low-order bits of the dynahash hash value for the LOCKTAG.
+To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash
+value has the same low-order bits as its associated LOCK. This requires
+a specialized hash function (see proclock_hash).
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock. (This rule allows one backend to manipulate another backend's
+PROCLOCK lists, which was not originally necessary but is now required in
+connection with fast-path locking; see below.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock. Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order. (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.) It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned. We do store
+a copy of the locktag hash code in LOCALLOCK table entries, from which the
+partition number can be computed, but this is a straight speed-for-space
+tradeoff: we could instead recalculate the partition number from the LOCKTAG
+when needed.
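+
+For reference, the mapping from a locktag's hash code to its partition is
+just the hash value's low-order bits, along the lines of the LockHashPartition
+macro in lock.h (sketched here; details may differ slightly across versions):
+
+	/* hashcode is LockTagHashCode(&locktag); NUM_LOCK_PARTITIONS is a power of 2 */
+	#define LockHashPartition(hashcode) \
+		((hashcode) % NUM_LOCK_PARTITIONS)
+
+so recomputing the partition from a LOCKTAG would only cost an extra hash
+computation; caching the hash code in LOCALLOCK simply avoids repeating it.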
+
+
+Fast Path Locking
+-----------------
+
+Fast path locking is a special purpose mechanism designed to reduce the
+overhead of taking and releasing certain types of locks which are taken
+and released very frequently but rarely conflict. Currently, this includes
+two categories of locks:
+
+(1) Weak relation locks. SELECT, INSERT, UPDATE, and DELETE must acquire a
+lock on every relation they operate on, as well as various system catalogs
+that can be used internally. Many DML operations can proceed in parallel
+against the same table at the same time; only DDL operations such as
+CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE
+-- will create lock conflicts with the "weak" locks (AccessShareLock,
+RowShareLock, RowExclusiveLock) acquired by DML operations.
+
+(2) VXID locks. Every transaction takes a lock on its own virtual
+transaction ID. Currently, the only operations that wait for these locks
+are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict),
+so most VXID locks are taken and released by the owner without anyone else
+needing to care.
+
+The primary locking mechanism does not cope well with this workload. Even
+though the lock manager locks are partitioned, the locktag for any given
+relation still falls in one, and only one, partition. Thus, if many short
+queries are accessing the same relation, the lock manager partition lock for
+that partition becomes a contention bottleneck. This effect is measurable
+even on 2-core servers, and becomes very pronounced as core count increases.
+
+To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is
+permitted to record a limited number of locks on unshared relations in an
+array within its PGPROC structure, rather than using the primary lock table.
+This mechanism can only be used when the locker can verify that no conflicting
+locks exist at the time of taking the lock.
+
+A key point of this algorithm is that it must be possible to verify the
+absence of possibly conflicting locks without fighting over a shared LWLock or
+spinlock. Otherwise, this effort would simply move the contention bottleneck
+from one place to another. We accomplish this using an array of 1024 integer
+counters, which are in effect a 1024-way partitioning of the lock space.
+Each counter records the number of "strong" locks (that is, ShareLock,
+ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared
+relations that fall into that partition. When this counter is non-zero, the
+fast path mechanism may not be used to take new relation locks within that
+partition. A strong locker bumps the counter and then scans each per-backend
+array for matching fast-path locks; any which are found must be transferred to
+the primary lock table before attempting to acquire the lock, to ensure proper
+lock conflict and deadlock detection.
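+
+In outline, the strong-locker side of this protocol looks roughly like the
+following (a simplified sketch of logic in lock.c; FastPathStrongRelationLocks,
+FastPathStrongLockHashPartition, and FastPathTransferRelationLocks are the
+real names, but the surrounding code is abbreviated):
+
+	/* acquiring a "strong" lock on an unshared relation */
+	fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+	SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+	FastPathStrongRelationLocks->count[fasthashcode]++;	/* disable fast path */
+	SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+
+	/*
+	 * Then visit each backend in turn, take its fast-path LWLock, and move
+	 * any matching fast-path entries for this relation into the main lock
+	 * table so that normal conflict and deadlock detection sees them.
+	 */
+	FastPathTransferRelationLocks(lockMethodTable, locktag, hashcode);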
+
+On an SMP system, we must guarantee proper memory synchronization. Here we
+rely on the fact that LWLock acquisition acts as a memory sequence point: if
+A performs a store, A and B both acquire an LWLock in either order, and B
+then performs a load on the same memory location, it is guaranteed to see
+A's store. In this case, each backend's fast-path lock queue is protected
+by an LWLock. A backend wishing to acquire a fast-path lock grabs this
+LWLock before examining FastPathStrongRelationLocks to check for the presence
+of a conflicting strong lock. And the backend attempting to acquire a strong
+lock, because it must transfer any matching weak locks taken via the fast-path
+mechanism to the shared lock table, will acquire every LWLock protecting a
+backend fast-path queue in turn. So, if we examine
+FastPathStrongRelationLocks and see a zero, then either the value is truly
+zero, or if it is a stale value, the strong locker has yet to acquire the
+per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock)
+and will notice any weak lock we take when it does.
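+
+The weak-locker side can be sketched as follows (simplified from
+LockAcquireExtended; the per-backend LWLock's field name has varied across
+versions, so treat the identifiers below as illustrative):
+
+	/* acquiring a "weak" relation lock via the fast path */
+	LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);	/* per-backend fast-path lock */
+	if (FastPathStrongRelationLocks->count[fasthashcode] == 0)
+		acquired = FastPathGrantRelationLock(relid, lockmode);	/* record in PGPROC */
+	else
+		acquired = false;			/* fall back to the main lock table */
+	LWLockRelease(&MyProc->fpInfoLock);
+
+Because the strong locker takes every backend's fast-path LWLock while
+transferring entries, the two sides cannot miss each other's updates.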
+
+Fast-path VXID locks do not use the FastPathStrongRelationLocks table. The
+first lock taken on a VXID is always the ExclusiveLock taken by its owner.
+Any subsequent lockers are share lockers waiting for the VXID to terminate.
+Indeed, the only reason VXID locks use the lock manager at all (rather than
+waiting for the VXID to terminate via some other method) is for deadlock
+detection. Thus, the initial VXID lock can *always* be taken via the fast
+path without checking for conflicts. Any subsequent locker must check
+whether the lock has been transferred to the main lock table, and if not,
+do so. The backend owning the VXID must be careful to clean up any entry
+made in the main lock table at end of transaction.
+
+Deadlock detection does not need to examine the fast-path data structures,
+because any lock that could possibly be involved in a deadlock must have
+been transferred to the main tables beforehand.
+
+
+The Deadlock Detection Algorithm
+--------------------------------
+
+Since we allow user transactions to request locks in any order, deadlock
+is possible. We use a deadlock detection/breaking algorithm that is
+fairly standard in essence, but there are many special considerations
+needed to deal with Postgres' generalized locking model.
+
+A key design consideration is that we want to make routine operations
+(lock grant and release) run quickly when there is no deadlock, and
+avoid the overhead of deadlock handling as much as possible. We do this
+using an "optimistic waiting" approach: if a process cannot acquire the
+lock it wants immediately, it goes to sleep without any deadlock check.
+But it also sets a delay timer, with a delay of DeadlockTimeout
+milliseconds (typically set to one second). If the delay expires before
+the process is granted the lock it wants, it runs the deadlock
+detection/breaking code. Normally this code will determine that there is
+no deadlock condition, and then the process will go back to sleep and
+wait quietly until it is granted the lock. But if a deadlock condition
+does exist, it will be resolved, usually by aborting the detecting
+process's transaction. In this way, we avoid deadlock handling overhead
+whenever the wait time for a lock is less than DeadlockTimeout, while not
+imposing an unreasonable detection delay when a deadlock actually exists.
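+
+Schematically, the waiting side does something like this (a sketch of the
+flow in ProcSleep; the real code also deals with lock_timeout, statement
+timeouts, and wait-queue bookkeeping):
+
+	/* the lock was not granted immediately, so prepare to sleep */
+	enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+
+	/* ... sleep on our semaphore until awoken by ProcLockWakeup ... */
+
+	/*
+	 * If the timer fires first, its handler runs CheckDeadLock(), which
+	 * either finds no deadlock (and we simply continue waiting) or resolves
+	 * one, usually by aborting the waiting transaction's lock request.
+	 */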
+
+Lock acquisition (routines LockAcquire and ProcSleep) follows these rules:
+
+1. A lock request is granted immediately if it does not conflict with
+any existing or waiting lock request, or if the process already holds an
+instance of the same lock type (eg, there's no penalty to acquire a read
+lock twice). Note that a process never conflicts with itself; eg, it
+can obtain a read lock when it already holds an exclusive lock.
+
+2. Otherwise the process joins the lock's wait queue. Normally it will
+be added to the end of the queue, but there is an exception: if the
+process already holds locks on this same lockable object that conflict
+with the request of any pending waiter, then the process will be
+inserted in the wait queue just ahead of the first such waiter. (If we
+did not make this check, the deadlock detection code would adjust the
+queue order to resolve the conflict, but it's relatively cheap to make
+the check in ProcSleep and avoid a deadlock timeout delay in this case.)
+Note the special case when inserting before the end of the queue: if the
+process's request does not conflict with any existing lock nor with any
+waiting request before its insertion point, then go ahead and grant the
+lock without waiting.
+
+When a lock is released, the lock release routine (ProcLockWakeup) scans
+the lock object's wait queue. Each waiter is awoken if (a) its request
+does not conflict with already-granted locks, and (b) its request does
+not conflict with the requests of prior un-wakable waiters. Rule (b)
+ensures that conflicting requests are granted in order of arrival. There
+are cases where a later waiter must be allowed to go in front of
+conflicting earlier waiters to avoid deadlock, but it is not
+ProcLockWakeup's responsibility to recognize these cases; instead, the
+deadlock detection code will re-order the wait queue when necessary.
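+
+The wakeup rules can be expressed as a short pass over the queue.  The
+following is a self-contained sketch (plain arrays instead of the shared
+wait queue; the real code is ProcLockWakeup in proc.c):
+
+	/*
+	 * Decide which waiters are wakable.  waitMode[] is the requested mode of
+	 * each waiter in arrival order; grantMask covers already-granted locks.
+	 */
+	static void
+	wakeup_pass(LOCKMASK grantMask, const LOCKMASK *conflictTab,
+				int nwaiters, const LOCKMODE *waitMode, bool *wakeable)
+	{
+		LOCKMASK	aheadRequests = 0;	/* modes of waiters we could not wake */
+		int			i;
+
+		for (i = 0; i < nwaiters; i++)
+		{
+			LOCKMASK	conflicts = conflictTab[waitMode[i]];
+
+			/* (a) no conflict with granted locks, (b) none with prior sleepers */
+			if ((conflicts & grantMask) == 0 && (conflicts & aheadRequests) == 0)
+			{
+				wakeable[i] = true;
+				grantMask |= LOCKBIT_ON(waitMode[i]);	/* now counts as granted */
+			}
+			else
+			{
+				wakeable[i] = false;
+				aheadRequests |= LOCKBIT_ON(waitMode[i]);
+			}
+		}
+	}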
+
+To perform deadlock checking, we use the standard method of viewing the
+various processes as nodes in a directed graph (the waits-for graph or
+WFG). There is a graph edge leading from process A to process B if A
+waits for B, ie, A is waiting for some lock and B holds a conflicting
+lock. There is a deadlock condition if and only if the WFG contains a
+cycle. We detect cycles by searching outward along waits-for edges to
+see if we return to our starting point. There are three possible
+outcomes:
+
+1. All outgoing paths terminate at a running process (which has no
+outgoing edge).
+
+2. A deadlock is detected by looping back to the start point. We
+resolve such a deadlock by canceling the start point's lock request and
+reporting an error in that transaction, which normally leads to
+transaction abort and release of that transaction's held locks. Note
+that it's sufficient to cancel one request to remove the cycle; we don't
+need to kill all the transactions involved.
+
+3. Some path(s) loop back to a node other than the start point. This
+indicates a deadlock, but one that does not involve our starting
+process. We ignore this condition on the grounds that resolving such a
+deadlock is the responsibility of the processes involved --- killing our
+start-point process would not resolve the deadlock. So, cases 1 and 3
+both report "no deadlock".
+
+Postgres' situation is a little more complex than the standard discussion
+of deadlock detection, for two reasons:
+
+1. A process can be waiting for more than one other process, since there
+might be multiple PROCLOCKs of (non-conflicting) lock types that all
+conflict with the waiter's request. This creates no real difficulty
+however; we simply need to be prepared to trace more than one outgoing
+edge.
+
+2. If a process A is behind a process B in some lock's wait queue, and
+their requested locks conflict, then we must say that A waits for B, since
+ProcLockWakeup will never awaken A before B. This creates additional
+edges in the WFG. We call these "soft" edges, as opposed to the "hard"
+edges induced by locks already held. Note that if B already holds any
+locks conflicting with A's request, then their relationship is a hard edge
+not a soft edge.
+
+A "soft" block, or wait-priority block, has the same potential for
+inducing deadlock as a hard block. However, we may be able to resolve
+a soft block without aborting the transactions involved: we can instead
+rearrange the order of the wait queue. This rearrangement reverses the
+direction of the soft edge between two processes with conflicting requests
+whose queue order is reversed. If we can find a rearrangement that
+eliminates a cycle without creating new ones, then we can avoid an abort.
+Checking for such possible rearrangements is the trickiest part of the
+algorithm.
+
+The workhorse of the deadlock detector is a routine FindLockCycle() which
+is given a starting point process (which must be a waiting process).
+It recursively scans outward across waits-for edges as discussed above.
+If it finds no cycle involving the start point, it returns "false".
+(As discussed above, we can ignore cycles not involving the start point.)
+When such a cycle is found, FindLockCycle() returns "true", and as it
+unwinds it also builds a list of any "soft" edges involved in the cycle.
+If the resulting list is empty then there is a hard deadlock and the
+configuration cannot succeed. However, if the list is not empty, then
+reversing any one of the listed edges through wait-queue rearrangement
+will eliminate that cycle. Since such a reversal might create cycles
+elsewhere, we may need to try every possibility. Therefore, we need to
+be able to invoke FindLockCycle() on hypothetical configurations (wait
+orders) as well as the current real order.
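+
+The search itself is an ordinary depth-first walk.  A minimal sketch over a
+plain adjacency matrix follows (MAX_PROCS is a hypothetical bound; the real
+FindLockCycle walks the shared lock tables and classifies each edge as hard
+or soft as it unwinds):
+
+	static bool
+	find_cycle(int cur, int start, int nprocs,
+			   bool waits_for[][MAX_PROCS], bool *visited)
+	{
+		int			next;
+
+		for (next = 0; next < nprocs; next++)
+		{
+			if (!waits_for[cur][next])
+				continue;
+			if (next == start)
+				return true;		/* looped back to the start point */
+			if (!visited[next])
+			{
+				visited[next] = true;
+				if (find_cycle(next, start, nprocs, waits_for, visited))
+					return true;	/* caller records soft edges while unwinding */
+			}
+		}
+		return false;
+	}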
+
+The easiest way to handle this seems to be to have a lookaside table that
+shows the proposed new queue order for each wait queue that we are
+considering rearranging. This table is checked by FindLockCycle, and it
+believes the proposed queue order rather than the real order for each lock
+that has an entry in the lookaside table.
+
+We build a proposed new queue order by doing a "topological sort" of the
+existing entries. Each soft edge that we are currently considering
+reversing creates a property of the partial order that the topological sort
+has to enforce. We must use a sort method that preserves the input
+ordering as much as possible, so as not to gratuitously break arrival
+order for processes not involved in a deadlock. (This is not true of the
+tsort method shown in Knuth, for example, but it's easily done by a simple
+doubly-nested-loop method that emits the first legal candidate at each
+step. Fortunately, we don't need a highly efficient sort algorithm, since
+the number of partial order constraints is not likely to be large.) Note
+that failure of the topological sort tells us we have conflicting ordering
+constraints, and therefore that the last-added soft edge reversal
+conflicts with a prior edge reversal. We need to detect this case to
+avoid an infinite loop in the case where no possible rearrangement will
+work: otherwise, we might try a reversal, find that it still leads to
+a cycle, then try to un-reverse the reversal while trying to get rid of
+that cycle, etc etc. Topological sort failure tells us the un-reversal
+is not a legitimate move in this context.
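+
+A self-contained sketch of such an order-preserving sort follows (array
+bounds and names are hypothetical; the real implementation is TopoSort in
+deadlock.c).  Constraint c states that item before[c] must precede item
+after[c]; items are numbered by their original queue positions:
+
+	static bool
+	stable_toposort(int n, int nconstraints,
+					const int *before, const int *after, int *output)
+	{
+		bool		emitted[MAX_ITEMS] = {false};	/* MAX_ITEMS: hypothetical bound */
+		int			k, i, c;
+
+		for (k = 0; k < n; k++)
+		{
+			/* emit the earliest item whose required predecessors are all emitted */
+			for (i = 0; i < n; i++)
+			{
+				bool		blocked = emitted[i];
+
+				for (c = 0; c < nconstraints && !blocked; c++)
+					if (after[c] == i && !emitted[before[c]])
+						blocked = true;
+				if (!blocked)
+					break;
+			}
+			if (i == n)
+				return false;	/* contradictory constraints: reversal is illegal */
+			output[k] = i;
+			emitted[i] = true;
+		}
+		return true;
+	}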
+
+So, the basic step in our rearrangement method is to take a list of
+soft edges in a cycle (as returned by FindLockCycle()) and successively
+try the reversal of each one as a topological-sort constraint added to
+whatever constraints we are already considering. We recursively search
+through all such sets of constraints to see if any one eliminates all
+the deadlock cycles at once. Although this might seem impossibly
+inefficient, it shouldn't be a big problem in practice, because there
+will normally be very few, and not very large, deadlock cycles --- if
+any at all. So the combinatorial inefficiency isn't going to hurt us.
+Besides, it's better to spend some time to guarantee that we've checked
+all possible escape routes than to abort a transaction when we didn't
+really have to.
+
+Each edge reversal constraint can be viewed as requesting that the waiting
+process A be moved to before the blocking process B in the wait queue they
+are both in. This action will reverse the desired soft edge, as well as
+any other soft edges between A and other processes it is advanced over.
+No other edges will be affected (note this is actually a constraint on our
+topological sort method to not re-order the queue more than necessary.)
+Therefore, we can be sure we have not created any new deadlock cycles if
+neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle. Given
+the above-defined behavior of FindLockCycle, each of these searches is
+necessary as well as sufficient, since FindLockCycle starting at the
+original start point will not complain about cycles that include A or B
+but not the original start point.
+
+In short then, a proposed rearrangement of the wait queue(s) is determined
+by one or more broken soft edges A->B, fully specified by the output of
+topological sorts of each wait queue involved, and then tested by invoking
+FindLockCycle() starting at the original start point as well as each of
+the mentioned processes (A's and B's). If none of the tests detect a
+cycle, then we have a valid configuration and can implement it by
+reordering the wait queues per the sort outputs (and then applying
+ProcLockWakeup on each reordered queue, in case a waiter has become wakable).
+If any test detects a soft cycle, we can try to resolve it by adding each
+soft link in that cycle, in turn, to the proposed rearrangement list.
+This is repeated recursively until we either find a workable rearrangement
+or determine that none exists. In the latter case, the outer level
+resolves the deadlock by aborting the original start-point transaction.
+
+The particular order in which rearrangements are tried depends on the
+order FindLockCycle() happens to scan in, so if there are multiple
+workable rearrangements of the wait queues, then it is unspecified which
+one will be chosen. What's more important is that we guarantee to try
+every queue rearrangement that could lead to success. (For example,
+if we have A before B before C and the needed order constraints are
+C before A and B before C, we would first discover that A before C
+doesn't work and try the rearrangement C before A before B. This would
+eventually lead to the discovery of the additional constraint B before C.)
+
+Got that?
+
+Miscellaneous Notes
+-------------------
+
+1. It is easily proven that no deadlock will be missed due to our
+asynchronous invocation of deadlock checking. A deadlock cycle in the WFG
+is formed when the last edge in the cycle is added; therefore the last
+process in the cycle to wait (the one from which that edge is outgoing) is
+certain to detect and resolve the cycle when it later runs CheckDeadLock.
+This holds even if that edge addition created multiple cycles; the process
+may indeed abort without ever noticing those additional cycles, but we
+don't particularly care. The only other possible creation of deadlocks is
+during deadlock resolution's rearrangement of wait queues, and we already
+saw that that algorithm will prove that it creates no new deadlocks before
+it attempts to actually execute any rearrangement.
+
+2. It is not certain that a deadlock will be resolved by aborting the
+last-to-wait process. If earlier waiters in the cycle have not yet run
+CheckDeadLock, then the first one to do so will be the victim.
+
+3. No live (wakable) process can be missed by ProcLockWakeup, since it
+examines every member of the wait queue (this was not true in the 7.0
+implementation, BTW). Therefore, if ProcLockWakeup is always invoked
+after a lock is released or a wait queue is rearranged, there can be no
+failure to wake a wakable process. One should also note that
+LockErrorCleanup (abort a waiter due to outside factors) must run
+ProcLockWakeup, in case the canceled waiter was soft-blocking other
+waiters.
+
+4. We can minimize excess rearrangement-trial work by being careful to
+scan the wait queue from the front when looking for soft edges. For
+example, if we have queue order A,B,C and C has deadlock conflicts with
+both A and B, we want to generate the "C before A" constraint first,
+rather than wasting time with "C before B", which won't move C far
+enough up. So we look for soft edges outgoing from C starting at the
+front of the wait queue.
+
+5. The working data structures needed by the deadlock detection code can
+be limited to numbers of entries computed from MaxBackends. Therefore,
+we can allocate the worst-case space needed during backend startup. This
+seems a safer approach than trying to allocate workspace on the fly; we
+don't want to risk having the deadlock detector run out of memory, else
+we really have no guarantees at all that deadlock will be detected.
+
+6. We abuse the deadlock detector to implement autovacuum cancellation.
+When we run the detector and we find that there's an autovacuum worker
+involved in the waits-for graph, we store a pointer to its PGPROC, and
+return a special return code (unless a hard deadlock has been detected).
+The caller can then send a cancellation signal. This implements the
+principle that autovacuum has a low locking priority (eg it must not block
+DDL on the table).
+
+Group Locking
+-------------
+
+As if all of that weren't already complicated enough, PostgreSQL now supports
+parallelism (see src/backend/access/transam/README.parallel), which means that
+we might need to resolve deadlocks that occur between gangs of related
+processes rather than individual processes. This doesn't change the basic
+deadlock detection algorithm very much, but it makes the bookkeeping more
+complicated.
+
+We choose to regard locks held by processes in the same parallel group as
+non-conflicting with the exception of relation extension and page locks. This
+means that two processes in a parallel group can hold a self-exclusive lock on
+the same relation at the same time, or one process can acquire an AccessShareLock
+while the other already holds AccessExclusiveLock. This might seem dangerous and
+could be in some cases (more on that below), but if we didn't do this then
+parallel query would be extremely prone to self-deadlock. For example, a
+parallel query against a relation on which the leader already had
+AccessExclusiveLock would hang, because the workers would try to lock the same
+relation and be blocked by the leader; yet the leader can't finish until it
+receives completion indications from all workers. An undetected deadlock
+results. This is far from the only scenario where such a problem happens. The
+same thing will occur if the leader holds only AccessShareLock, the worker
+seeks AccessShareLock, but between the time the leader attempts to acquire the
+lock and the time the worker attempts to acquire it, some other process queues
+up waiting for an AccessExclusiveLock. In this case, too, an indefinite hang
+results.
+
+It might seem that we could predict which locks the workers will attempt to
+acquire and ensure before going parallel that those locks would be acquired
+successfully. But this is very difficult to make work in a general way. For
+example, a parallel worker's portion of the query plan could involve an
+SQL-callable function which generates a query dynamically, and that query
+might happen to hit a table on which the leader happens to hold
+AccessExclusiveLock. By imposing enough restrictions on what workers can do,
+we could eventually create a situation where their behavior can be adequately
+restricted, but these restrictions would be fairly onerous, and even then, the
+system required to decide whether the workers will succeed at acquiring the
+necessary locks would be complex and possibly buggy.
+
+So, instead, we take the approach of deciding that locks within a lock group
+do not conflict. This eliminates the possibility of an undetected deadlock,
+but also opens up some problem cases: if the leader and worker try to do some
+operation at the same time which would ordinarily be prevented by the
+heavyweight lock mechanism, undefined behavior might result. In practice, the
+dangers are modest. The leader and worker share the same transaction,
+snapshot, and combo CID hash, and neither can perform any DDL or, indeed,
+write any data at all. Thus, for either to read a table locked exclusively by
+the other is safe enough. Problems would occur if the leader initiated
+parallelism from a point in the code at which it had some backend-private
+state that made table access from another process unsafe. For example, if
+parallelism were started after calling SetReindexProcessing and before
+calling ResetReindexProcessing, catastrophe could ensue, because the worker
+won't have that state.
+
+To allow parallel inserts and parallel copy, we have ensured that relation
+extension and page locks don't participate in group locking, which means such
+locks can conflict among members of the same group. This is required because
+it is no safer for two related processes to extend the same relation, or to
+perform cleanup in GIN indexes, at the same time than it is for unrelated
+processes to do so. We never acquire a heavyweight lock on any other object
+after taking a relation extension lock, so such a lock can never participate
+in a deadlock cycle. After acquiring page locks, we can acquire a relation
+extension lock, but never the reverse, so page locks will not participate in
+deadlocks either. To allow for other
+parallel writes like parallel update or parallel delete, we'll either need to
+(1) further enhance the deadlock detector to handle those tuple locks in a
+different way than other types; or (2) have parallel workers use some other
+mutual exclusion method for such cases. Currently, the parallel mode is
+strictly read-only, but now we have the infrastructure to allow parallel
+inserts and parallel copy.
+
+Group locking adds three new members to each PGPROC: lockGroupLeader,
+lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for
+processes not involved in parallel query. When a process wants to cooperate
+with parallel workers, it becomes a lock group leader, which means setting
+this field to point to its own PGPROC. When a parallel worker starts up, it
+points this field at the leader. The lockGroupMembers field is only used in
+the leader; it is a list of the member PGPROCs of the lock group (the leader
+and all workers). The lockGroupLink field is the list link for this list.
+
+All three of these fields are considered to be protected by a lock manager
+partition lock. The partition lock that protects these fields within a given
+lock group is chosen by taking the leader's pgprocno modulo the number of lock
+manager partitions. This unusual arrangement has a major advantage: the
+deadlock detector can count on the fact that no lockGroupLeader field can
+change while the deadlock detector is running, because it knows that it holds
+all the lock manager locks. Also, holding this single lock allows safe
+manipulation of the lockGroupMembers list for the lock group.
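+
+Concretely, the choice of partition lock comes down to the leader's pgprocno,
+along the lines of the LockHashPartitionLockByProc macro in lock.h (sketched;
+the exact definition may differ across versions):
+
+	/* the partition LWLock guarding a lock group's membership fields */
+	#define LockHashPartitionLockByProc(leader_pgproc) \
+		LockHashPartitionLockByIndex((leader_pgproc)->pgprocno % NUM_LOCK_PARTITIONS)
+
+Both the leader (when forming or dissolving the group) and each worker (when
+joining or leaving) take this same LWLock exclusively before touching
+lockGroupLeader, lockGroupMembers, or lockGroupLink.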
+
+We need an additional interlock when setting these fields, because a newly
+started parallel worker has to try to join the leader's lock group, but it
+has no guarantee that the group leader is still alive by the time it gets
+started. We try to ensure that the parallel leader dies after all workers
+in normal cases, but also that the system could survive relatively intact
+if that somehow fails to happen. This is one of the precautions against
+such a scenario: the leader relays its PGPROC and also its PID to the
+worker, and the worker fails to join the lock group unless the given PGPROC
+still has the same PID and is still a lock group leader. We assume that
+PIDs are not recycled quickly enough for this interlock to fail.
+
+
+User Locks (Advisory Locks)
+---------------------------
+
+User locks are handled totally on the application side as long-term
+cooperative locks which may extend beyond the normal transaction boundaries.
+Their purpose is to indicate to an application that someone is "working"
+on an item. So it is possible to put a user lock on a tuple's OID,
+retrieve the tuple, work on it for an hour, and then update it and remove
+the lock. While the lock is active, other clients can still read and write
+the tuple, but they can be aware that it has been locked at the application
+level by someone.
+
+User locks and normal locks are completely orthogonal and they don't
+interfere with each other.
+
+User locks can be acquired either at session level or transaction level.
+A session-level lock request is not automatically released at transaction
+end, but must be explicitly released by the application. (However, any
+remaining locks are always released at session end.) Transaction-level
+user lock requests behave the same as normal lock requests, in that they
+are released at transaction end and do not need explicit unlocking.
+
+Locking during Hot Standby
+--------------------------
+
+The Startup process is the only backend that can make changes during
+recovery; all other backends are read-only. As a result, the Startup
+process does not acquire locks on relations or objects except when the lock
+level is AccessExclusiveLock.
+
+Regular backends are only allowed to take locks on relations or objects
+at RowExclusiveLock or lower. This ensures that they do not conflict with
+each other or with the Startup process, unless AccessExclusiveLocks are
+requested by the Startup process.
+
+Deadlocks involving AccessExclusiveLocks are not possible, so we need
+not be concerned that a user-initiated deadlock can prevent recovery from
+progressing.
+
+AccessExclusiveLocks on the primary node generate WAL records
+that are then applied by the Startup process. Locks are released at end
+of transaction just as they are in normal processing. These locks are
+held by the Startup process, acting as a proxy for the backends that
+originally acquired these locks. Again, these locks cannot conflict with
+one another, so the Startup process cannot deadlock itself either.
+
+Although deadlock is not possible, a regular backend's weak lock can
+prevent the Startup process from making progress in applying WAL, which is
+usually not something that should be tolerated for very long. Mechanisms
+exist to forcibly cancel a regular backend's query if it blocks the
+Startup process for too long.
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
new file mode 100644
index 0000000..50d2ecc
--- /dev/null
+++ b/src/backend/storage/lmgr/README-SSI
@@ -0,0 +1,646 @@
+src/backend/storage/lmgr/README-SSI
+
+Serializable Snapshot Isolation (SSI) and Predicate Locking
+===========================================================
+
+This code is in the lmgr directory because about 90% of it is an
+implementation of predicate locking, which is required for SSI,
+rather than being directly related to SSI itself. When another use
+for predicate locking justifies the effort to tease these two things
+apart, this README file should probably be split.
+
+
+Credits
+-------
+
+This feature was developed by Kevin Grittner and Dan R. K. Ports,
+with review and suggestions from Joe Conway, Heikki Linnakangas, and
+Jeff Davis. It is based on work published in these papers:
+
+ Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ Serializable isolation for snapshot databases.
+ In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ international conference on Management of data,
+ pages 729-738, New York, NY, USA. ACM.
+ http://doi.acm.org/10.1145/1376616.1376690
+
+ Michael James Cahill. 2009.
+ Serializable Isolation for Snapshot Databases.
+ Sydney Digital Theses.
+ University of Sydney, School of Information Technologies.
+ http://hdl.handle.net/2123/5353
+
+
+Overview
+--------
+
+With true serializable transactions, if you can show that your
+transaction will do the right thing if there are no concurrent
+transactions, it will do the right thing in any mix of serializable
+transactions or be rolled back with a serialization failure. This
+feature has been implemented in PostgreSQL using SSI.
+
+
+Serializable and Snapshot Transaction Isolation Levels
+------------------------------------------------------
+
+Serializable transaction isolation is attractive for shops with
+active development by many programmers against a complex schema
+because it guarantees data integrity with very little staff time --
+if a transaction can be shown to always do the right thing when it is
+run alone (before or after any other transaction), it will always do
+the right thing in any mix of concurrent serializable transactions.
+Where conflicts with other transactions would result in an
+inconsistent state within the database or an inconsistent view of
+the data, a serializable transaction will block or roll back to
+prevent the anomaly. The SQL standard provides a specific SQLSTATE
+for errors generated when a transaction rolls back for this reason,
+so that transactions can be retried automatically.
+
+Before version 9.1, PostgreSQL did not support a full serializable
+isolation level. A request for serializable transaction isolation
+actually provided snapshot isolation. This has well-known anomalies
+which can allow data corruption or inconsistent views of the data
+during concurrent transactions, although these anomalies only occur
+when certain patterns of read-write dependencies exist within a set
+of concurrent transactions. Where these patterns exist, the anomalies
+can be prevented by introducing conflicts through explicitly
+programmed locks or otherwise unnecessary writes to the database.
+Snapshot isolation is popular because performance is better than
+serializable isolation and the integrity guarantees which it does
+provide allow anomalies to be avoided or managed with reasonable
+effort in many environments.
+
+
+Serializable Isolation Implementation Strategies
+------------------------------------------------
+
+Techniques for implementing full serializable isolation have been
+published and in use in many database products for decades. The
+primary technique which has been used is Strict Two-Phase Locking
+(S2PL), which operates by blocking writes against data which has been
+read by concurrent transactions and blocking any access (read or
+write) against data which has been written by concurrent
+transactions. A cycle in a graph of blocking indicates a deadlock,
+requiring a rollback. Blocking and deadlocks under S2PL in high
+contention workloads can be debilitating, crippling throughput and
+response time.
+
+A new technique for implementing full serializable isolation in an
+MVCC database appears in the literature beginning in 2008. This
+technique, known as Serializable Snapshot Isolation (SSI), has many of
+the advantages of snapshot isolation. In particular, reads don't
+block anything and writes don't block reads. Essentially, it runs
+snapshot isolation but monitors the read-write conflicts between
+transactions to identify dangerous structures in the transaction
+graph which indicate that a set of concurrent transactions might
+produce an anomaly, and rolls back transactions to ensure that no
+anomalies occur. It will produce some false positives (where a
+transaction is rolled back even though there would not have been an
+anomaly), but will never let an anomaly occur. In the two known
+prototype implementations, performance for many workloads (even with
+the need to restart transactions which are rolled back) is very close
+to snapshot isolation and generally far better than an S2PL
+implementation.
+
+
+Apparent Serial Order of Execution
+----------------------------------
+
+One way to understand when snapshot anomalies can occur, and to
+visualize the difference between the serializable implementations
+described above, is to consider that among transactions executing at
+the serializable transaction isolation level, the results are
+required to be consistent with some serial (one-at-a-time) execution
+of the transactions [1]. How is that order determined in each?
+
+In S2PL, each transaction locks any data it accesses. It holds the
+locks until committing, preventing other transactions from making
+conflicting accesses to the same data in the interim. Some
+transactions may have to be rolled back to prevent deadlock. But
+successful transactions can always be viewed as having occurred
+sequentially, in the order they committed.
+
+With snapshot isolation, reads never block writes, nor vice versa, so
+more concurrency is possible. The order in which transactions appear
+to have executed is determined by something more subtle than in S2PL:
+read/write dependencies. If a transaction reads data, it appears to
+execute after the transaction that wrote the data it is reading.
+Similarly, if it updates data, it appears to execute after the
+transaction that wrote the previous version. These dependencies, which
+we call "wr-dependencies" and "ww-dependencies", are consistent with
+the commit order, because the first transaction must have committed
+before the second starts. However, there can also be dependencies
+between two *concurrent* transactions, i.e. where one was running when
+the other acquired its snapshot. These "rw-conflicts" occur when one
+transaction attempts to read data which is not visible to it because
+the transaction which wrote it (or will later write it) is
+concurrent. The reading transaction appears to have executed first,
+regardless of the actual sequence of transaction starts or commits,
+because it sees a database state prior to that in which the other
+transaction leaves it.
+
+Anomalies occur when a cycle is created in the graph of dependencies:
+when a dependency or series of dependencies causes transaction A to
+appear to have executed before transaction B, but another series of
+dependencies causes B to appear before A. If that's the case, then
+the results can't be consistent with any serial execution of the
+transactions.
+
+
+SSI Algorithm
+-------------
+
+As of 9.1, serializable transactions in PostgreSQL are implemented using
+Serializable Snapshot Isolation (SSI), based on the work of Cahill
+et al. Fundamentally, this allows snapshot isolation to run as it
+previously did, while monitoring for conditions which could create a
+serialization anomaly.
+
+SSI is based on the observation [2] that each snapshot isolation
+anomaly corresponds to a cycle that contains a "dangerous structure"
+of two adjacent rw-conflict edges:
+
+ Tin ------> Tpivot ------> Tout
+ rw rw
+
+SSI works by watching for this dangerous structure, and rolling
+back a transaction when needed to prevent any anomaly. This means it
+only needs to track rw-conflicts between concurrent transactions, not
+wr- and ww-dependencies. It also means there is a risk of false
+positives, because not every dangerous structure is embedded in an
+actual cycle. The number of false positives is low in practice, so
+this represents an acceptable tradeoff for keeping the detection
+overhead low.
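+
+Expressed as a condition on a single transaction T, the check is roughly as
+follows.  This is a deliberately simplified sketch: the helper names are
+hypothetical, and the real test (OnConflict_CheckForSerializationFailure in
+predicate.c) also weighs commit order, read-only status, and which
+transaction is cheapest to cancel:
+
+	/* T is a potential pivot if it has rw-conflict edges in both directions */
+	if (has_rw_conflict_in(T) &&	/* some Tin read data T wrote or will write */
+		has_rw_conflict_out(T))		/* T read data some Tout wrote or will write */
+	{
+		/*
+		 * Dangerous structure Tin --rw--> T --rw--> Tout.  One of the three
+		 * transactions must be cancelled, but only once Tout's commit shows
+		 * the structure could really produce an anomaly.
+		 */
+		flag_possible_serialization_failure(T);		/* hypothetical helper */
+	}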
+
+The PostgreSQL implementation uses two additional optimizations:
+
+* Tout must commit before any other transaction in the cycle
+ (see proof of Theorem 2.1 of [2]). We only roll back a transaction
+ if Tout commits before Tpivot and Tin.
+
+* if Tin is read-only, there can only be an anomaly if Tout committed
+ before Tin takes its snapshot. This optimization is an original
+ one. Proof:
+
+ - Because there is a cycle, there must be some transaction T0 that
+ precedes Tin in the cycle. (T0 might be the same as Tout.)
+
+ - The edge between T0 and Tin can't be a rw-conflict or ww-dependency,
+ because Tin was read-only, so it must be a wr-dependency.
+ Those can only occur if T0 committed before Tin took its snapshot,
+ else Tin would have ignored T0's output.
+
+ - Because Tout must commit before any other transaction in the
+ cycle, it must commit before T0 commits -- and thus before Tin
+ starts.
+
+
+PostgreSQL Implementation
+-------------------------
+
+ * Since this technique is based on Snapshot Isolation (SI), those
+areas in PostgreSQL which don't use SI can't be brought under SSI.
+This includes system tables, temporary tables, sequences, hint bit
+rewrites, etc. SSI cannot eliminate existing anomalies in these
+areas.
+
+ * Any transaction which is run at a transaction isolation level
+other than SERIALIZABLE will not be affected by SSI. If you want to
+enforce business rules through SSI, all transactions should be run at
+the SERIALIZABLE transaction isolation level, and that should
+probably be set as the default.
+
+ * If all transactions are run at the SERIALIZABLE transaction
+isolation level, business rules can be enforced in triggers or
+application code without ever having a need to acquire an explicit
+lock or to use SELECT FOR SHARE or SELECT FOR UPDATE.
+
+ * Those who want to continue to use snapshot isolation without
+the additional protections of SSI (and the associated costs of
+enforcing those protections), can use the REPEATABLE READ transaction
+isolation level. This level retains its legacy behavior, which
+is identical to the old SERIALIZABLE implementation and fully
+consistent with the standard's requirements for the REPEATABLE READ
+transaction isolation level.
+
+ * Performance under this SSI implementation will be significantly
+improved if transactions which don't modify permanent tables are
+declared to be READ ONLY before they begin reading data.
+
+ * Performance under SSI will tend to degrade more rapidly with a
+large number of active database transactions than under less strict
+isolation levels. Limiting the number of active transactions through
+use of a connection pool or similar techniques may be necessary to
+maintain good performance.
+
+ * Any transaction which must be rolled back to prevent
+serialization anomalies will fail with SQLSTATE 40001, which has a
+standard meaning of "serialization failure".
+
+ * This SSI implementation makes an effort to choose the
+transaction to be canceled such that an immediate retry of the
+transaction will not fail due to conflicts with exactly the same
+transactions. Pursuant to this goal, no transaction is canceled
+until one of the other transactions in the set of conflicts which
+could generate an anomaly has successfully committed. This is
+conceptually similar to how write conflicts are handled. To fully
+implement this guarantee there needs to be a way to roll back the
+active transaction for another process with a serialization failure
+SQLSTATE, even if it is "idle in transaction".
+
+
+Predicate Locking
+-----------------
+
+Both S2PL and SSI require some form of predicate locking to handle
+situations where reads conflict with later inserts or with later
+updates which move data into the selected range. PostgreSQL didn't
+already have predicate locking, so it needed to be added to support
+full serializable transactions under either strategy. Practical
+implementations of predicate locking generally involve acquiring
+locks against data as it is accessed, using multiple granularities
+(tuple, page, table, etc.) with escalation as needed to keep the lock
+count to a number which can be tracked within RAM structures. This
+approach was used in PostgreSQL. Coarse granularities can cause some
+false positive indications of conflict. The number of false positives
+can be influenced by plan choice.
+
+
+Implementation overview
+-----------------------
+
+New RAM structures, inspired by those used to track traditional locks
+in PostgreSQL, but tailored to the needs of SIREAD predicate locking,
+are used. These refer to physical objects actually accessed in the
+course of executing the query, to model the predicates through
+inference. Anyone interested in this subject should review the
+Hellerstein, Stonebraker and Hamilton paper [3], along with the
+locking papers referenced from that and the Cahill papers.
+
+Because the SIREAD locks don't block, traditional locking techniques
+have to be modified. Intent locking (locking higher level objects
+before locking lower level objects) doesn't work with non-blocking
+"locks" (which are, in some respects, more like flags than locks).
+
+A configurable amount of shared memory is reserved at postmaster
+start-up to track predicate locks. This size cannot be changed
+without a restart.
+
+To prevent resource exhaustion, multiple fine-grained locks may
+be promoted to a single coarser-grained lock as needed.
+
+An attempt to acquire an SIREAD lock on a tuple when the same
+transaction already holds an SIREAD lock on the page or the relation
+will be ignored. Likewise, an attempt to lock a page when the
+relation is locked will be ignored, and the acquisition of a coarser
+lock will result in the automatic release of all finer-grained locks
+it covers.
+
+
+Heap locking
+------------
+
+Predicate locks will be acquired for the heap based on the following:
+
+ * For a table scan, the entire relation will be locked.
+
+ * Each tuple read which is visible to the reading transaction
+will be locked, whether or not it meets selection criteria; except
+that there is no need to acquire an SIREAD lock on a tuple when the
+transaction already holds a write lock on any tuple representing the
+row, since a rw-conflict would also create a ww-dependency which
+has more aggressive enforcement and thus will prevent any anomaly.
+
+ * Modifying a heap tuple creates a rw-conflict with any transaction
+that holds a SIREAD lock on that tuple, or on the page or relation
+that contains it.
+
+ * Inserting a new tuple creates a rw-conflict with any transaction
+holding a SIREAD lock on the entire relation. It doesn't conflict with
+page-level locks, because page-level locks are only used to aggregate
+tuple locks. Unlike index page locks, they don't lock "gaps" on the page.
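+
+As a rough sketch of the write-side checks above (hypothetical helper
+names; the real logic lives in predicate.c and uses different data
+structures), and assuming rel/blkno/offnum identify the target tuple,
+a heap modification might probe for SIREAD locks like this:
+
+    /* Hypothetical sketch only -- not the actual predicate.c code. */
+    if (is_insert)
+    {
+        /* A new tuple can only conflict with relation-level SIREAD locks. */
+        if (siread_lock_exists_relation(rel))
+            flag_rw_conflict_in(rel, blkno, offnum);
+    }
+    else
+    {
+        /* An update or delete conflicts at any covering granularity. */
+        if (siread_lock_exists_tuple(rel, blkno, offnum) ||
+            siread_lock_exists_page(rel, blkno) ||
+            siread_lock_exists_relation(rel))
+            flag_rw_conflict_in(rel, blkno, offnum);
+    }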
+
+
+Index AM implementations
+------------------------
+
+Since predicate locks only exist to detect writes which conflict with
+earlier reads, and heap tuple locks are acquired to cover all heap
+tuples actually read, including those read through indexes, the index
+tuples which were actually scanned are not of interest in themselves;
+we only care about their "new neighbors" -- later inserts into the
+index which would have been included in the scan had they existed at
+the time. Conceptually, we want to lock the gaps between and
+surrounding index entries within the scanned range.
+
+Correctness requires that any insert into an index generates a
+rw-conflict with a concurrent serializable transaction if, after that
+insert, re-execution of any index scan of the other transaction would
+access the heap for a row not accessed during the previous execution.
+Note that a non-HOT update which expires an old index entry covered
+by the scan and adds a new entry for the modified row's new tuple
+need not generate a conflict, although an update which "moves" a row
+into the scan must generate a conflict. While correctness allows
+false positives, they should be minimized for performance reasons.
+
+Several optimizations are possible, though not all are implemented yet:
+
+ * An index scan which is just finding the right position for an
+index insertion or deletion need not acquire a predicate lock.
+
+ * An index scan which is comparing for equality on the entire key
+for a unique index need not acquire a predicate lock as long as a key
+is found corresponding to a visible tuple which has not been modified
+by another transaction -- there are no "between or around" gaps to
+cover.
+
+ * As long as built-in foreign key enforcement continues to use
+its current "special tricks" to deal with MVCC issues, predicate
+locks should not be needed for scans done by enforcement code.
+
+ * If a search determines that no rows can be found regardless of
+index contents because the search conditions are contradictory (e.g.,
+x = 1 AND x = 2), then no predicate lock is needed.
+
+Other index AM implementation considerations:
+
+ * For an index AM that doesn't have support for predicate locking,
+we just acquire a predicate lock on the whole index for any search.
+
+ * B-tree index searches acquire predicate locks only on the
+index *leaf* pages needed to lock the appropriate index range. If,
+however, a search discovers that no root page has yet been created, a
+predicate lock on the index relation is required.
+
+ * Like a B-tree, GIN searches acquire predicate locks only on the
+leaf pages of the entry tree. When performing an equality scan, and an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing
+them in a pending list. That makes us unable to detect rw-conflicts
+using page-level locks. To cope with that, insertions to the pending
+list conflict with all scans.
+
+ * GiST searches can determine that there are no matches at any
+level of the index, so we acquire a predicate lock at each index
+level visited during a GiST search. An index insert at the leaf level
+can then be trusted to ripple up to all levels and locations where
+conflicting predicate locks may exist. When a page is split, we need
+to copy the predicate locks from the original page to all the new
+pages.
+
+ * Hash index searches acquire predicate locks on the primary
+page of a bucket. A scan that happens concurrently with a bucket
+split acquires a lock on both the old and new buckets. During a
+bucket split, predicate locks are copied from the primary page of
+the old bucket to the primary page of the new bucket.
+
+ * The effects of page splits, overflows, consolidations, and
+removals must be carefully reviewed to ensure that predicate locks
+aren't "lost" during those operations, or kept with pages which could
+get re-used for different parts of the index.
+
+
+Innovations
+-----------
+
+The PostgreSQL implementation of Serializable Snapshot Isolation
+differs from what is described in the cited papers for several
+reasons:
+
+ 1. PostgreSQL didn't have any existing predicate locking. It had
+to be added from scratch.
+
+ 2. The existing in-memory lock structures were not suitable for
+tracking SIREAD locks.
+ * In PostgreSQL, tuple level locks are not held in RAM for
+any length of time; lock information is written to the tuples
+involved in the transactions.
+ * In PostgreSQL, existing lock structures have pointers to
+memory which is related to a session. SIREAD locks need to persist
+past the end of the originating transaction and even the session
+which ran it.
+ * PostgreSQL needs to be able to tolerate a large number of
+transactions executing while one long-running transaction stays open
+-- the in-RAM techniques discussed in the papers wouldn't support
+that.
+
+ 3. Unlike the database products used for the prototypes described
+in the papers, PostgreSQL didn't already have a true serializable
+isolation level distinct from snapshot isolation.
+
+ 4. PostgreSQL supports subtransactions -- an issue not mentioned
+in the papers.
+
+ 5. PostgreSQL doesn't assign a transaction number to a database
+transaction until and unless necessary (normally, when the transaction
+attempts to modify data).
+
+ 6. PostgreSQL has pluggable data types with user-definable
+operators, as well as pluggable index types, not all of which are
+based around data types which support ordering.
+
+ 7. Some possible optimizations became apparent during development
+and testing.
+
+Differences from the implementation described in the papers are
+listed below.
+
+ * New structures needed to be created in shared memory to track
+the proper information for serializable transactions and their SIREAD
+locks.
+
+ * Because PostgreSQL does not have the same concept of an "oldest
+transaction ID" for all serializable transactions as assumed in the
+Cahill thesis, we track the oldest snapshot xmin among serializable
+transactions, and a count of how many active transactions use that
+xmin. When the count hits zero we find the new oldest xmin and run a
+clean-up based on that.
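+
+Sketched very roughly (hypothetical names; the real bookkeeping lives
+in predicate.c), the idea is a reference count on that oldest xmin:
+
+    /* Hypothetical sketch of the oldest-xmin bookkeeping. */
+    if (TransactionIdEquals(finishing_xact_xmin, oldest_serializable_xmin) &&
+        --oldest_xmin_refcount == 0)
+    {
+        oldest_serializable_xmin = recompute_oldest_serializable_xmin();
+        oldest_xmin_refcount = count_xacts_with_xmin(oldest_serializable_xmin);
+        cleanup_siread_info_older_than(oldest_serializable_xmin);
+    }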
+
+ * Because reads in a subtransaction may cause that subtransaction
+to roll back, thereby affecting what is written by the top level
+transaction, predicate locks must survive a subtransaction rollback.
+As a consequence, all xid usage in SSI, including predicate locking,
+is based on the top level xid. When looking at an xid that comes
+from a tuple's xmin or xmax, for example, we always call
+SubTransGetTopmostTransaction() before doing much else with it.
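+
+For example (illustrative fragment only; "tuple" is assumed to be a
+HeapTuple just fetched by the caller), resolving a tuple's xmin to the
+top-level xid before using it for SSI bookkeeping looks roughly like:
+
+    TransactionId xid = HeapTupleHeaderGetXmin(tuple->t_data);
+
+    if (TransactionIdIsNormal(xid))
+        xid = SubTransGetTopmostTransaction(xid);
+    /* ... track predicate locks and conflicts under "xid" ... */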
+
+ * PostgreSQL does not use "update in place" with a rollback log
+for its MVCC implementation. Where possible it uses "HOT" updates on
+the same page (if there is room and no indexed value is changed).
+For non-HOT updates the old tuple is expired in place and a new tuple
+is inserted at a new location. Because of this difference, a tuple
+lock in PostgreSQL doesn't automatically lock any other versions of a
+row. We don't try to copy or expand a tuple lock to any other
+versions of the row, based on the following proof that any additional
+serialization failures we would get from that would be false
+positives:
+
+ o If transaction T1 reads a row version (thus acquiring a
+predicate lock on it) and a second transaction T2 updates that row
+version (thus creating a rw-conflict graph edge from T1 to T2), must a
+third transaction T3 which re-updates the new version of the row also
+have a rw-conflict in from T1 to prevent anomalies? In other words,
+does it matter whether we recognize the edge T1 -> T3?
+
+ o If T1 has a conflict in, it certainly doesn't. Adding the
+edge T1 -> T3 would create a dangerous structure, but we already had
+one from the edge T1 -> T2, so we would have aborted something anyway.
+(T2 has already committed, else T3 could not have updated its output;
+but we would have aborted either T1 or T1's predecessor(s). Hence
+no cycle involving T1 and T3 can survive.)
+
+ o Now let's consider the case where T1 doesn't have a
+rw-conflict in. If that's the case, for this edge T1 -> T3 to make a
+difference, T3 must have a rw-conflict out that induces a cycle in the
+dependency graph, i.e. a conflict out to some transaction preceding T1
+in the graph. (A conflict out to T1 itself would be problematic too,
+but that would mean T1 has a conflict in, the case we already
+eliminated.)
+
+ o So now we're trying to figure out if there can be an
+rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+T1. For T0 to precede T1, there has to be some edge, or sequence of
+edges, from T0 to T1. At least the last edge has to be a wr-dependency
+or ww-dependency rather than a rw-conflict, because T1 doesn't have a
+rw-conflict in. And that gives us enough information about the order
+of transactions to see that T3 can't have a rw-conflict to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would get aborted
+ because of an update conflict)
+
+ o That means T0 committed before T3 started, and therefore
+there can't be a rw-conflict from T3 to T0.
+
+ o So in all cases, we don't need the T1 -> T3 edge to
+recognize cycles. Therefore it's not necessary for T1's SIREAD lock
+on the original tuple version to cover later versions as well.
+
+ * Predicate locking in PostgreSQL starts at the tuple level
+when possible. Multiple fine-grained locks are promoted to a single
+coarser-granularity lock as needed to avoid resource exhaustion. The
+amount of memory used for these structures is configurable, to balance
+RAM usage against SIREAD lock granularity.
+
+ * Each backend keeps a process-local table of the locks it holds.
+To support granularity promotion decisions with low CPU and locking
+overhead, this table also includes the coarser covering locks and the
+number of finer-granularity locks they cover.
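+
+A minimal sketch of the promotion decision (hypothetical structure,
+field names, and threshold; the real structures are in predicate.c):
+
+    /* Hypothetical sketch of a local lock-table entry and promotion. */
+    typedef struct LocalPredicateLockEntry
+    {
+        PREDICATELOCKTARGETTAG tag;         /* tuple, page, or relation */
+        int                    childLocks;  /* finer locks covered by this one */
+        bool                   held;        /* lock actually acquired? */
+    } LocalPredicateLockEntry;
+
+    if (++parent->childLocks > promotion_threshold && !parent->held)
+    {
+        acquire_siread_lock(&parent->tag);  /* take the coarser lock */
+        release_covered_locks(parent);      /* drop the finer ones */
+        parent->held = true;
+    }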
+
+ * Conflicts are identified by looking for predicate locks
+when tuples are written, and by looking at the MVCC information when
+tuples are read. There is no matching between two RAM-based locks.
+
+ * Because write locks are stored in the heap tuples rather than a
+RAM-based lock table, the optimization described in the Cahill thesis
+which eliminates an SIREAD lock where there is a write lock is
+implemented by the following:
+ 1. When checking a heap write for conflicts against existing
+predicate locks, a tuple lock on the tuple being written is removed.
+ 2. When acquiring a predicate lock on a heap tuple, we
+return quickly without doing anything if it is a tuple written by the
+reading transaction.
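+
+In sketch form (hypothetical helper names; the macros are the real ones
+from access/htup_details.h and access/xact.h), the two steps amount to:
+
+    /* 1. While checking our own heap write for rw-conflicts, drop our
+     *    now-redundant SIREAD lock on the tuple being written, if any. */
+    if (siread_lock_held_by_me(rel, blkno, offnum))
+        release_siread_lock(rel, blkno, offnum);
+
+    /* 2. While acquiring an SIREAD lock on a heap tuple we just read,
+     *    return early if we wrote that tuple ourselves. */
+    if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
+        return;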
+
+ * Rather than using conflictIn and conflictOut pointers which use
+NULL to indicate no conflict and a self-reference to indicate
+multiple conflicts or conflicts with committed transactions, we use a
+list of rw-conflicts. With the more complete information, false
+positives are reduced and we have sufficient data for more aggressive
+clean-up and other optimizations:
+
+ o We can avoid ever rolling back a transaction until and
+unless there is a pivot where a transaction on the conflict *out*
+side of the pivot committed before either of the other transactions.
+
+ o We can avoid ever rolling back a transaction when the
+transaction on the conflict *in* side of the pivot is explicitly or
+implicitly READ ONLY unless the transaction on the conflict *out*
+side of the pivot committed before the READ ONLY transaction acquired
+its snapshot. (An implicit READ ONLY transaction is one which
+committed without writing, even though it was not explicitly declared
+to be READ ONLY.)
+
+ o We can more aggressively clean up conflicts, predicate
+locks, and SSI transaction information.
+
+ * We allow a READ ONLY transaction to "opt out" of SSI if there are
+no READ WRITE transactions which could cause the READ ONLY
+transaction to ever become part of a "dangerous structure" of
+overlapping transaction dependencies.
+
+ * We allow the user to request that a READ ONLY transaction wait
+until the conditions are right for it to start in the "opt out" state
+described above. We add a DEFERRABLE state to transactions, which is
+specified and maintained in a way similar to READ ONLY. It is
+ignored for transactions that are not both SERIALIZABLE and READ ONLY.
+
+ * When a transaction must be rolled back, we pick among the
+active transactions such that an immediate retry will not fail again
+on conflicts with the same transactions.
+
+ * We use the PostgreSQL SLRU system to hold summarized
+information about older committed transactions to put an upper bound
+on RAM used. Beyond that limit, information spills to disk.
+Performance can degrade in a pessimal situation, but it should be
+tolerable, and transactions won't need to be canceled or blocked
+from starting.
+
+
+R&D Issues
+----------
+
+This is intended to be the place to record specific issues which need
+more detailed review or analysis.
+
+ * WAL file replay. While serializable implementations using S2PL
+can guarantee that the write-ahead log contains commits in a sequence
+consistent with some serial execution of serializable transactions,
+SSI cannot make that guarantee. While the WAL replay is no less
+consistent than under snapshot isolation, it is possible that under
+PITR recovery or hot standby a database could reach a readable state
+where some transactions appear before other transactions which would
+have had to precede them to maintain serializable consistency. In
+essence, if we do nothing, WAL replay will be at snapshot isolation
+even for serializable transactions. Is this OK? If not, how do we
+address it?
+
+ * External replication. Look at how this impacts external
+replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc.
+This is related to the "WAL file replay" issue.
+
+ * UNIQUE btree search for equality on all columns. Since a search
+of a UNIQUE index using equality tests on all columns will lock the
+heap tuple if an entry is found, it appears that there is no need to
+get a predicate lock on the index in that case. A predicate lock is
+still needed for such a search if a matching index entry which points
+to a visible tuple is not found.
+
+ * Minimize touching of shared memory. Should lists in shared
+memory push entries which have just been returned to the front of the
+available list, so they will be popped back off soon and some memory
+might never be touched, or should we keep adding returned items to
+the end of the available list?
+
+
+References
+----------
+
+[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
+Search for serial execution to find the relevant section.
+
+[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM
+Transactions on Database Systems 30:2, Jun. 2005.
+http://dx.doi.org/10.1145/1071610.1071615
+
+[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007.
+Architecture of a Database System. Foundations and Trends(R) in
+Databases Vol. 1, No. 2 (2007) 141-259.
+http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf
+ Of particular interest:
+ * 6.1 A Note on ACID
+ * 6.2 A Brief Review of Serializability
+ * 6.3 Locking and Latching
+ * 6.3.1 Transaction Isolation Levels
+ * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties
diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier
new file mode 100644
index 0000000..f78e5ac
--- /dev/null
+++ b/src/backend/storage/lmgr/README.barrier
@@ -0,0 +1,197 @@
+Memory Barriers
+===============
+
+Modern CPUs make extensive use of pipe-lining and out-of-order execution,
+meaning that the CPU is often executing more than one instruction at a
+time, and not necessarily in the order that the source code would suggest.
+Furthermore, even before the CPU gets a chance to reorder operations, the
+compiler may (and often does) reorganize the code for greater efficiency,
+particularly at higher optimization levels. Optimizing compilers and
+out-of-order execution are both critical for good performance, but they
+can lead to surprising results when multiple processes access the same
+memory space.
+
+Example
+=======
+
+Suppose x is a pointer to a structure stored in shared memory, and that the
+entire structure has been initialized to zero bytes. One backend executes
+the following code fragment:
+
+ x->foo = 1;
+ x->bar = 1;
+
+Meanwhile, at approximately the same time, another backend executes this
+code fragment:
+
+ bar = x->bar;
+ foo = x->foo;
+
+The second backend might end up with foo = 1 and bar = 1 (if it executes
+both statements after the first backend), or with foo = 0 and bar = 0 (if
+it executes both statements before the first backend), or with foo = 1 and
+bar = 0 (if the first backend executes the first statement, the second
+backend executes both statements, and then the first backend executes the
+second statement).
+
+Surprisingly, however, the second backend could also end up with foo = 0
+and bar = 1. The compiler might swap the order of the two stores performed
+by the first backend, or the two loads performed by the second backend.
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or ARM) the CPU might choose to execute either the loads or the stores
+out of order. This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue. The writer does this:
+
+ q->items[q->num_items] = new_item;
+ ++q->num_items;
+
+The reader does this:
+
+ num_items = q->num_items;
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items. Thus, there's still a bug here *even if the
+writer does everything in the order we expect*. We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment. A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock. For more details, see
+src/backend/storage/lmgr/README. For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it. If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks. In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires. In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result. In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+ q->items[q->num_items] = new_item;
+ pg_memory_barrier();
+ ++q->num_items;
+
+And by having the reader do this:
+
+ num_items = q->num_items;
+ pg_memory_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order. Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient. On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores. It can, however, allow a load to be
+performed before a subsequent store. To avoid emitting unnecessary barrier
+instructions, we provide two additional primitives: pg_read_barrier() and
+pg_write_barrier(). When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier(). pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+ q->items[q->num_items] = new_item;
+ pg_write_barrier();
+ ++q->num_items;
+
+And the reader can do this:
+
+ num_items = q->num_items;
+ pg_read_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required. Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks. Here are some of the problems.
+
+1. Concurrent writers are unsafe. In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time. If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+ x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does their writes. Such a case could be made safe by using an
+atomic variable and an atomic add. See port/atomics.h.
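+
+For example, if foo were declared as a pg_atomic_uint32 (and initialized
+with pg_atomic_init_u32() when the shared structure was created), each
+writer could instead do:
+
+	pg_atomic_fetch_add_u32(&x->foo, 1);
+
+This is only a sketch; see port/atomics.h for the full set of atomic
+operations and their exact semantics.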
+
+2. Eight-byte loads and stores aren't necessarily atomic. We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for an eight-byte value: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations. If
+you need an atomic eight-byte read or write, you must either serialize access
+with a lock or use an atomic variable.
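+
+For instance, a shared eight-byte counter could be declared as a
+pg_atomic_uint64 and accessed only through the atomics API, which falls
+back to emulation on platforms lacking native 64-bit atomics (sketch only;
+"shared" is an assumed pointer to the containing shared structure):
+
+	pg_atomic_init_u64(&shared->counter, 0);	/* once, at creation */
+
+	pg_atomic_write_u64(&shared->counter, newval);	/* writer */
+	val = pg_atomic_read_u64(&shared->counter);	/* reader */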
+
+3. No ordering guarantees. While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization. In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer. If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation. Many algorithms that at first seem appealing
+require multiple barriers. If the number of barriers required is more than
+one or two, you may be better off just using a lock. Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock. This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
+
+Further Reading
+===============
+
+Much of the documentation about memory barriers appears to be quite
+Linux-specific. The following papers may be helpful:
+
+Memory Ordering in Modern Microprocessors, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
+
+Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf
+
+The Linux kernel also has some useful documentation on this topic. Start
+with Documentation/memory-barriers.txt
diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
new file mode 100644
index 0000000..80d70c1
--- /dev/null
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -0,0 +1,364 @@
+/*-------------------------------------------------------------------------
+ *
+ * condition_variable.c
+ * Implementation of condition variables. Condition variables provide
+ * a way for one process to wait until a specific condition occurs,
+ * without needing to know the specific identity of the process for
+ * which they are waiting. Waits for condition variables can be
+ * interrupted, unlike LWLock waits. Condition variables are safe
+ * to use within dynamic shared memory segments.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/lmgr/condition_variable.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/* Initially, we are not prepared to sleep on any condition variable. */
+static ConditionVariable *cv_sleep_target = NULL;
+
+/*
+ * Initialize a condition variable.
+ */
+void
+ConditionVariableInit(ConditionVariable *cv)
+{
+ SpinLockInit(&cv->mutex);
+ proclist_init(&cv->wakeup);
+}
+
+/*
+ * Prepare to wait on a given condition variable.
+ *
+ * This can optionally be called before entering a test/sleep loop.
+ * Doing so is more efficient if we'll need to sleep at least once.
+ * However, if the first test of the exit condition is likely to succeed,
+ * it's more efficient to omit the ConditionVariablePrepareToSleep call.
+ * See comments in ConditionVariableSleep for more detail.
+ *
+ * Caution: "before entering the loop" means you *must* test the exit
+ * condition between calling ConditionVariablePrepareToSleep and calling
+ * ConditionVariableSleep. If that is inconvenient, omit calling
+ * ConditionVariablePrepareToSleep.
+ */
+void
+ConditionVariablePrepareToSleep(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+
+ /*
+ * If some other sleep is already prepared, cancel it; this is necessary
+ * because we have just one static variable tracking the prepared sleep,
+ * and also only one cvWaitLink in our PGPROC. It's okay to do this
+ * because whenever control does return to the other test-and-sleep loop,
+ * its ConditionVariableSleep call will just re-establish that sleep as
+ * the prepared one.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /* Record the condition variable on which we will sleep. */
+ cv_sleep_target = cv;
+
+ /* Add myself to the wait queue. */
+ SpinLockAcquire(&cv->mutex);
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+}
+
+/*
+ * Wait for the given condition variable to be signaled.
+ *
+ * This should be called in a predicate loop that tests for a specific exit
+ * condition and otherwise sleeps, like so:
+ *
+ * ConditionVariablePrepareToSleep(cv); // optional
+ * while (condition for which we are waiting is not true)
+ * ConditionVariableSleep(cv, wait_event_info);
+ * ConditionVariableCancelSleep();
+ *
+ * wait_event_info should be a value from one of the WaitEventXXX enums
+ * defined in pgstat.h. This controls the contents of pg_stat_activity's
+ * wait_event_type and wait_event columns while waiting.
+ */
+void
+ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
+{
+ (void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ ,
+ wait_event_info);
+}
+
+/*
+ * Wait for a condition variable to be signaled or a timeout to be reached.
+ *
+ * Returns true when timeout expires, otherwise returns false.
+ *
+ * See ConditionVariableSleep() for general usage.
+ */
+bool
+ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
+ uint32 wait_event_info)
+{
+ long cur_timeout = -1;
+ instr_time start_time;
+ instr_time cur_time;
+ int wait_events;
+
+ /*
+ * If the caller didn't prepare to sleep explicitly, then do so now and
+ * return immediately. The caller's predicate loop should immediately
+ * call again if its exit condition is not yet met. This will result in
+ * the exit condition being tested twice before we first sleep. The extra
+ * test can be prevented by calling ConditionVariablePrepareToSleep(cv)
+ * first. Whether it's worth doing that depends on whether you expect the
+ * exit condition to be met initially, in which case skipping the prepare
+ * is recommended because it avoids manipulations of the wait list, or not
+ * met initially, in which case preparing first is better because it
+ * avoids one extra test of the exit condition.
+ *
+ * If we are currently prepared to sleep on some other CV, we just cancel
+ * that and prepare this one; see ConditionVariablePrepareToSleep.
+ */
+ if (cv_sleep_target != cv)
+ {
+ ConditionVariablePrepareToSleep(cv);
+ return false;
+ }
+
+ /*
+ * Record the current time so that we can calculate the remaining timeout
+ * if we are woken up spuriously.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
+ }
+ else
+ wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
+
+ while (true)
+ {
+ bool done = false;
+
+ /*
+ * Wait for latch to be set. (If we're awakened for some other
+ * reason, the code below will cope anyway.)
+ */
+ (void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
+
+ /* Reset latch before examining the state of the wait list. */
+ ResetLatch(MyLatch);
+
+ /*
+ * If this process has been taken out of the wait list, then we know
+ * that it has been signaled by ConditionVariableSignal (or
+ * ConditionVariableBroadcast), so we should return to the caller. But
+ * that doesn't guarantee that the exit condition is met, only that we
+ * ought to check it. So we must put the process back into the wait
+ * list, to ensure we don't miss any additional wakeup occurring while
+ * the caller checks its exit condition. We can take ourselves out of
+ * the wait list only when the caller calls
+ * ConditionVariableCancelSleep.
+ *
+ * If we're still in the wait list, then the latch must have been set
+ * by something other than ConditionVariableSignal; though we don't
+ * guarantee not to return spuriously, we'll avoid this obvious case.
+ */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ {
+ done = true;
+ proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /*
+ * Check for interrupts, and return spuriously if that caused the
+ * current sleep target to change (meaning that interrupt handler code
+ * waited for a different condition variable).
+ */
+ CHECK_FOR_INTERRUPTS();
+ if (cv != cv_sleep_target)
+ done = true;
+
+ /* We were signaled, so return */
+ if (done)
+ return false;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time);
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+
+ /* Have we crossed the timeout threshold? */
+ if (cur_timeout <= 0)
+ return true;
+ }
+ }
+}
+
+/*
+ * Cancel any pending sleep operation.
+ *
+ * We just need to remove ourselves from the wait queue of any condition
+ * variable for which we have previously prepared a sleep.
+ *
+ * Do nothing if nothing is pending; this allows this function to be called
+ * during transaction abort to clean up any unfinished CV sleep.
+ */
+void
+ConditionVariableCancelSleep(void)
+{
+ ConditionVariable *cv = cv_sleep_target;
+ bool signaled = false;
+
+ if (cv == NULL)
+ return;
+
+ SpinLockAcquire(&cv->mutex);
+ if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ else
+ signaled = true;
+ SpinLockRelease(&cv->mutex);
+
+ /*
+ * If we've received a signal, pass it on to another waiting process, if
+ * there is one. Otherwise a call to ConditionVariableSignal() might get
+ * lost, despite there being another process ready to handle it.
+ */
+ if (signaled)
+ ConditionVariableSignal(cv);
+
+ cv_sleep_target = NULL;
+}
+
+/*
+ * Wake up the oldest process sleeping on the CV, if there is any.
+ *
+ * Note: it's difficult to tell whether this has any real effect: we know
+ * whether we took an entry off the list, but the entry might only be a
+ * sentinel. Hence, think twice before proposing that this should return
+ * a flag telling whether it woke somebody.
+ */
+void
+ConditionVariableSignal(ConditionVariable *cv)
+{
+ PGPROC *proc = NULL;
+
+ /* Remove the first process from the wakeup queue (if any). */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ /* If we found someone sleeping, set their latch to wake them up. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+}
+
+/*
+ * Wake up all processes sleeping on the given CV.
+ *
+ * This guarantees to wake all processes that were sleeping on the CV
+ * at time of call, but processes that add themselves to the list mid-call
+ * will typically not get awakened.
+ */
+void
+ConditionVariableBroadcast(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+ PGPROC *proc = NULL;
+ bool have_sentinel = false;
+
+ /*
+ * In some use-cases, it is common for awakened processes to immediately
+ * re-queue themselves. If we just naively try to reduce the wakeup list
+ * to empty, we'll get into a potentially-indefinite loop against such a
+ * process. The semantics we really want are just to be sure that we have
+ * wakened all processes that were in the list at entry. We can use our
+ * own cvWaitLink as a sentinel to detect when we've finished.
+ *
+ * A seeming flaw in this approach is that someone else might signal the
+ * CV and in doing so remove our sentinel entry. But that's fine: since
+ * CV waiters are always added and removed in order, that must mean that
+ * every previous waiter has been wakened, so we're done. We'll get an
+ * extra "set" on our latch from the someone else's signal, which is
+ * slightly inefficient but harmless.
+ *
+ * We can't insert our cvWaitLink as a sentinel if it's already in use in
+ * some other proclist. While that's not expected to be true for typical
+ * uses of this function, we can deal with it by simply canceling any
+ * prepared CV sleep. The next call to ConditionVariableSleep will take
+ * care of re-establishing the lost state.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /*
+ * Inspect the state of the queue. If it's empty, we have nothing to do.
+ * If there's exactly one entry, we need only remove and signal that
+ * entry. Otherwise, remove the first entry and insert our sentinel.
+ */
+ SpinLockAcquire(&cv->mutex);
+ /* While we're here, let's assert we're not in the list. */
+ Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink));
+
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ have_sentinel = true;
+ }
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /* Awaken first waiter, if there was one. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+
+ while (have_sentinel)
+ {
+ /*
+ * Each time through the loop, remove the first wakeup list entry, and
+ * signal it unless it's our sentinel. Repeat as long as the sentinel
+ * remains in the list.
+ *
+ * Notice that if someone else removes our sentinel, we will waken one
+ * additional process before exiting. That's intentional, because if
+ * someone else signals the CV, they may be intending to waken some
+ * third process that added itself to the list after we added the
+ * sentinel. Better to give a spurious wakeup (which should be
+ * harmless beyond wasting some cycles) than to lose a wakeup.
+ */
+ proc = NULL;
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ if (proc != NULL && proc != MyProc)
+ SetLatch(&proc->procLatch);
+ }
+}
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
new file mode 100644
index 0000000..67733c0
--- /dev/null
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -0,0 +1,1177 @@
+/*-------------------------------------------------------------------------
+ *
+ * deadlock.c
+ * POSTGRES deadlock detection code
+ *
+ * See src/backend/storage/lmgr/README for a description of the deadlock
+ * detection and resolution algorithms.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/deadlock.c
+ *
+ * Interface:
+ *
+ * DeadLockCheck()
+ * DeadLockReport()
+ * RememberSimpleDeadLock()
+ * InitDeadLockChecking()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * One edge in the waits-for graph.
+ *
+ * waiter and blocker may or may not be members of a lock group, but if either
+ * is, it will be the leader rather than any other member of the lock group.
+ * The group leaders act as representatives of the whole group even though
+ * those particular processes need not be waiting at all. There will be at
+ * least one member of the waiter's lock group on the wait queue for the given
+ * lock, maybe more.
+ */
+typedef struct
+{
+ PGPROC *waiter; /* the leader of the waiting lock group */
+ PGPROC *blocker; /* the leader of the group it is waiting for */
+ LOCK *lock; /* the lock being waited for */
+ int pred; /* workspace for TopoSort */
+ int link; /* workspace for TopoSort */
+} EDGE;
+
+/* One potential reordering of a lock's wait queue */
+typedef struct
+{
+ LOCK *lock; /* the lock whose wait queue is described */
+ PGPROC **procs; /* array of PGPROC *'s in new wait order */
+ int nProcs;
+} WAIT_ORDER;
+
+/*
+ * Information saved about each edge in a detected deadlock cycle. This
+ * is used to print a diagnostic message upon failure.
+ *
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
+ */
+typedef struct
+{
+ LOCKTAG locktag; /* ID of awaited lock object */
+ LOCKMODE lockmode; /* type of lock we're waiting for */
+ int pid; /* PID of blocked backend */
+} DEADLOCK_INFO;
+
+
+static bool DeadLockCheckRecurse(PGPROC *proc);
+static int TestConfiguration(PGPROC *startProc);
+static bool FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurse(PGPROC *checkProc, int depth,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth, EDGE *softEdges, int *nSoftEdges);
+static bool ExpandConstraints(EDGE *constraints, int nConstraints);
+static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints,
+ PGPROC **ordering);
+
+#ifdef DEBUG_DEADLOCK
+static void PrintLockQueue(LOCK *lock, const char *info);
+#endif
+
+
+/*
+ * Working space for the deadlock detector
+ */
+
+/* Workspace for FindLockCycle */
+static PGPROC **visitedProcs; /* Array of visited procs */
+static int nVisitedProcs;
+
+/* Workspace for TopoSort */
+static PGPROC **topoProcs; /* Array of not-yet-output procs */
+static int *beforeConstraints; /* Counts of remaining before-constraints */
+static int *afterConstraints; /* List head for after-constraints */
+
+/* Output area for ExpandConstraints */
+static WAIT_ORDER *waitOrders; /* Array of proposed queue rearrangements */
+static int nWaitOrders;
+static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */
+
+/* Current list of constraints being considered */
+static EDGE *curConstraints;
+static int nCurConstraints;
+static int maxCurConstraints;
+
+/* Storage space for results from FindLockCycle */
+static EDGE *possibleConstraints;
+static int nPossibleConstraints;
+static int maxPossibleConstraints;
+static DEADLOCK_INFO *deadlockDetails;
+static int nDeadlockDetails;
+
+/* PGPROC pointer of any blocking autovacuum worker found */
+static PGPROC *blocking_autovacuum_proc = NULL;
+
+
+/*
+ * InitDeadLockChecking -- initialize deadlock checker during backend startup
+ *
+ * This does per-backend initialization of the deadlock checker; primarily,
+ * allocation of working memory for DeadLockCheck. We do this per-backend
+ * since there's no percentage in making the kernel do copy-on-write
+ * inheritance of workspace from the postmaster. We want to allocate the
+ * space at startup because (a) the deadlock checker might be invoked when
+ * there's no free memory left, and (b) the checker is normally run inside a
+ * signal handler, which is a very dangerous place to invoke palloc from.
+ */
+void
+InitDeadLockChecking(void)
+{
+ MemoryContext oldcxt;
+
+ /* Make sure allocations are permanent */
+ oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and
+ * deadlockDetails[].
+ */
+ visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+ deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO));
+
+ /*
+ * TopoSort needs to consider at most MaxBackends wait-queue entries, and
+ * it needn't run concurrently with FindLockCycle.
+ */
+ topoProcs = visitedProcs; /* re-use this space */
+ beforeConstraints = (int *) palloc(MaxBackends * sizeof(int));
+ afterConstraints = (int *) palloc(MaxBackends * sizeof(int));
+
+ /*
+ * We need to consider rearranging at most MaxBackends/2 wait queues
+ * (since it takes at least two waiters in a queue to create a soft edge),
+ * and the expanded form of the wait queues can't involve more than
+ * MaxBackends total waiters.
+ */
+ waitOrders = (WAIT_ORDER *)
+ palloc((MaxBackends / 2) * sizeof(WAIT_ORDER));
+ waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+
+ /*
+ * Allow at most MaxBackends distinct constraints in a configuration. (Is
+ * this enough? In practice it seems it should be, but I don't quite see
+ * how to prove it. If we run out, we might fail to find a workable wait
+ * queue rearrangement even though one exists.) NOTE that this number
+ * limits the maximum recursion depth of DeadLockCheckRecurse. Making it
+ * really big might potentially allow a stack-overflow problem.
+ */
+ maxCurConstraints = MaxBackends;
+ curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE));
+
+ /*
+ * Allow up to 3*MaxBackends constraints to be saved without having to
+ * re-run TestConfiguration. (This is probably more than enough, but we
+ * can survive if we run low on space by doing excess runs of
+ * TestConfiguration to re-compute constraint lists each time needed.) The
+ * last MaxBackends entries in possibleConstraints[] are reserved as
+ * output workspace for FindLockCycle.
+ */
+ maxPossibleConstraints = MaxBackends * 4;
+ possibleConstraints =
+ (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE));
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * DeadLockCheck -- Checks for deadlocks for a given process
+ *
+ * This code looks for deadlocks involving the given process. If any
+ * are found, it tries to rearrange lock wait queues to resolve the
+ * deadlock. If resolution is impossible, return DS_HARD_DEADLOCK ---
+ * the caller is then expected to abort the given proc's transaction.
+ *
+ * Caller must already have locked all partitions of the lock tables.
+ *
+ * On failure, deadlock details are recorded in deadlockDetails[] for
+ * subsequent printing by DeadLockReport(). That activity is separate
+ * because (a) we don't want to do it while holding all those LWLocks,
+ * and (b) we are typically invoked inside a signal handler.
+ */
+DeadLockState
+DeadLockCheck(PGPROC *proc)
+{
+ int i,
+ j;
+
+ /* Initialize to "no constraints" */
+ nCurConstraints = 0;
+ nPossibleConstraints = 0;
+ nWaitOrders = 0;
+
+ /* Initialize to not blocked by an autovacuum worker */
+ blocking_autovacuum_proc = NULL;
+
+ /* Search for deadlocks and possible fixes */
+ if (DeadLockCheckRecurse(proc))
+ {
+ /*
+ * Call FindLockCycle one more time, to record the correct
+ * deadlockDetails[] for the basic state with no rearrangements.
+ */
+ int nSoftEdges;
+
+ TRACE_POSTGRESQL_DEADLOCK_FOUND();
+
+ nWaitOrders = 0;
+ if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges))
+ elog(FATAL, "deadlock seems to have disappeared");
+
+ return DS_HARD_DEADLOCK; /* cannot find a non-deadlocked state */
+ }
+
+ /* Apply any needed rearrangements of wait queues */
+ for (i = 0; i < nWaitOrders; i++)
+ {
+ LOCK *lock = waitOrders[i].lock;
+ PGPROC **procs = waitOrders[i].procs;
+ int nProcs = waitOrders[i].nProcs;
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+
+ Assert(nProcs == waitQueue->size);
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "DeadLockCheck:");
+#endif
+
+ /* Reset the queue and re-add procs in the desired order */
+ ProcQueueInit(waitQueue);
+ for (j = 0; j < nProcs; j++)
+ {
+ SHMQueueInsertBefore(&(waitQueue->links), &(procs[j]->links));
+ waitQueue->size++;
+ }
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "rearranged to:");
+#endif
+
+ /* See if any waiters for the lock can be woken up now */
+ ProcLockWakeup(GetLocksMethodTable(lock), lock);
+ }
+
+ /* Return code tells caller if we had to escape a deadlock or not */
+ if (nWaitOrders > 0)
+ return DS_SOFT_DEADLOCK;
+ else if (blocking_autovacuum_proc != NULL)
+ return DS_BLOCKED_BY_AUTOVACUUM;
+ else
+ return DS_NO_DEADLOCK;
+}
+
+/*
+ * Return the PGPROC of the autovacuum that's blocking a process.
+ *
+ * We reset the saved pointer as soon as we pass it back.
+ */
+PGPROC *
+GetBlockingAutoVacuumPgproc(void)
+{
+ PGPROC *ptr;
+
+ ptr = blocking_autovacuum_proc;
+ blocking_autovacuum_proc = NULL;
+
+ return ptr;
+}
+
+/*
+ * DeadLockCheckRecurse -- recursively search for valid orderings
+ *
+ * curConstraints[] holds the current set of constraints being considered
+ * by an outer level of recursion. Add to this each possible solution
+ * constraint for any cycle detected at this level.
+ *
+ * Returns true if no solution exists. Returns false if a deadlock-free
+ * state is attainable, in which case waitOrders[] shows the required
+ * rearrangements of lock wait queues (if any).
+ */
+static bool
+DeadLockCheckRecurse(PGPROC *proc)
+{
+ int nEdges;
+ int oldPossibleConstraints;
+ bool savedList;
+ int i;
+
+ nEdges = TestConfiguration(proc);
+ if (nEdges < 0)
+ return true; /* hard deadlock --- no solution */
+ if (nEdges == 0)
+ return false; /* good configuration found */
+ if (nCurConstraints >= maxCurConstraints)
+ return true; /* out of room for active constraints? */
+ oldPossibleConstraints = nPossibleConstraints;
+ if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints)
+ {
+ /* We can save the edge list in possibleConstraints[] */
+ nPossibleConstraints += nEdges;
+ savedList = true;
+ }
+ else
+ {
+ /* Not room; will need to regenerate the edges on-the-fly */
+ savedList = false;
+ }
+
+ /*
+ * Try each available soft edge as an addition to the configuration.
+ */
+ for (i = 0; i < nEdges; i++)
+ {
+ if (!savedList && i > 0)
+ {
+ /* Regenerate the list of possible added constraints */
+ if (nEdges != TestConfiguration(proc))
+ elog(FATAL, "inconsistent results during deadlock check");
+ }
+ curConstraints[nCurConstraints] =
+ possibleConstraints[oldPossibleConstraints + i];
+ nCurConstraints++;
+ if (!DeadLockCheckRecurse(proc))
+ return false; /* found a valid solution! */
+ /* give up on that added constraint, try again */
+ nCurConstraints--;
+ }
+ nPossibleConstraints = oldPossibleConstraints;
+ return true; /* no solution found */
+}
+
+
+/*--------------------
+ * Test a configuration (current set of constraints) for validity.
+ *
+ * Returns:
+ * 0: the configuration is good (no deadlocks)
+ * -1: the configuration has a hard deadlock or is not self-consistent
+ * >0: the configuration has one or more soft deadlocks
+ *
+ * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily
+ * and a list of its soft edges is returned beginning at
+ * possibleConstraints+nPossibleConstraints. The return value is the
+ * number of soft edges.
+ *--------------------
+ */
+static int
+TestConfiguration(PGPROC *startProc)
+{
+ int softFound = 0;
+ EDGE *softEdges = possibleConstraints + nPossibleConstraints;
+ int nSoftEdges;
+ int i;
+
+ /*
+ * Make sure we have room for FindLockCycle's output.
+ */
+ if (nPossibleConstraints + MaxBackends > maxPossibleConstraints)
+ return -1;
+
+ /*
+ * Expand current constraint set into wait orderings. Fail if the
+ * constraint set is not self-consistent.
+ */
+ if (!ExpandConstraints(curConstraints, nCurConstraints))
+ return -1;
+
+ /*
+ * Check for cycles involving startProc or any of the procs mentioned in
+ * constraints. We check startProc last because if it has a soft cycle
+ * still to be dealt with, we want to deal with that first.
+ */
+ for (i = 0; i < nCurConstraints; i++)
+ {
+ if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ }
+ if (FindLockCycle(startProc, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ return softFound;
+}
+
+
+/*
+ * FindLockCycle -- basic check for deadlock cycles
+ *
+ * Scan outward from the given proc to see if there is a cycle in the
+ * waits-for graph that includes this proc. Return true if a cycle
+ * is found, else false. If a cycle is found, we return a list of
+ * the "soft edges", if any, included in the cycle. These edges could
+ * potentially be eliminated by rearranging wait queues. We also fill
+ * deadlockDetails[] with information about the detected cycle; this info
+ * is not used by the deadlock algorithm itself, only to print a useful
+ * message after failing.
+ *
+ * Since we need to be able to check hypothetical configurations that would
+ * exist after wait queue rearrangement, the routine pays attention to the
+ * table of hypothetical queue orders in waitOrders[]. These orders will
+ * be believed in preference to the actual ordering seen in the locktable.
+ */
+static bool
+FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ nVisitedProcs = 0;
+ nDeadlockDetails = 0;
+ *nSoftEdges = 0;
+ return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges);
+}
+
+static bool
+FindLockCycleRecurse(PGPROC *checkProc,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ int i;
+ dlist_iter iter;
+
+ /*
+ * If this process is a lock group member, check the leader instead. (Note
+ * that we might be the leader, in which case this is a no-op.)
+ */
+ if (checkProc->lockGroupLeader != NULL)
+ checkProc = checkProc->lockGroupLeader;
+
+ /*
+ * Have we already seen this proc?
+ */
+ for (i = 0; i < nVisitedProcs; i++)
+ {
+ if (visitedProcs[i] == checkProc)
+ {
+ /* If we return to starting point, we have a deadlock cycle */
+ if (i == 0)
+ {
+ /*
+ * record total length of cycle --- outer levels will now fill
+ * deadlockDetails[]
+ */
+ Assert(depth <= MaxBackends);
+ nDeadlockDetails = depth;
+
+ return true;
+ }
+
+ /*
+ * Otherwise, we have a cycle but it does not include the start
+ * point, so say "no deadlock".
+ */
+ return false;
+ }
+ }
+ /* Mark proc as seen */
+ Assert(nVisitedProcs < MaxBackends);
+ visitedProcs[nVisitedProcs++] = checkProc;
+
+ /*
+ * If the process is waiting, there is an outgoing waits-for edge to each
+ * process that blocks it.
+ */
+ if (checkProc->links.next != NULL && checkProc->waitLock != NULL &&
+ FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+
+ /*
+ * If the process is not waiting, there could still be outgoing waits-for
+ * edges if it is part of a lock group, because other members of the lock
+ * group might be waiting even though this process is not. (Given lock
+ * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2,
+ * that is a deadlock even if neither B1 nor A2 is waiting for anything.)
+ */
+ dlist_foreach(iter, &checkProc->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+
+ if (memberProc->links.next != NULL && memberProc->waitLock != NULL &&
+ memberProc != checkProc &&
+ FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ PGPROC *proc;
+ LOCK *lock = checkProc->waitLock;
+ PROCLOCK *proclock;
+ SHM_QUEUE *procLocks;
+ LockMethod lockMethodTable;
+ PROC_QUEUE *waitQueue;
+ int queue_size;
+ int conflictMask;
+ int i;
+ int numLockModes,
+ lm;
+
+ /*
+ * The relation extension or page lock can never participate in an actual
+ * deadlock cycle. See Asserts in LockAcquireExtended. So, there is no
+ * advantage in checking wait edges from them.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
+ (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
+ return false;
+
+ lockMethodTable = GetLocksMethodTable(lock);
+ numLockModes = lockMethodTable->numLockModes;
+ conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode];
+
+ /*
+ * Scan for procs that already hold conflicting locks. These are "hard"
+ * edges in the waits-for graph.
+ */
+ procLocks = &(lock->procLocks);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ PGPROC *leader;
+
+ proc = proclock->tag.myProc;
+ leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader;
+
+ /* A proc never blocks itself or any other lock group member */
+ if (leader != checkProcLeader)
+ {
+ for (lm = 1; lm <= numLockModes; lm++)
+ {
+ if ((proclock->holdMask & LOCKBIT_ON(lm)) &&
+ (conflictMask & LOCKBIT_ON(lm)))
+ {
+ /* This proc hard-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ return true;
+ }
+
+ /*
+ * No deadlock here, but see if this proc is an autovacuum
+ * that is directly hard-blocking our own proc. If so,
+ * report it so that the caller can send a cancel signal
+ * to it, if appropriate. If there's more than one such
+ * proc, it's indeterminate which one will be reported.
+ *
+ * We don't touch autovacuums that are indirectly blocking
+ * us; it's up to the direct blockee to take action. This
+ * rule simplifies understanding the behavior and ensures
+ * that an autovacuum won't be canceled with less than
+ * deadlock_timeout grace period.
+ *
+ * Note we read statusFlags without any locking. This is
+ * OK only for checking the PROC_IS_AUTOVACUUM flag,
+ * because that flag is set at process start and never
+ * reset. There is logic elsewhere to avoid canceling an
+ * autovacuum that is working to prevent XID wraparound
+ * problems (which needs to read a different statusFlags
+ * bit), but we don't do that here to avoid grabbing
+ * ProcArrayLock.
+ */
+ if (checkProc == MyProc &&
+ proc->statusFlags & PROC_IS_AUTOVACUUM)
+ blocking_autovacuum_proc = proc;
+
+ /* We're done looking at this proclock */
+ break;
+ }
+ }
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /*
+ * Scan for procs that are ahead of this one in the lock's wait queue.
+ * Those that have conflicting requests soft-block this one. This must be
+ * done after the hard-block search, since if another proc both hard- and
+ * soft-blocks this one, we want to call it a hard edge.
+ *
+ * If there is a proposed re-ordering of the lock's wait order, use that
+ * rather than the current wait order.
+ */
+ for (i = 0; i < nWaitOrders; i++)
+ {
+ if (waitOrders[i].lock == lock)
+ break;
+ }
+
+ if (i < nWaitOrders)
+ {
+ /* Use the given hypothetical wait queue order */
+ PGPROC **procs = waitOrders[i].procs;
+
+ queue_size = waitOrders[i].nProcs;
+
+ for (i = 0; i < queue_size; i++)
+ {
+ PGPROC *leader;
+
+ proc = procs[i];
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /*
+ * TopoSort will always return an ordering with group members
+ * adjacent to each other in the wait queue (see comments
+ * therein). So, as soon as we reach a process in the same lock
+ * group as checkProc, we know we've found all the conflicts that
+ * precede any member of the lock group led by checkProcLeader.
+ */
+ if (leader == checkProcLeader)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+ }
+ }
+ else
+ {
+ PGPROC *lastGroupMember = NULL;
+
+ /* Use the true lock wait queue order */
+ waitQueue = &(lock->waitProcs);
+
+ /*
+ * Find the last member of the lock group that is present in the wait
+ * queue. Anything after this is not a soft lock conflict. If group
+ * locking is not in use, then we know immediately which process we're
+ * looking for, but otherwise we've got to search the wait queue to
+ * find the last process actually present.
+ */
+ if (checkProc->lockGroupLeader == NULL)
+ lastGroupMember = checkProc;
+ else
+ {
+ proc = (PGPROC *) waitQueue->links.next;
+ queue_size = waitQueue->size;
+ while (queue_size-- > 0)
+ {
+ if (proc->lockGroupLeader == checkProcLeader)
+ lastGroupMember = proc;
+ proc = (PGPROC *) proc->links.next;
+ }
+ Assert(lastGroupMember != NULL);
+ }
+
+ /*
+ * OK, now rescan (or scan) the queue to identify the soft conflicts.
+ */
+ queue_size = waitQueue->size;
+ proc = (PGPROC *) waitQueue->links.next;
+ while (queue_size-- > 0)
+ {
+ PGPROC *leader;
+
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /* Done when we reach the target proc */
+ if (proc == lastGroupMember)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 &&
+ leader != checkProcLeader)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+
+ proc = (PGPROC *) proc->links.next;
+ }
+ }
+
+ /*
+ * No conflict detected here.
+ */
+ return false;
+}
+
+
+/*
+ * ExpandConstraints -- expand a list of constraints into a set of
+ * specific new orderings for affected wait queues
+ *
+ * Input is a list of soft edges to be reversed. The output is a list
+ * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array
+ * workspace in waitOrderProcs[].
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+ExpandConstraints(EDGE *constraints,
+ int nConstraints)
+{
+ int nWaitOrderProcs = 0;
+ int i,
+ j;
+
+ nWaitOrders = 0;
+
+ /*
+ * Scan constraint list backwards. This is because the last-added
+ * constraint is the only one that could fail, and so we want to test it
+ * for inconsistency first.
+ */
+ for (i = nConstraints; --i >= 0;)
+ {
+ LOCK *lock = constraints[i].lock;
+
+ /* Did we already make a list for this lock? */
+ for (j = nWaitOrders; --j >= 0;)
+ {
+ if (waitOrders[j].lock == lock)
+ break;
+ }
+ if (j >= 0)
+ continue;
+ /* No, so allocate a new list */
+ waitOrders[nWaitOrders].lock = lock;
+ waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs;
+ waitOrders[nWaitOrders].nProcs = lock->waitProcs.size;
+ nWaitOrderProcs += lock->waitProcs.size;
+ Assert(nWaitOrderProcs <= MaxBackends);
+
+ /*
+ * Do the topo sort. TopoSort need not examine constraints after this
+ * one, since they must be for different locks.
+ */
+ if (!TopoSort(lock, constraints, i + 1,
+ waitOrders[nWaitOrders].procs))
+ return false;
+ nWaitOrders++;
+ }
+ return true;
+}
+
+
+/*
+ * TopoSort -- topological sort of a wait queue
+ *
+ * Generate a re-ordering of a lock's wait queue that satisfies given
+ * constraints about certain procs preceding others. (Each such constraint
+ * is a fact of a partial ordering.) Minimize rearrangement of the queue
+ * not needed to achieve the partial ordering.
+ *
+ * This is a lot simpler and slower than, for example, the topological sort
+ * algorithm shown in Knuth's Volume 1. However, Knuth's method doesn't
+ * try to minimize the damage to the existing order. In practice we are
+ * not likely to be working with more than a few constraints, so the apparent
+ * slowness of the algorithm won't really matter.
+ *
+ * The initial queue ordering is taken directly from the lock's wait queue.
+ * The output is an array of PGPROC pointers, of length equal to the lock's
+ * wait queue length (the caller is responsible for providing this space).
+ * The partial order is specified by an array of EDGE structs. Each EDGE
+ * is one that we need to reverse, therefore the "waiter" must appear before
+ * the "blocker" in the output array. The EDGE array may well contain
+ * edges associated with other locks; these should be ignored.
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+TopoSort(LOCK *lock,
+ EDGE *constraints,
+ int nConstraints,
+ PGPROC **ordering) /* output argument */
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ int i,
+ j,
+ jj,
+ k,
+ kk,
+ last;
+
+ /* First, fill topoProcs[] array with the procs in their current order */
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ topoProcs[i] = proc;
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ /*
+ * Scan the constraints, and for each proc in the array, generate a count
+ * of the number of constraints that say it must be before something else,
+ * plus a list of the constraints that say it must be after something
+ * else. The count for the j'th proc is stored in beforeConstraints[j],
+ * and the head of its list in afterConstraints[j]. Each constraint
+ * stores its list link in constraints[i].link (note any constraint will
+ * be in just one list). The array index for the before-proc of the i'th
+ * constraint is remembered in constraints[i].pred.
+ *
+ * Note that it's not necessarily the case that every constraint affects
+ * this particular wait queue. Prior to group locking, a process could be
+ * waiting for at most one lock. But a lock group can be waiting for
+ * zero, one, or multiple locks. Since topoProcs[] is an array of the
+ * processes actually waiting, while constraints[] is an array of group
+ * leaders, we've got to scan through topoProcs[] for each constraint,
+ * checking whether both a waiter and a blocker for that group are
+ * present. If so, the constraint is relevant to this wait queue; if not,
+ * it isn't.
+ */
+ MemSet(beforeConstraints, 0, queue_size * sizeof(int));
+ MemSet(afterConstraints, 0, queue_size * sizeof(int));
+ for (i = 0; i < nConstraints; i++)
+ {
+ /*
+ * Find a representative process that is on the lock queue and part of
+ * the waiting lock group. This may or may not be the leader, which
+ * may or may not be waiting at all. If there are any other processes
+ * in the same lock group on the queue, set their number of
+ * beforeConstraints to -1 to indicate that they should be emitted
+ * with their groupmates rather than considered separately.
+ *
+ * In this loop and the similar one just below, it's critical that we
+ * consistently select the same representative member of any one lock
+ * group, so that all the constraints are associated with the same
+ * proc, and the -1's are only associated with not-representative
+ * members. We select the last one in the topoProcs array.
+ */
+ proc = constraints[i].waiter;
+ Assert(proc != NULL);
+ jj = -1;
+ for (j = queue_size; --j >= 0;)
+ {
+ PGPROC *waiter = topoProcs[j];
+
+ if (waiter == proc || waiter->lockGroupLeader == proc)
+ {
+ Assert(waiter->waitLock == lock);
+ if (jj == -1)
+ jj = j;
+ else
+ {
+ Assert(beforeConstraints[j] <= 0);
+ beforeConstraints[j] = -1;
+ }
+ }
+ }
+
+ /* If no matching waiter, constraint is not relevant to this lock. */
+ if (jj < 0)
+ continue;
+
+ /*
+ * Similarly, find a representative process that is on the lock queue
+ * and waiting for the blocking lock group. Again, this could be the
+ * leader but does not need to be.
+ */
+ proc = constraints[i].blocker;
+ Assert(proc != NULL);
+ kk = -1;
+ for (k = queue_size; --k >= 0;)
+ {
+ PGPROC *blocker = topoProcs[k];
+
+ if (blocker == proc || blocker->lockGroupLeader == proc)
+ {
+ Assert(blocker->waitLock == lock);
+ if (kk == -1)
+ kk = k;
+ else
+ {
+ Assert(beforeConstraints[k] <= 0);
+ beforeConstraints[k] = -1;
+ }
+ }
+ }
+
+ /* If no matching blocker, constraint is not relevant to this lock. */
+ if (kk < 0)
+ continue;
+
+ Assert(beforeConstraints[jj] >= 0);
+ beforeConstraints[jj]++; /* waiter must come before */
+ /* add this constraint to list of after-constraints for blocker */
+ constraints[i].pred = jj;
+ constraints[i].link = afterConstraints[kk];
+ afterConstraints[kk] = i + 1;
+ }
+
+ /*--------------------
+ * Now scan the topoProcs array backwards. At each step, output the
+ * last proc that has no remaining before-constraints plus any other
+ * members of the same lock group; then decrease the beforeConstraints
+ * count of each of the procs it was constrained against.
+ * i = index of ordering[] entry we want to output this time
+ * j = search index for topoProcs[]
+ * k = temp for scanning constraint list for proc j
+ * last = last non-null index in topoProcs (avoid redundant searches)
+ *--------------------
+ */
+ last = queue_size - 1;
+ for (i = queue_size - 1; i >= 0;)
+ {
+ int c;
+ int nmatches = 0;
+
+ /* Find next candidate to output */
+ while (topoProcs[last] == NULL)
+ last--;
+ for (j = last; j >= 0; j--)
+ {
+ if (topoProcs[j] != NULL && beforeConstraints[j] == 0)
+ break;
+ }
+
+ /* If no available candidate, topological sort fails */
+ if (j < 0)
+ return false;
+
+ /*
+ * Output everything in the lock group. There's no point in
+ * outputting an ordering where members of the same lock group are not
+ * consecutive on the wait queue: if some other waiter is between two
+ * requests that belong to the same group, then either it conflicts
+ * with both of them and is certainly not a solution; or it conflicts
+ * with at most one of them and is thus isomorphic to an ordering
+ * where the group members are consecutive.
+ */
+ proc = topoProcs[j];
+ if (proc->lockGroupLeader != NULL)
+ proc = proc->lockGroupLeader;
+ Assert(proc != NULL);
+ for (c = 0; c <= last; ++c)
+ {
+ if (topoProcs[c] == proc || (topoProcs[c] != NULL &&
+ topoProcs[c]->lockGroupLeader == proc))
+ {
+ ordering[i - nmatches] = topoProcs[c];
+ topoProcs[c] = NULL;
+ ++nmatches;
+ }
+ }
+ Assert(nmatches > 0);
+ i -= nmatches;
+
+ /* Update beforeConstraints counts of its predecessors */
+ for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link)
+ beforeConstraints[constraints[k - 1].pred]--;
+ }
+
+ /* Done */
+ return true;
+}
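
The counting scheme above may be easier to see on plain integers. The following standalone sketch (not part of the PostgreSQL sources; sizes and names are illustrative) applies the same idea: each element carries a count of constraints that force it to precede something, the output is filled from the back with the latest element whose count has dropped to zero, and emitting an element releases the constraints in which it appeared as the "blocker".

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct { int pred; int succ; } Constraint;  /* pred must precede succ */

    static bool
    toposort_sketch(int n, int *items, const Constraint *cons, int ncons, int *out)
    {
        int     before[16] = {0};   /* # of constraints forcing items[j] earlier;
                                     * the sketch assumes n <= 16 */
        int     i, j, k, m;

        for (k = 0; k < ncons; k++)
            for (j = 0; j < n; j++)
                if (items[j] == cons[k].pred)
                    before[j]++;

        /* Fill the output from the back to minimize rearrangement */
        for (i = n - 1; i >= 0; i--)
        {
            for (j = n - 1; j >= 0; j--)
                if (items[j] != -1 && before[j] == 0)
                    break;
            if (j < 0)
                return false;       /* contradictory constraints */
            out[i] = items[j];
            items[j] = -1;          /* mark as emitted */

            /* Emitting a "blocker" releases the constraints on its "waiters" */
            for (k = 0; k < ncons; k++)
                if (cons[k].succ == out[i])
                    for (m = 0; m < n; m++)
                        if (items[m] == cons[k].pred)
                            before[m]--;
        }
        return true;
    }

    int
    main(void)
    {
        int         items[4] = {1, 2, 3, 4};
        Constraint  cons[1] = {{3, 2}};     /* reverse one soft edge: 3 before 2 */
        int         out[4];

        if (toposort_sketch(4, items, cons, 1, out))
            printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 1 3 2 4 */
        return 0;
    }

Only the two elements named in the constraint move; 1 and 4 keep their original positions, which is the "minimize rearrangement" property described in the comment above TopoSort.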
+
+#ifdef DEBUG_DEADLOCK
+static void
+PrintLockQueue(LOCK *lock, const char *info)
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ int i;
+
+ printf("%s lock %p queue ", info, lock);
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ printf(" %d", proc->pid);
+ proc = (PGPROC *) proc->links.next;
+ }
+ printf("\n");
+ fflush(stdout);
+}
+#endif
+
+/*
+ * Report a detected deadlock, with available details.
+ */
+void
+DeadLockReport(void)
+{
+ StringInfoData clientbuf; /* errdetail for client */
+ StringInfoData logbuf; /* errdetail for server log */
+ StringInfoData locktagbuf;
+ int i;
+
+ initStringInfo(&clientbuf);
+ initStringInfo(&logbuf);
+ initStringInfo(&locktagbuf);
+
+ /* Generate the "waits for" lines sent to the client */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+ int nextpid;
+
+ /* The last proc waits for the first one... */
+ if (i < nDeadlockDetails - 1)
+ nextpid = info[1].pid;
+ else
+ nextpid = deadlockDetails[0].pid;
+
+ /* reset locktagbuf to hold next object description */
+ resetStringInfo(&locktagbuf);
+
+ DescribeLockTag(&locktagbuf, &info->locktag);
+
+ if (i > 0)
+ appendStringInfoChar(&clientbuf, '\n');
+
+ appendStringInfo(&clientbuf,
+ _("Process %d waits for %s on %s; blocked by process %d."),
+ info->pid,
+ GetLockmodeName(info->locktag.locktag_lockmethodid,
+ info->lockmode),
+ locktagbuf.data,
+ nextpid);
+ }
+
+ /* Duplicate all the above for the server ... */
+ appendBinaryStringInfo(&logbuf, clientbuf.data, clientbuf.len);
+
+ /* ... and add info about query strings */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+
+ appendStringInfoChar(&logbuf, '\n');
+
+ appendStringInfo(&logbuf,
+ _("Process %d: %s"),
+ info->pid,
+ pgstat_get_backend_current_activity(info->pid, false));
+ }
+
+ pgstat_report_deadlock();
+
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+ errmsg("deadlock detected"),
+ errdetail_internal("%s", clientbuf.data),
+ errdetail_log("%s", logbuf.data),
+ errhint("See server log for query details.")));
+}
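
As a concrete illustration (the pids, xids, and lock modes below are made up), for a simple two-way deadlock the clientbuf assembled above ends up as errdetail text along the lines of:

    Process 8021 waits for ShareLock on transaction 742; blocked by process 8055.
    Process 8055 waits for ShareLock on transaction 740; blocked by process 8021.

The server-log version appends one "Process <pid>: <query>" line per participant, and the client additionally gets the "See server log for query details." hint.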
+
+/*
+ * RememberSimpleDeadLock: set up info for DeadLockReport when ProcSleep
+ * detects a trivial (two-way) deadlock. proc1 wants to block for lockmode
+ * on lock, but proc2 is already waiting and would be blocked by proc1.
+ */
+void
+RememberSimpleDeadLock(PGPROC *proc1,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PGPROC *proc2)
+{
+ DEADLOCK_INFO *info = &deadlockDetails[0];
+
+ info->locktag = lock->tag;
+ info->lockmode = lockmode;
+ info->pid = proc1->pid;
+ info++;
+ info->locktag = proc2->waitLock->tag;
+ info->lockmode = proc2->waitLockMode;
+ info->pid = proc2->pid;
+ nDeadlockDetails = 2;
+}
diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl
new file mode 100644
index 0000000..8a44946
--- /dev/null
+++ b/src/backend/storage/lmgr/generate-lwlocknames.pl
@@ -0,0 +1,71 @@
+#!/usr/bin/perl
+#
+# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt
+# Copyright (c) 2000-2021, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+
+my $lastlockidx = -1;
+my $continue = "\n";
+
+open my $lwlocknames, '<', $ARGV[0] or die;
+
+# Include PID in suffix in case parallel make runs this multiple times.
+my $htmp = "lwlocknames.h.tmp$$";
+my $ctmp = "lwlocknames.c.tmp$$";
+open my $h, '>', $htmp or die "Could not open $htmp: $!";
+open my $c, '>', $ctmp or die "Could not open $ctmp: $!";
+
+my $autogen =
+ "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n";
+print $h $autogen;
+print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n";
+print $c $autogen, "\n";
+
+print $c "const char *const IndividualLWLockNames[] = {";
+
+while (<$lwlocknames>)
+{
+ chomp;
+
+ # Skip comments
+ next if /^#/;
+ next if /^\s*$/;
+
+ die "unable to parse lwlocknames.txt"
+ unless /^(\w+)\s+(\d+)$/;
+
+ (my $lockname, my $lockidx) = ($1, $2);
+
+ my $trimmedlockname = $lockname;
+ $trimmedlockname =~ s/Lock$//;
+ die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname;
+
+ die "lwlocknames.txt not in order" if $lockidx < $lastlockidx;
+ die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx;
+
+ while ($lastlockidx < $lockidx - 1)
+ {
+ ++$lastlockidx;
+ printf $c "%s \"<unassigned:%d>\"", $continue, $lastlockidx;
+ $continue = ",\n";
+ }
+ printf $c "%s \"%s\"", $continue, $trimmedlockname;
+ $lastlockidx = $lockidx;
+ $continue = ",\n";
+
+ print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n";
+}
+
+printf $c "\n};\n";
+print $h "\n";
+printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1;
+
+close $h;
+close $c;
+
+rename($htmp, 'lwlocknames.h') || die "rename: $htmp: $!";
+rename($ctmp, 'lwlocknames.c') || die "rename: $ctmp: $!";
+
+close $lwlocknames;
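
To illustrate what the script emits (taking a representative input line; the authoritative contents are in lwlocknames.txt in this directory), an entry such as

    ShmemIndexLock    1

produces, in lwlocknames.h,

    #define ShmemIndexLock (&MainLWLockArray[1].lock)

and the string "ShmemIndex" (the mandatory trailing "Lock" is trimmed) at index 1 of IndividualLWLockNames[] in lwlocknames.c. Any gap in the numbering is filled with "<unassigned:N>" placeholder names, and NUM_INDIVIDUAL_LWLOCKS is defined as one past the highest index seen.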
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
new file mode 100644
index 0000000..2db0424
--- /dev/null
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -0,0 +1,1196 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.c
+ * POSTGRES lock manager code
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+/*
+ * Per-backend counter for generating speculative insertion tokens.
+ *
+ * This may wrap around, but that's OK as it's only used for the short
+ * duration between inserting a tuple and checking that there are no (unique)
+ * constraint violations. It's theoretically possible that a backend sees a
+ * tuple that was speculatively inserted by another backend, but before it has
+ * started waiting on the token, the other backend completes its insertion,
+ * and then performs 2^32 unrelated insertions. And after all that, the
+ * first backend finally calls SpeculativeInsertionLockAcquire(), with the
+ * intention of waiting for the first insertion to complete, but ends up
+ * waiting for the latest unrelated insertion instead. Even then, nothing
+ * particularly bad happens: in the worst case they deadlock, causing one of
+ * the transactions to abort.
+ */
+static uint32 speculativeInsertionToken = 0;
+
+
+/*
+ * Struct to hold context info for transaction lock waits.
+ *
+ * 'oper' is the operation that needs to wait for the other transaction; 'rel'
+ * and 'ctid' specify the address of the tuple being waited for.
+ */
+typedef struct XactLockTableWaitInfo
+{
+ XLTW_Oper oper;
+ Relation rel;
+ ItemPointer ctid;
+} XactLockTableWaitInfo;
+
+static void XactLockTableWaitErrorCb(void *arg);
+
+/*
+ * RelationInitLockInfo
+ * Initializes the lock information in a relation descriptor.
+ *
+ * relcache.c must call this during creation of any reldesc.
+ */
+void
+RelationInitLockInfo(Relation relation)
+{
+ Assert(RelationIsValid(relation));
+ Assert(OidIsValid(RelationGetRelid(relation)));
+
+ relation->rd_lockInfo.lockRelId.relId = RelationGetRelid(relation);
+
+ if (relation->rd_rel->relisshared)
+ relation->rd_lockInfo.lockRelId.dbId = InvalidOid;
+ else
+ relation->rd_lockInfo.lockRelId.dbId = MyDatabaseId;
+}
+
+/*
+ * SetLocktagRelationOid
+ * Set up a locktag for a relation, given only relation OID
+ */
+static inline void
+SetLocktagRelationOid(LOCKTAG *tag, Oid relid)
+{
+ Oid dbid;
+
+ if (IsSharedRelation(relid))
+ dbid = InvalidOid;
+ else
+ dbid = MyDatabaseId;
+
+ SET_LOCKTAG_RELATION(*tag, dbid, relid);
+}
+
+/*
+ * LockRelationOid
+ *
+ * Lock a relation given only its OID. This should generally be used
+ * before attempting to open the relation's relcache entry.
+ */
+void
+LockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages, so that we
+ * will update or flush any stale relcache entry before we try to use it.
+ * RangeVarGetRelid() specifically relies on us for this. We can skip
+ * this in the not-uncommon case that we already had the same type of lock
+ * being requested, since then no one else could have modified the
+ * relcache entry in an undesirable way. (In the case where our own xact
+ * modifies the rel, the relcache update happens via
+ * CommandCounterIncrement, not here.)
+ *
+ * However, in corner cases where code acts on tables (usually catalogs)
+ * recursively, we might get here while still processing invalidation
+ * messages in some outer execution of this function or a sibling. The
+ * "cleared" status of the lock tells us whether we really are done
+ * absorbing relevant inval messages.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * ConditionalLockRelationOid
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ *
+ * NOTE: we do not currently need conditional versions of all the
+ * LockXXX routines in this file, but they could easily be added if needed.
+ */
+bool
+ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
+ * UnlockRelationId
+ *
+ * Unlock, given a LockRelId. This is preferred over UnlockRelationOid
+ * for speed reasons.
+ */
+void
+UnlockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * UnlockRelationOid
+ *
+ * Unlock, given only a relation Oid. Use UnlockRelationId if you can.
+ */
+void
+UnlockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+void
+LockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * ConditionalLockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+bool
+ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
+ * UnlockRelation
+ *
+ * This is a convenience routine for unlocking a relation without also
+ * closing it.
+ */
+void
+UnlockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * CheckRelationLockedByMe
+ *
+ * Returns true if current transaction holds a lock on 'relation' of mode
+ * 'lockmode'. If 'orstronger' is true, a stronger lockmode is also OK.
+ * ("Stronger" is defined as "numerically higher", which is a bit
+ * semantically dubious but is OK for the purposes we use this for.)
+ */
+bool
+CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ if (LockHeldByMe(&tag, lockmode))
+ return true;
+
+ if (orstronger)
+ {
+ LOCKMODE slockmode;
+
+ for (slockmode = lockmode + 1;
+ slockmode <= MaxLockMode;
+ slockmode++)
+ {
+ if (LockHeldByMe(&tag, slockmode))
+ {
+#ifdef NOT_USED
+ /* Sometimes this might be useful for debugging purposes */
+ elog(WARNING, "lock mode %s substituted for %s on relation %s",
+ GetLockmodeName(tag.locktag_lockmethodid, slockmode),
+ GetLockmodeName(tag.locktag_lockmethodid, lockmode),
+ RelationGetRelationName(relation));
+#endif
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * LockHasWaitersRelation
+ *
+ * This is a function to check whether someone else is waiting for a
+ * lock which we are currently holding.
+ */
+bool
+LockHasWaitersRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockHasWaiters(&tag, lockmode, false);
+}
+
+/*
+ * LockRelationIdForSession
+ *
+ * This routine grabs a session-level lock on the target relation. The
+ * session lock persists across transaction boundaries. It will be removed
+ * when UnlockRelationIdForSession() is called, or if an ereport(ERROR) occurs,
+ * or if the backend exits.
+ *
+ * Note that one should also grab a transaction-level lock on the rel
+ * in any transaction that actually uses the rel, to ensure that the
+ * relcache entry is up to date.
+ */
+void
+LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockRelationIdForSession
+ */
+void
+UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+/*
+ * LockRelationForExtension
+ *
+ * This lock tag is used to interlock addition of pages to relations.
+ * We need such locking because bufmgr/smgr definition of P_NEW is not
+ * race-condition-proof.
+ *
+ * We assume the caller is already holding some type of regular lock on
+ * the relation, so no AcceptInvalidationMessages call is needed here.
+ */
+void
+LockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockRelationForExtension
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * RelationExtensionLockWaiterCount
+ *
+ * Count the number of processes waiting for the given relation extension lock.
+ */
+int
+RelationExtensionLockWaiterCount(Relation relation)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockWaiterCount(&tag);
+}
+
+/*
+ * UnlockRelationForExtension
+ */
+void
+UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * LockPage
+ *
+ * Obtain a page-level lock. This is currently used by some index access
+ * methods to lock individual index pages.
+ */
+void
+LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockPage
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockPage
+ */
+void
+UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockTuple
+ *
+ * Obtain a tuple-level lock. This is used in a less-than-intuitive fashion
+ * because we can't afford to keep a separate lock in shared memory for every
+ * tuple. See heap_lock_tuple before using this!
+ */
+void
+LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockTuple
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockTuple
+ */
+void
+UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * XactLockTableInsert
+ *
+ * Insert a lock showing that the given transaction ID is running ---
+ * this is done when an XID is acquired by a transaction or subtransaction.
+ * The lock can then be used to wait for the transaction to finish.
+ */
+void
+XactLockTableInsert(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+}
+
+/*
+ * XactLockTableDelete
+ *
+ * Delete the lock showing that the given transaction ID is running.
+ * (This is never used for main transaction IDs; those locks are only
+ * released implicitly at transaction end. But we do use it for subtrans IDs.)
+ */
+void
+XactLockTableDelete(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * XactLockTableWait
+ *
+ * Wait for the specified transaction to commit or abort. If an operation
+ * is specified, an error context callback is set up. If 'oper' is passed as
+ * XLTW_None, no error context callback is set up.
+ *
+ * Note that this does the right thing for subtransactions: if we wait on a
+ * subtransaction, we will exit as soon as it aborts or its top parent commits.
+ * It takes some extra work to ensure this, because to save on shared memory
+ * the XID lock of a subtransaction is released when it ends, whether
+ * successfully or unsuccessfully. So we have to check if it's "still running"
+ * and if so wait for its parent.
+ */
+void
+XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
+ XLTW_Oper oper)
+{
+ LOCKTAG tag;
+ XactLockTableWaitInfo info;
+ ErrorContextCallback callback;
+ bool first = true;
+
+ /*
+ * If an operation is specified, set up our verbose error context
+ * callback.
+ */
+ if (oper != XLTW_None)
+ {
+ Assert(RelationIsValid(rel));
+ Assert(ItemPointerIsValid(ctid));
+
+ info.rel = rel;
+ info.ctid = ctid;
+ info.oper = oper;
+
+ callback.callback = XactLockTableWaitErrorCb;
+ callback.arg = &info;
+ callback.previous = error_context_stack;
+ error_context_stack = &callback;
+ }
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /*
+ * If the Xid belonged to a subtransaction, then the lock would have
+ * gone away as soon as it was finished; for correct tuple visibility,
+ * the right action is to wait on its parent transaction to go away.
+ * But instead of going levels up one by one, we can just wait for the
+ * topmost transaction to finish with the same end result, which also
+ * incurs less locktable traffic.
+ *
+ * Some uses of this function don't involve tuple visibility -- such
+ * as when building snapshots for logical decoding. It is possible to
+ * see a transaction in ProcArray before it registers itself in the
+ * locktable. The topmost transaction in that case is the same xid,
+ * so we try again after a short sleep. (Don't sleep the first time
+ * through, to avoid slowing down the normal case.)
+ */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ if (oper != XLTW_None)
+ error_context_stack = callback.previous;
+}
+
+/*
+ * ConditionalXactLockTableWait
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true if the lock was acquired.
+ */
+bool
+ConditionalXactLockTableWait(TransactionId xid)
+{
+ LOCKTAG tag;
+ bool first = true;
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /* See XactLockTableWait about this case */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ return true;
+}
+
+/*
+ * SpeculativeInsertionLockAcquire
+ *
+ * Insert a lock showing that the given transaction ID is inserting a tuple,
+ * but hasn't yet decided whether it's going to keep it. The lock can then be
+ * used to wait for the decision to go ahead with the insertion, or to abort
+ * it.
+ *
+ * The token is used to distinguish multiple insertions by the same
+ * transaction. It is returned to caller.
+ */
+uint32
+SpeculativeInsertionLockAcquire(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ speculativeInsertionToken++;
+
+ /*
+ * Check for wrap-around. Zero means no token is held, so don't use that.
+ */
+ if (speculativeInsertionToken == 0)
+ speculativeInsertionToken = 1;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+
+ return speculativeInsertionToken;
+}
+
+/*
+ * SpeculativeInsertionLockRelease
+ *
+ * Delete the lock showing that the given transaction is speculatively
+ * inserting a tuple.
+ */
+void
+SpeculativeInsertionLockRelease(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * SpeculativeInsertionWait
+ *
+ * Wait for the specified transaction to finish or abort the insertion of a
+ * tuple.
+ */
+void
+SpeculativeInsertionWait(TransactionId xid, uint32 token)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token);
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(token != 0);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+ LockRelease(&tag, ShareLock, false);
+}
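
A rough usage sketch of the three routines above (illustrative fragment only: it presumes the normal backend environment, elides the heap and index work that actually happens between the calls, and tuple_xmin/tuple_token stand in for values read from the conflicting tuple):

    /* backend performing a speculative insertion */
    uint32      token;

    token = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
    /* ... write the speculative tuple, stamping the token into it,
     *     then probe the unique index for conflicts ... */
    SpeculativeInsertionLockRelease(GetCurrentTransactionId());

    /* backend that stumbles on the not-yet-confirmed tuple */
    SpeculativeInsertionWait(tuple_xmin, tuple_token);
    /* returns once the inserter has confirmed or killed the tuple */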
+
+/*
+ * XactLockTableWaitErrorCb
+ * Error context callback for transaction lock waits.
+ */
+static void
+XactLockTableWaitErrorCb(void *arg)
+{
+ XactLockTableWaitInfo *info = (XactLockTableWaitInfo *) arg;
+
+ /*
+ * We would like to print schema name too, but that would require a
+ * syscache lookup.
+ */
+ if (info->oper != XLTW_None &&
+ ItemPointerIsValid(info->ctid) && RelationIsValid(info->rel))
+ {
+ const char *cxt;
+
+ switch (info->oper)
+ {
+ case XLTW_Update:
+ cxt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Delete:
+ cxt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Lock:
+ cxt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_LockUpdated:
+ cxt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\"");
+ break;
+ case XLTW_InsertIndex:
+ cxt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_InsertIndexUnique:
+ cxt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_FetchUpdated:
+ cxt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_RecheckExclusionConstr:
+ cxt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\"");
+ break;
+
+ default:
+ return;
+ }
+
+ errcontext(cxt,
+ ItemPointerGetBlockNumber(info->ctid),
+ ItemPointerGetOffsetNumber(info->ctid),
+ RelationGetRelationName(info->rel));
+ }
+}
+
+/*
+ * WaitForLockersMultiple
+ * Wait until no transaction holds locks that conflict with the given
+ * locktags at the given lockmode.
+ *
+ * To do this, obtain the current list of lockers, and wait on their VXIDs
+ * until they are finished.
+ *
+ * Note we don't try to acquire the locks on the given locktags, only the
+ * VXIDs and XIDs of their lock holders; if somebody grabs a conflicting lock
+ * on the objects after we obtained our initial list of lockers, we will not
+ * wait for them.
+ */
+void
+WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress)
+{
+ List *holders = NIL;
+ ListCell *lc;
+ int total = 0;
+ int done = 0;
+
+ /* Done if no locks to wait for */
+ if (list_length(locktags) == 0)
+ return;
+
+ /* Collect the transactions we need to wait on */
+ foreach(lc, locktags)
+ {
+ LOCKTAG *locktag = lfirst(lc);
+ int count;
+
+ holders = lappend(holders,
+ GetLockConflicts(locktag, lockmode,
+ progress ? &count : NULL));
+ if (progress)
+ total += count;
+ }
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, total);
+
+ /*
+ * Note: GetLockConflicts() never reports our own xid, hence we need not
+ * check for that. Also, prepared xacts are reported and awaited.
+ */
+
+ /* Finally wait for each such transaction to complete */
+ foreach(lc, holders)
+ {
+ VirtualTransactionId *lockholders = lfirst(lc);
+
+ while (VirtualTransactionIdIsValid(*lockholders))
+ {
+ /* If requested, publish who we're going to wait for. */
+ if (progress)
+ {
+ PGPROC *holder = BackendIdGetProc(lockholders->backendId);
+
+ if (holder)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID,
+ holder->pid);
+ }
+ VirtualXactLock(*lockholders, true);
+ lockholders++;
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++done);
+ }
+ }
+ if (progress)
+ {
+ const int index[] = {
+ PROGRESS_WAITFOR_TOTAL,
+ PROGRESS_WAITFOR_DONE,
+ PROGRESS_WAITFOR_CURRENT_PID
+ };
+ const int64 values[] = {
+ 0, 0, 0
+ };
+
+ pgstat_progress_update_multi_param(3, index, values);
+ }
+
+ list_free_deep(holders);
+}
+
+/*
+ * WaitForLockers
+ *
+ * Same as WaitForLockersMultiple, for a single lock tag.
+ */
+void
+WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress)
+{
+ List *l;
+
+ l = list_make1(&heaplocktag);
+ WaitForLockersMultiple(l, lockmode, progress);
+ list_free(l);
+}
+
+
+/*
+ * LockDatabaseObject
+ *
+ * Obtain a lock on a general object of the current database. Don't use
+ * this for shared objects (such as tablespaces). It's unwise to apply it
+ * to relations, also, since a lock taken this way will NOT conflict with
+ * locks taken via LockRelation and friends.
+ */
+void
+LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockDatabaseObject
+ */
+void
+UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObject
+ *
+ * Obtain a lock on a shared-across-databases object.
+ */
+void
+LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockSharedObject
+ */
+void
+UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObjectForSession
+ *
+ * Obtain a session-level lock on a shared-across-databases object.
+ * See LockRelationIdForSession for notes about session-level locks.
+ */
+void
+LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockSharedObjectForSession
+ */
+void
+UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+
+/*
+ * Append a description of a lockable object to buf.
+ *
+ * Ideally we would print names for the numeric values, but that requires
+ * getting locks on system tables, which might cause problems since this is
+ * typically used to report deadlock situations.
+ */
+void
+DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
+{
+ switch ((LockTagType) tag->locktag_type)
+ {
+ case LOCKTAG_RELATION:
+ appendStringInfo(buf,
+ _("relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_RELATION_EXTEND:
+ appendStringInfo(buf,
+ _("extension of relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_PAGE:
+ appendStringInfo(buf,
+ _("page %u of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TUPLE:
+ appendStringInfo(buf,
+ _("tuple (%u,%u) of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field4,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TRANSACTION:
+ appendStringInfo(buf,
+ _("transaction %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_VIRTUALTRANSACTION:
+ appendStringInfo(buf,
+ _("virtual transaction %d/%u"),
+ tag->locktag_field1,
+ tag->locktag_field2);
+ break;
+ case LOCKTAG_SPECULATIVE_TOKEN:
+ appendStringInfo(buf,
+ _("speculative token %u of transaction %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_OBJECT:
+ appendStringInfo(buf,
+ _("object %u of class %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_USERLOCK:
+ /* reserved for old contrib code, now on pgfoundry */
+ appendStringInfo(buf,
+ _("user lock [%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3);
+ break;
+ case LOCKTAG_ADVISORY:
+ appendStringInfo(buf,
+ _("advisory lock [%u,%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3,
+ tag->locktag_field4);
+ break;
+ default:
+ appendStringInfo(buf,
+ _("unrecognized locktag type %d"),
+ (int) tag->locktag_type);
+ break;
+ }
+}
+
+/*
+ * GetLockNameFromTagType
+ *
+ * Given locktag type, return the corresponding lock name.
+ */
+const char *
+GetLockNameFromTagType(uint16 locktag_type)
+{
+ if (locktag_type > LOCKTAG_LAST_TYPE)
+ return "???";
+ return LockTagTypeNames[locktag_type];
+}
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
new file mode 100644
index 0000000..818666f
--- /dev/null
+++ b/src/backend/storage/lmgr/lock.c
@@ -0,0 +1,4738 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.c
+ * POSTGRES primary lock mechanism
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lock.c
+ *
+ * NOTES
+ * A lock table is a shared memory hash table. When
+ * a process tries to acquire a lock of a type that conflicts
+ * with existing locks, it is put to sleep using the routines
+ * in storage/lmgr/proc.c.
+ *
+ * For the most part, this code should be invoked via lmgr.c
+ * or another lock-management module, not directly.
+ *
+ * Interface:
+ *
+ * InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(),
+ * LockAcquire(), LockRelease(), LockReleaseAll(),
+ * LockCheckConflicts(), GrantLock()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/resowner_private.h"
+
+
+/* This configuration variable is used to set the lock table size */
+int max_locks_per_xact; /* set by guc.c */
+
+#define NLOCKENTS() \
+ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+
+/*
+ * Data structures defining the semantics of the standard lock methods.
+ *
+ * The conflict table defines the semantics of the various lock modes.
+ */
+static const LOCKMASK LockConflicts[] = {
+ 0,
+
+ /* AccessShareLock */
+ LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowShareLock */
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowExclusiveLock */
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareUpdateExclusiveLock */
+ LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareRowExclusiveLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ExclusiveLock */
+ LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* AccessExclusiveLock */
+ LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock)
+
+};
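
To show how the table above is meant to be read, here is a small standalone sketch (not part of the PostgreSQL sources; only the first two conflict rows are reproduced): a requested mode conflicts with a held mode when the held mode's bit is set in the requested mode's conflict mask.

    #include <stdio.h>

    #define LOCKBIT_ON(lockmode) (1 << (lockmode))

    enum { NoLock, AccessShareLock, RowShareLock, RowExclusiveLock,
           ShareUpdateExclusiveLock, ShareLock, ShareRowExclusiveLock,
           ExclusiveLock, AccessExclusiveLock };

    static const int conflicts[] = {
        0,                                                            /* NoLock */
        LOCKBIT_ON(AccessExclusiveLock),                              /* AccessShareLock */
        LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),  /* RowShareLock */
        /* ... remaining rows elided; see LockConflicts[] above ... */
    };

    int
    main(void)
    {
        /* Does a RowShareLock request conflict with a held AccessExclusiveLock? */
        printf("%d\n",
               (conflicts[RowShareLock] & LOCKBIT_ON(AccessExclusiveLock)) != 0);  /* 1 */

        /* Does it conflict with a held ShareLock? */
        printf("%d\n",
               (conflicts[RowShareLock] & LOCKBIT_ON(ShareLock)) != 0);            /* 0 */
        return 0;
    }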
+
+/* Names of lock modes, for debug printouts */
+static const char *const lock_mode_names[] =
+{
+ "INVALID",
+ "AccessShareLock",
+ "RowShareLock",
+ "RowExclusiveLock",
+ "ShareUpdateExclusiveLock",
+ "ShareLock",
+ "ShareRowExclusiveLock",
+ "ExclusiveLock",
+ "AccessExclusiveLock"
+};
+
+#ifndef LOCK_DEBUG
+static bool Dummy_trace = false;
+#endif
+
+static const LockMethodData default_lockmethod = {
+ AccessExclusiveLock, /* highest valid lock mode number */
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_locks
+#else
+ &Dummy_trace
+#endif
+};
+
+static const LockMethodData user_lockmethod = {
+ AccessExclusiveLock, /* highest valid lock mode number */
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_userlocks
+#else
+ &Dummy_trace
+#endif
+};
+
+/*
+ * map from lock method id to the lock table data structures
+ */
+static const LockMethod LockMethods[] = {
+ NULL,
+ &default_lockmethod,
+ &user_lockmethod
+};
+
+
+/* Record that's written to 2PC state file when a lock is persisted */
+typedef struct TwoPhaseLockRecord
+{
+ LOCKTAG locktag;
+ LOCKMODE lockmode;
+} TwoPhaseLockRecord;
+
+
+/*
+ * Count of the number of fast path lock slots we believe to be used. This
+ * might be higher than the real number if another backend has transferred
+ * our locks to the primary lock table, but it can never be lower than the
+ * real value, since only we can acquire locks on our own behalf.
+ */
+static int FastPathLocalUseCount = 0;
+
+/*
+ * Flag to indicate if the relation extension lock is held by this backend.
+ * This flag is used to ensure that while holding the relation extension lock
+ * we don't try to acquire a heavyweight lock on any other object. This
+ * restriction implies that the relation extension lock won't ever participate
+ * in the deadlock cycle because we can never wait for any other heavyweight
+ * lock after acquiring this lock.
+ *
+ * Such a restriction is okay for relation extension locks because, unlike
+ * other heavyweight locks, they are not held until transaction end. They
+ * are taken for a short duration to extend a particular relation and then
+ * released.
+ */
+static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/*
+ * Flag to indicate if the page lock is held by this backend. We don't
+ * acquire any other heavyweight lock while holding the page lock except for
+ * relation extension. However, these locks are never taken in reverse order
+ * which implies that page locks will also never participate in the deadlock
+ * cycle.
+ *
+ * Similar to relation extension, page locks are also held for a short
+ * duration, so imposing such a restriction won't hurt.
+ */
+static bool IsPageLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/* Macros for manipulating proc->fpLockBits */
+#define FAST_PATH_BITS_PER_SLOT 3
+#define FAST_PATH_LOCKNUMBER_OFFSET 1
+#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1)
+#define FAST_PATH_GET_BITS(proc, n) \
+ (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK)
+#define FAST_PATH_BIT_POSITION(n, l) \
+ (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \
+ ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n)))
+#define FAST_PATH_SET_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)
+#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
+#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \
+ ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
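+
+/*
+ * Worked example of the bit layout above (illustrative only): with
+ * FAST_PATH_BITS_PER_SLOT = 3 and FAST_PATH_LOCKNUMBER_OFFSET = 1, slot n
+ * occupies bits 3n..3n+2 of fpLockBits, one bit per eligible lock mode
+ * (AccessShareLock, RowShareLock, RowExclusiveLock).  Recording
+ * RowShareLock (mode 2) in a hypothetical slot 5:
+ *
+ *     FAST_PATH_SET_LOCKMODE(MyProc, 5, RowShareLock);
+ *     Assert(FAST_PATH_CHECK_LOCKMODE(MyProc, 5, RowShareLock));
+ *
+ * sets bit (2 - 1) + 3 * 5 = 16, so FAST_PATH_GET_BITS(MyProc, 5) then
+ * returns binary 010.
+ */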
+
+/*
+ * The fast-path lock mechanism is concerned only with relation locks on
+ * unshared relations by backends bound to a database. The fast-path
+ * mechanism exists mostly to accelerate acquisition and release of locks
+ * that rarely conflict. Because ShareUpdateExclusiveLock is
+ * self-conflicting, it can't use the fast-path mechanism; but it also does
+ * not conflict with any of the locks that do, so we can ignore it completely.
+ */
+#define EligibleForRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 == MyDatabaseId && \
+ MyDatabaseId != InvalidOid && \
+ (mode) < ShareUpdateExclusiveLock)
+#define ConflictsWithRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 != InvalidOid && \
+ (mode) > ShareUpdateExclusiveLock)
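+
+/*
+ * For example (assuming a LOCKTAG_RELATION tag on an unshared relation in
+ * the current database), the macros above classify requests as follows:
+ *
+ *     EligibleForRelationFastPath(&tag, AccessShareLock)          -> true
+ *     EligibleForRelationFastPath(&tag, ShareUpdateExclusiveLock) -> false
+ *     ConflictsWithRelationFastPath(&tag, ShareLock)              -> true
+ *     ConflictsWithRelationFastPath(&tag, AccessExclusiveLock)    -> true
+ *
+ * "tag" is a hypothetical LOCKTAG; the real tests appear in
+ * LockAcquireExtended() and LockRelease() below.
+ */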
+
+static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathTransferRelationLocks(LockMethod lockMethodTable,
+ const LOCKTAG *locktag, uint32 hashcode);
+static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock);
+
+/*
+ * To make the fast-path lock mechanism work, we must have some way of
+ * preventing the use of the fast-path when a conflicting lock might be present.
+ * We partition the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS,
+ * and maintain an integer count of the number of "strong" lockers
+ * in each partition. When any "strong" lockers are present (which is
+ * hopefully not very often), the fast-path mechanism can't be used, and we
+ * must fall back to the slower method of pushing matching locks directly
+ * into the main lock tables.
+ *
+ * The deadlock detector does not know anything about the fast path mechanism,
+ * so any locks that might be involved in a deadlock must be transferred from
+ * the fast-path queues to the main lock table.
+ */
+
+#define FAST_PATH_STRONG_LOCK_HASH_BITS 10
+#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \
+ (1 << FAST_PATH_STRONG_LOCK_HASH_BITS)
+#define FastPathStrongLockHashPartition(hashcode) \
+ ((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS)
+
+typedef struct
+{
+ slock_t mutex;
+ uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS];
+} FastPathStrongRelationLockData;
+
+static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks;
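+
+/*
+ * Minimal sketch of the protocol described above (illustrative only): a
+ * "strong" locker bumps the count for its locktag's partition before
+ * transferring any fast-path locks, and "weak" lockers test that count
+ * before trying the fast path:
+ *
+ *     uint32 f = FastPathStrongLockHashPartition(hashcode);
+ *
+ *     SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ *     FastPathStrongRelationLocks->count[f]++;
+ *     SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ *
+ * See BeginStrongLockAcquire() and LockAcquireExtended() for the real
+ * bookkeeping, including the matching decrements.
+ */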
+
+
+/*
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
+ */
+static HTAB *LockMethodLockHash;
+static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodLocalHash;
+
+
+/* private state for error cleanup */
+static LOCALLOCK *StrongLockInProgress;
+static LOCALLOCK *awaitedLock;
+static ResourceOwner awaitedOwner;
+
+
+#ifdef LOCK_DEBUG
+
+/*------
+ * The following configuration options are available for lock debugging:
+ *
+ * TRACE_LOCKS -- give a bunch of output about what's going on in this file
+ * TRACE_USERLOCKS -- same but for user locks
+ * TRACE_LOCK_OIDMIN -- do not trace locks for tables below this oid
+ * (use to avoid output on system tables)
+ * TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally
+ * DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;)
+ *
+ * Furthermore, in storage/lmgr/lwlock.c:
+ * TRACE_LWLOCKS -- trace lightweight locks (pretty useless)
+ *
+ * Define LOCK_DEBUG at compile time to get all these enabled.
+ * --------
+ */
+
+int Trace_lock_oidmin = FirstNormalObjectId;
+bool Trace_locks = false;
+bool Trace_userlocks = false;
+int Trace_lock_table = 0;
+bool Debug_deadlocks = false;
+
+
+inline static bool
+LOCK_DEBUG_ENABLED(const LOCKTAG *tag)
+{
+ return
+ (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) &&
+ ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin))
+ || (Trace_lock_table &&
+ (tag->locktag_field2 == Trace_lock_table));
+}
+
+
+inline static void
+LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type)
+{
+ if (LOCK_DEBUG_ENABLED(&lock->tag))
+ elog(LOG,
+ "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) "
+ "req(%d,%d,%d,%d,%d,%d,%d)=%d "
+ "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)",
+ where, lock,
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3, lock->tag.locktag_field4,
+ lock->tag.locktag_type, lock->tag.locktag_lockmethodid,
+ lock->grantMask,
+ lock->requested[1], lock->requested[2], lock->requested[3],
+ lock->requested[4], lock->requested[5], lock->requested[6],
+ lock->requested[7], lock->nRequested,
+ lock->granted[1], lock->granted[2], lock->granted[3],
+ lock->granted[4], lock->granted[5], lock->granted[6],
+ lock->granted[7], lock->nGranted,
+ lock->waitProcs.size,
+ LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]);
+}
+
+
+inline static void
+PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
+{
+ if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag))
+ elog(LOG,
+ "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)",
+ where, proclockP, proclockP->tag.myLock,
+ PROCLOCK_LOCKMETHOD(*(proclockP)),
+ proclockP->tag.myProc, (int) proclockP->holdMask);
+}
+#else /* not LOCK_DEBUG */
+
+#define LOCK_PRINT(where, lock, type) ((void) 0)
+#define PROCLOCK_PRINT(where, proclockP) ((void) 0)
+#endif /* not LOCK_DEBUG */
+
+
+static uint32 proclock_hash(const void *key, Size keysize);
+static void RemoveLocalLock(LOCALLOCK *locallock);
+static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode);
+static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
+static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode);
+static void FinishStrongLockAcquire(void);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
+static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock);
+static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent);
+static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded);
+static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count);
+static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc,
+ BlockedProcsData *data);
+
+
+/*
+ * InitLocks -- Initialize the lock manager's data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here, as well as a locallock hash table that will remain
+ * unused and empty in the postmaster itself. Backends inherit the pointers
+ * to the shared tables via fork(), and also inherit an image of the locallock
+ * hash table, which they proceed to use. In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables and to create its locallock hash table.
+ */
+void
+InitLocks(void)
+{
+ HASHCTL info;
+ long init_table_size,
+ max_table_size;
+ bool found;
+
+ /*
+ * Compute init/max size to request for lock hashtables. Note these
+ * calculations must agree with LockShmemSize!
+ */
+ max_table_size = NLOCKENTS();
+ init_table_size = max_table_size / 2;
+
+ /*
+ * Allocate hash table for LOCK structs. This stores per-locked-object
+ * information.
+ */
+ info.keysize = sizeof(LOCKTAG);
+ info.entrysize = sizeof(LOCK);
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodLockHash = ShmemInitHash("LOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+
+ /* Assume an average of 2 holders per lock */
+ max_table_size *= 2;
+ init_table_size *= 2;
+
+ /*
+ * Allocate hash table for PROCLOCK structs. This stores
+ * per-lock-per-holder information.
+ */
+ info.keysize = sizeof(PROCLOCKTAG);
+ info.entrysize = sizeof(PROCLOCK);
+ info.hash = proclock_hash;
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ /*
+ * Allocate fast-path structures.
+ */
+ FastPathStrongRelationLocks =
+ ShmemInitStruct("Fast Path Strong Relation Lock Data",
+ sizeof(FastPathStrongRelationLockData), &found);
+ if (!found)
+ SpinLockInit(&FastPathStrongRelationLocks->mutex);
+
+ /*
+ * Allocate non-shared hash table for LOCALLOCK structs. This stores lock
+ * counts and resource owner information.
+ *
+ * The non-shared table could already exist in this process (this occurs
+ * when the postmaster is recreating shared memory after a backend crash).
+ * If so, delete and recreate it. (We could simply leave it, since it
+ * ought to be empty in the postmaster, but for safety let's zap it.)
+ */
+ if (LockMethodLocalHash)
+ hash_destroy(LockMethodLocalHash);
+
+ info.keysize = sizeof(LOCALLOCKTAG);
+ info.entrysize = sizeof(LOCALLOCK);
+
+ LockMethodLocalHash = hash_create("LOCALLOCK hash",
+ 16,
+ &info,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+
+/*
+ * Fetch the lock method table associated with a given lock
+ */
+LockMethod
+GetLocksMethodTable(const LOCK *lock)
+{
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*lock);
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+/*
+ * Fetch the lock method table associated with a given locktag
+ */
+LockMethod
+GetLockTagsMethodTable(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid;
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+
+/*
+ * Compute the hash code associated with a LOCKTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+uint32
+LockTagHashCode(const LOCKTAG *locktag)
+{
+ return get_hash_value(LockMethodLockHash, (const void *) locktag);
+}
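+
+/*
+ * Typical usage pattern (an illustrative sketch): compute the hash code
+ * once, then reuse it both to find the right partition lock and to probe
+ * the shared hash table without rehashing:
+ *
+ *     uint32  hashcode = LockTagHashCode(locktag);
+ *     LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ *
+ * followed by hash_search_with_hash_value(LockMethodLockHash, locktag,
+ * hashcode, ...), as done in LockAcquireExtended() and LockRelease().
+ */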
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * LOCK and PROCLOCK hash tables, we have to make sure that PROCLOCKs
+ * fall into the same partition number as their associated LOCKs.
+ * dynahash.c expects the partition number to be the low-order bits of
+ * the hash code, and therefore a PROCLOCKTAG's hash code must have the
+ * same low-order bits as the associated LOCKTAG's hash code. We achieve
+ * this with this specialized hash function.
+ */
+static uint32
+proclock_hash(const void *key, Size keysize)
+{
+ const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key;
+ uint32 lockhash;
+ Datum procptr;
+
+ Assert(keysize == sizeof(PROCLOCKTAG));
+
+ /* Look into the associated LOCK object, and compute its hash code */
+ lockhash = LockTagHashCode(&proclocktag->myLock->tag);
+
+ /*
+ * To make the hash code also depend on the PGPROC, we xor the proc
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG, given the hashcode
+ * for its underlying LOCK.
+ *
+ * We use this just to avoid redundant calls of LockTagHashCode().
+ */
+static inline uint32
+ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode)
+{
+ uint32 lockhash = hashcode;
+ Datum procptr;
+
+ /*
+ * This must match proclock_hash()!
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
+
+/*
+ * Given two lock modes, return whether they would conflict.
+ */
+bool
+DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+
+ if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2))
+ return true;
+
+ return false;
+}
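+
+/*
+ * For example, per the conflict table above, RowExclusiveLock (ordinary
+ * INSERT/UPDATE/DELETE) conflicts with ShareLock (CREATE INDEX) but not
+ * with RowShareLock (SELECT FOR UPDATE):
+ *
+ *     DoLockModesConflict(RowExclusiveLock, ShareLock)     -> true
+ *     DoLockModesConflict(RowExclusiveLock, RowShareLock)  -> false
+ */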
+
+/*
+ * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode'
+ * by the current transaction
+ */
+bool
+LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
+{
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+
+ /*
+ * See if there is a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ return (locallock && locallock->nLocks > 0);
+}
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ * evaluate assertions based on all locks held.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+ return LockMethodLocalHash;
+}
+#endif
+
+/*
+ * LockHasWaiters -- look up 'locktag' and check if releasing this
+ * lock would wake up other processes waiting for it.
+ */
+bool
+LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool hasWaiters = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockHasWaiters: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * let the caller print its own error message, too. Do not ereport(ERROR).
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Check the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ /*
+ * We don't need to re-find the lock or proclock, since we kept their
+ * addresses in the locallock table, and they couldn't have been removed
+ * while we were holding a lock on them.
+ */
+ lock = locallock->lock;
+ LOCK_PRINT("LockHasWaiters: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockHasWaiters: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the checking.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0)
+ hasWaiters = true;
+
+ LWLockRelease(partitionLock);
+
+ return hasWaiters;
+}
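+
+/*
+ * Example use (a sketch): VACUUM's truncation phase periodically asks, via
+ * the lmgr.c wrapper, whether anyone is queued behind its
+ * AccessExclusiveLock, and abandons the truncation early if so:
+ *
+ *     if (LockHasWaitersRelation(rel, AccessExclusiveLock))
+ *         break;
+ *
+ * where "rel" is the relation being vacuumed (hypothetical variable name).
+ */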
+
+/*
+ * LockAcquire -- Check for lock conflicts, sleep if conflict found,
+ * set lock if/when no conflicts.
+ *
+ * Inputs:
+ * locktag: unique identifier for the lockable object
+ * lockmode: lock mode to acquire
+ * sessionLock: if true, acquire lock for session not current transaction
+ * dontWait: if true, don't wait to acquire lock
+ *
+ * Returns one of:
+ * LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true
+ * LOCKACQUIRE_OK lock successfully acquired
+ * LOCKACQUIRE_ALREADY_HELD incremented count for lock already held
+ * LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear
+ *
+ * In the normal case where dontWait=false and the caller doesn't need to
+ * distinguish a freshly acquired lock from one already taken earlier in
+ * this same transaction, there is no need to examine the return value.
+ *
+ * Side Effects: The lock is acquired and recorded in lock tables.
+ *
+ * NOTE: if we wait for the lock, there is no way to abort the wait
+ * short of aborting the transaction.
+ */
+LockAcquireResult
+LockAcquire(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait)
+{
+ return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
+ true, NULL);
+}
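+
+/*
+ * Illustrative caller (a sketch; most backend code goes through the
+ * convenience wrappers in lmgr.c, such as LockRelationOid(), rather than
+ * calling LockAcquire() directly):
+ *
+ *     LOCKTAG tag;
+ *
+ *     SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ *     (void) LockAcquire(&tag, AccessShareLock, false, false);
+ *
+ * "relid" is a hypothetical relation OID supplied by the caller.
+ */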
+
+/*
+ * LockAcquireExtended - allows us to specify additional options
+ *
+ * reportMemoryError specifies whether a lock request that fills the lock
+ * table should generate an ERROR or not. Passing "false" allows the caller
+ * to attempt to recover from lock-table-full situations, perhaps by forcibly
+ * canceling other lock holders and then retrying. Note, however, that the
+ * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use
+ * in combination with dontWait = true, as the cause of failure couldn't be
+ * distinguished.
+ *
+ * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
+ * table entry if a lock is successfully acquired, or NULL if not.
+ */
+LockAcquireResult
+LockAcquireExtended(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait,
+ bool reportMemoryError,
+ LOCALLOCK **locallockp)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ bool found;
+ ResourceOwner owner;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ bool found_conflict;
+ bool log_lock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ if (RecoveryInProgress() && !InRecovery &&
+ (locktag->locktag_type == LOCKTAG_OBJECT ||
+ locktag->locktag_type == LOCKTAG_RELATION) &&
+ lockmode > RowExclusiveLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot acquire lock mode %s on database objects while recovery is in progress",
+ lockMethodTable->lockModeNames[lockmode]),
+ errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockAcquire: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /*
+ * Find or create a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_ENTER, &found);
+
+ /*
+ * if it's a new locallock object, initialize it
+ */
+ if (!found)
+ {
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ locallock->hashcode = LockTagHashCode(&(localtag.lock));
+ locallock->nLocks = 0;
+ locallock->holdsStrongLockCount = false;
+ locallock->lockCleared = false;
+ locallock->numLockOwners = 0;
+ locallock->maxLockOwners = 8;
+ locallock->lockOwners = NULL; /* in case next line fails */
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ MemoryContextAlloc(TopMemoryContext,
+ locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
+ }
+ else
+ {
+ /* Make sure there will be room to remember the lock */
+ if (locallock->numLockOwners >= locallock->maxLockOwners)
+ {
+ int newsize = locallock->maxLockOwners * 2;
+
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ repalloc(locallock->lockOwners,
+ newsize * sizeof(LOCALLOCKOWNER));
+ locallock->maxLockOwners = newsize;
+ }
+ }
+ hashcode = locallock->hashcode;
+
+ if (locallockp)
+ *locallockp = locallock;
+
+ /*
+ * If we already hold the lock, we can just increase the count locally.
+ *
+ * If lockCleared is already set, caller need not worry about absorbing
+ * sinval messages related to the lock's object.
+ */
+ if (locallock->nLocks > 0)
+ {
+ GrantLockLocal(locallock, owner);
+ if (locallock->lockCleared)
+ return LOCKACQUIRE_ALREADY_CLEAR;
+ else
+ return LOCKACQUIRE_ALREADY_HELD;
+ }
+
+ /*
+ * We don't acquire any other heavyweight lock while holding the relation
+ * extension lock. We do allow acquiring the same relation extension lock
+ * more than once, but that case won't reach here.
+ */
+ Assert(!IsRelationExtensionLockHeld);
+
+ /*
+ * We don't acquire any other heavyweight lock while holding the page lock
+ * except for relation extension.
+ */
+ Assert(!IsPageLockHeld ||
+ (locktag->locktag_type == LOCKTAG_RELATION_EXTEND));
+
+ /*
+ * Prepare to emit a WAL record if acquisition of this lock needs to be
+ * replayed in a standby server.
+ *
+ * Here we prepare to log; after the lock is acquired we'll issue the log
+ * record. This arrangement simplifies error recovery in case the
+ * preparation step fails.
+ *
+ * Only AccessExclusiveLocks can conflict with lock types that read-only
+ * transactions can acquire in a standby server. Make sure this definition
+ * matches the one in GetRunningTransactionLocks().
+ */
+ if (lockmode >= AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION &&
+ !RecoveryInProgress() &&
+ XLogStandbyInfoActive())
+ {
+ LogAccessExclusiveLockPrepare();
+ log_lock = true;
+ }
+
+ /*
+ * Attempt to take lock via fast path, if eligible. But if we remember
+ * having filled up the fast path array, we don't attempt to make any
+ * further use of it until we release some locks. It's possible that some
+ * other backend has transferred some of those locks to the shared hash
+ * table, leaving space free, but it's not worth acquiring the LWLock just
+ * to check. It's also possible that we're acquiring a second or third
+ * lock type on a relation we have already locked using the fast-path, but
+ * for now we don't worry about that case either.
+ */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+ bool acquired;
+
+ /*
+ * LWLockAcquire acts as a memory sequencing point, so it's safe to
+ * assume that any strong locker whose increment to
+ * FastPathStrongRelationLocks->count becomes visible after we test
+ * it has yet to begin to transfer fast-path locks.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ if (FastPathStrongRelationLocks->count[fasthashcode] != 0)
+ acquired = false;
+ else
+ acquired = FastPathGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (acquired)
+ {
+ /*
+ * The locallock might contain stale pointers to some old shared
+ * objects; we MUST reset these to null before considering the
+ * lock to be acquired via fast-path.
+ */
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ GrantLockLocal(locallock, owner);
+ return LOCKACQUIRE_OK;
+ }
+ }
+
+ /*
+ * If this lock could potentially have been taken via the fast-path by
+ * some other backend, we must (temporarily) disable further use of the
+ * fast-path for this lock tag, and migrate any locks already taken via
+ * this method to the main lock table.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ BeginStrongLockAcquire(locallock, fasthashcode);
+ if (!FastPathTransferRelationLocks(lockMethodTable, locktag,
+ hashcode))
+ {
+ AbortStrongLockAcquire();
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ }
+
+ /*
+ * We didn't find the lock in our LOCALLOCK table, and we didn't manage to
+ * take it via the fast-path, either, so we've got to mess with the shared
+ * lock table.
+ */
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create lock and proclock entries with this tag
+ *
+ * Note: if the locallock object already existed, it might have a pointer
+ * to the lock already ... but we should not assume that that pointer is
+ * valid, since a lock object with zero hold and request counts can go
+ * away anytime. So we have to use SetupLockInTable() to recompute the
+ * lock and proclock pointers, even if they're already set.
+ */
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ AbortStrongLockAcquire();
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ locallock->proclock = proclock;
+ lock = proclock->tag.myLock;
+ locallock->lock = lock;
+
+ /*
+ * If the requested lock conflicts with locks requested by waiters, we must
+ * join the wait queue. Otherwise, check for conflict with already-held
+ * locks. (That check comes last because it is the most complex.)
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ found_conflict = true;
+ else
+ found_conflict = LockCheckConflicts(lockMethodTable, lockmode,
+ lock, proclock);
+
+ if (!found_conflict)
+ {
+ /* No conflict with held or previously requested locks */
+ GrantLock(lock, proclock, lockmode);
+ GrantLockLocal(locallock, owner);
+ }
+ else
+ {
+ /*
+ * We can't acquire the lock immediately. If caller specified no
+ * blocking, remove useless table entries and return
+ * LOCKACQUIRE_NOT_AVAIL without waiting.
+ */
+ if (dontWait)
+ {
+ AbortStrongLockAcquire();
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ SHMQueueDelete(&proclock->lockLink);
+ SHMQueueDelete(&proclock->procLink);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+ else
+ PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock);
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+
+ /*
+ * Set bitmask of locks this process already holds on this object.
+ */
+ MyProc->heldLocks = proclock->holdMask;
+
+ /*
+ * Sleep till someone wakes me up.
+ */
+
+ TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ WaitOnLock(locallock, owner);
+
+ TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ /*
+ * NOTE: do not do any material change of state between here and
+ * return. All required changes in locktable state must have been
+ * done when the lock was granted to us --- see notes in WaitOnLock.
+ */
+
+ /*
+ * Check the proclock entry status, in case something in the ipc
+ * communication doesn't work correctly.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ AbortStrongLockAcquire();
+ PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
+ LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
+ /* Should we retry ? */
+ LWLockRelease(partitionLock);
+ elog(ERROR, "LockAcquire failed");
+ }
+ PROCLOCK_PRINT("LockAcquire: granted", proclock);
+ LOCK_PRINT("LockAcquire: granted", lock, lockmode);
+ }
+
+ /*
+ * Lock state is fully up-to-date now; if we error out after this, no
+ * special error cleanup is required.
+ */
+ FinishStrongLockAcquire();
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Emit a WAL record if acquisition of this lock needs to be replayed in a
+ * standby server.
+ */
+ if (log_lock)
+ {
+ /*
+ * Decode the locktag back to the original values, to avoid sending
+ * lots of empty bytes with every message. See lock.h to check how a
+ * locktag is defined for LOCKTAG_RELATION
+ */
+ LogAccessExclusiveLock(locktag->locktag_field1,
+ locktag->locktag_field2);
+ }
+
+ return LOCKACQUIRE_OK;
+}
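+
+/*
+ * Sketch of conditional acquisition using dontWait (illustrative only;
+ * compare ConditionalLockRelationOid() in lmgr.c):
+ *
+ *     if (LockAcquire(&tag, ExclusiveLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+ *         return false;
+ *
+ * where the caller simply declines to wait; "tag" is a hypothetical,
+ * already-initialized LOCKTAG.
+ */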
+
+/*
+ * Find or create LOCK and PROCLOCK objects as needed for a new lock
+ * request.
+ *
+ * Returns the PROCLOCK object, or NULL if we failed to create the objects
+ * for lack of shared memory.
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static PROCLOCK *
+SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+ bool found;
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ return NULL;
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ SHMQueueInit(&(lock->procLocks));
+ ProcQueueInit(&(lock->waitProcs));
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("LockAcquire: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("LockAcquire: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ return NULL;
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ uint32 partition = LockHashPartition(hashcode);
+
+ /*
+ * It might seem unsafe to access proclock->groupLeader without a
+ * lock, but it's not really. Either we are initializing a proclock
+ * on our own behalf, in which case our group leader isn't changing
+ * because the group leader for a process can only ever be changed by
+ * the process itself; or else we are transferring a fast-path lock to
+ * the main lock table, in which case that process can't change its
+ * lock group leader without first releasing all of its locks (and in
+ * particular the one we are currently transferring).
+ */
+ proclock->groupLeader = proc->lockGroupLeader != NULL ?
+ proc->lockGroupLeader : proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
+ SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+ &proclock->procLink);
+ PROCLOCK_PRINT("LockAcquire: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("LockAcquire: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+#ifdef CHECK_DEADLOCK_RISK
+
+ /*
+ * Issue warning if we already hold a lower-level lock on this object
+ * and do not hold a lock of the requested level or higher. This
+ * indicates a deadlock-prone coding practice (eg, we'd have a
+ * deadlock if another backend were following the same code path at
+ * about the same time).
+ *
+ * This is not enabled by default, because it may generate log entries
+ * about user-level coding practices that are in fact safe in context.
+ * It can be enabled to help find system-level problems.
+ *
+ * XXX Doing numeric comparison on the lockmodes is a hack; it'd be
+ * better to use a table. For now, though, this works.
+ */
+ {
+ int i;
+
+ for (i = lockMethodTable->numLockModes; i > 0; i--)
+ {
+ if (proclock->holdMask & LOCKBIT_ON(i))
+ {
+ if (i >= (int) lockmode)
+ break; /* safe: we have a lock >= req level */
+ elog(LOG, "deadlock risk: raising lock level"
+ " from %s to %s on object %u/%u/%u",
+ lockMethodTable->lockModeNames[i],
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+ break;
+ }
+ }
+ }
+#endif /* CHECK_DEADLOCK_RISK */
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ * The other counts don't increment till we get the lock.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock; else locallock table is
+ * broken.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ return proclock;
+}
+
+/*
+ * Check and set/reset the flag that we hold the relation extension/page lock.
+ *
+ * It is the caller's responsibility to call this function after
+ * acquiring/releasing the relation extension/page lock.
+ *
+ * Pass acquired as true if lock is acquired, false otherwise.
+ */
+static inline void
+CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired)
+{
+#ifdef USE_ASSERT_CHECKING
+ if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_RELATION_EXTEND)
+ IsRelationExtensionLockHeld = acquired;
+ else if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_PAGE)
+ IsPageLockHeld = acquired;
+
+#endif
+}
+
+/*
+ * Subroutine to free a locallock entry
+ */
+static void
+RemoveLocalLock(LOCALLOCK *locallock)
+{
+ int i;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (locallock->lockOwners[i].owner != NULL)
+ ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock);
+ }
+ locallock->numLockOwners = 0;
+ if (locallock->lockOwners != NULL)
+ pfree(locallock->lockOwners);
+ locallock->lockOwners = NULL;
+
+ if (locallock->holdsStrongLockCount)
+ {
+ uint32 fasthashcode;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ if (!hash_search(LockMethodLocalHash,
+ (void *) &(locallock->tag),
+ HASH_REMOVE, NULL))
+ elog(WARNING, "locallock table corrupted");
+
+ /*
+ * Indicate that the lock is released for certain types of locks
+ */
+ CheckAndSetLockHeld(locallock, false);
+}
+
+/*
+ * LockCheckConflicts -- test whether requested lock conflicts
+ * with those already granted
+ *
+ * Returns true if conflict, false if no conflict.
+ *
+ * NOTES:
+ * Here's what makes this complicated: one process's locks don't
+ * conflict with one another, no matter what purpose they are held for
+ * (eg, session and transaction locks do not conflict). Nor do the locks
+ * of one process in a lock group conflict with those of another process in
+ * the same group. So, we must subtract off these locks when determining
+ * whether the requested new lock conflicts with those already held.
+ */
+bool
+LockCheckConflicts(LockMethod lockMethodTable,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PROCLOCK *proclock)
+{
+ int numLockModes = lockMethodTable->numLockModes;
+ LOCKMASK myLocks;
+ int conflictMask = lockMethodTable->conflictTab[lockmode];
+ int conflictsRemaining[MAX_LOCKMODES];
+ int totalConflictsRemaining = 0;
+ int i;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *otherproclock;
+
+ /*
+ * first check for global conflicts: If no locks conflict with my request,
+ * then I get the lock.
+ *
+ * Checking for conflict: lock->grantMask represents the types of
+ * currently held locks. conflictTable[lockmode] has a bit set for each
+ * type of lock that conflicts with request. Bitwise compare tells if
+ * there is a conflict.
+ */
+ if (!(conflictMask & lock->grantMask))
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock);
+ return false;
+ }
+
+ /*
+ * Rats. Something conflicts. But it could still be my own lock, or a
+ * lock held by another member of my locking group. First, figure out how
+ * many conflicts remain after subtracting out any locks I hold myself.
+ */
+ myLocks = proclock->holdMask;
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((conflictMask & LOCKBIT_ON(i)) == 0)
+ {
+ conflictsRemaining[i] = 0;
+ continue;
+ }
+ conflictsRemaining[i] = lock->granted[i];
+ if (myLocks & LOCKBIT_ON(i))
+ --conflictsRemaining[i];
+ totalConflictsRemaining += conflictsRemaining[i];
+ }
+
+ /* If no conflicts remain, we get the lock. */
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock);
+ return false;
+ }
+
+ /* If no group locking, it's definitely a conflict. */
+ if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL)
+ {
+ Assert(proclock->tag.myProc == MyProc);
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * Relation extension and page locks conflict even between members of the
+ * same lock group.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND ||
+ (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE))
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * Locks held in conflicting modes by members of our own lock group are
+ * not real conflicts; we can subtract those out and see if we still have
+ * a conflict. This is O(N) in the number of processes holding or
+ * awaiting locks on this object. We could improve that by making the
+ * shared memory state more complex (and larger) but it doesn't seem worth
+ * it.
+ */
+ procLocks = &(lock->procLocks);
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink));
+ while (otherproclock != NULL)
+ {
+ if (proclock != otherproclock &&
+ proclock->groupLeader == otherproclock->groupLeader &&
+ (otherproclock->holdMask & conflictMask) != 0)
+ {
+ int intersectMask = otherproclock->holdMask & conflictMask;
+
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((intersectMask & LOCKBIT_ON(i)) != 0)
+ {
+ if (conflictsRemaining[i] <= 0)
+ elog(PANIC, "proclocks held do not match lock");
+ conflictsRemaining[i]--;
+ totalConflictsRemaining--;
+ }
+ }
+
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (group)",
+ proclock);
+ return false;
+ }
+ }
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &otherproclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /* Nope, it's a real conflict. */
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock);
+ return true;
+}
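+
+/*
+ * Worked example of the group-locking rule above (illustrative): suppose a
+ * parallel worker requests ExclusiveLock on an object while its leader
+ * already holds ExclusiveLock and an unrelated backend holds
+ * AccessShareLock.  The grantMask test reports a conflict, but subtracting
+ * the leader's lock (same groupLeader) leaves only AccessShareLock
+ * outstanding, which ExclusiveLock does not conflict with, so the request
+ * is granted.  Had the lock been a relation extension or page lock, the
+ * earlier special case would have reported a conflict instead.
+ */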
+
+/*
+ * GrantLock -- update the lock and proclock data structures to show
+ * the lock request has been granted.
+ *
+ * NOTE: if proc was blocked, it also needs to be removed from the wait list
+ * and have its waitLock/waitProcLock fields cleared. That's not done here.
+ *
+ * NOTE: the lock grant also has to be recorded in the associated LOCALLOCK
+ * table entry; but since we may be awaking some other process, we can't do
+ * that here; it's done by GrantLockLocal, instead.
+ */
+void
+GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode)
+{
+ lock->nGranted++;
+ lock->granted[lockmode]++;
+ lock->grantMask |= LOCKBIT_ON(lockmode);
+ if (lock->granted[lockmode] == lock->requested[lockmode])
+ lock->waitMask &= LOCKBIT_OFF(lockmode);
+ proclock->holdMask |= LOCKBIT_ON(lockmode);
+ LOCK_PRINT("GrantLock", lock, lockmode);
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+}
+
+/*
+ * UnGrantLock -- opposite of GrantLock.
+ *
+ * Updates the lock and proclock data structures to show that the lock
+ * is no longer held nor requested by the current holder.
+ *
+ * Returns true if there were any waiters waiting on the lock that
+ * should now be woken up with ProcLockWakeup.
+ */
+static bool
+UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable)
+{
+ bool wakeupNeeded = false;
+
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+
+ /*
+ * fix the general lock stats
+ */
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ lock->nGranted--;
+ lock->granted[lockmode]--;
+
+ if (lock->granted[lockmode] == 0)
+ {
+ /* change the conflict mask. No more of this lock type. */
+ lock->grantMask &= LOCKBIT_OFF(lockmode);
+ }
+
+ LOCK_PRINT("UnGrantLock: updated", lock, lockmode);
+
+ /*
+ * We need only run ProcLockWakeup if the released lock conflicts with at
+ * least one of the lock types requested by waiter(s). Otherwise whatever
+ * conflict made them wait must still exist. NOTE: before MVCC, we could
+ * skip wakeup if lock->granted[lockmode] was still positive. But that's
+ * not true anymore, because the remaining granted locks might belong to
+ * some waiter, who could now be awakened because he doesn't conflict with
+ * his own locks.
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ wakeupNeeded = true;
+
+ /*
+ * Now fix the per-proclock state.
+ */
+ proclock->holdMask &= LOCKBIT_OFF(lockmode);
+ PROCLOCK_PRINT("UnGrantLock: updated", proclock);
+
+ return wakeupNeeded;
+}
+
+/*
+ * CleanUpLock -- clean up after releasing a lock. We garbage-collect the
+ * proclock and lock objects if possible, and call ProcLockWakeup if there
+ * are remaining requests and the caller says it's OK. (Normally, this
+ * should be called after UnGrantLock, and wakeupNeeded is the result from
+ * UnGrantLock.)
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static void
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded)
+{
+ /*
+ * If this was my last hold on this lock, delete my entry in the proclock
+ * table.
+ */
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
+ SHMQueueDelete(&proclock->lockLink);
+ SHMQueueDelete(&proclock->procLink);
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+
+ if (lock->nRequested == 0)
+ {
+ /*
+ * The caller just released the last lock, so garbage-collect the lock
+ * object.
+ */
+ LOCK_PRINT("CleanUpLock: deleting", lock, 0);
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ else if (wakeupNeeded)
+ {
+ /* There are waiters on this lock, so wake them up. */
+ ProcLockWakeup(lockMethodTable, lock);
+ }
+}
+
+/*
+ * GrantLockLocal -- update the locallock data structures to show
+ * the lock request has been granted.
+ *
+ * We expect that LockAcquire made sure there is room to add a new
+ * ResourceOwner entry.
+ */
+static void
+GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ int i;
+
+ Assert(locallock->numLockOwners < locallock->maxLockOwners);
+ /* Count the total */
+ locallock->nLocks++;
+ /* Count the per-owner lock */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ lockOwners[i].nLocks++;
+ return;
+ }
+ }
+ lockOwners[i].owner = owner;
+ lockOwners[i].nLocks = 1;
+ locallock->numLockOwners++;
+ if (owner != NULL)
+ ResourceOwnerRememberLock(owner, locallock);
+
+ /* Indicate that the lock is acquired for certain types of locks. */
+ CheckAndSetLockHeld(locallock, true);
+}
+
+/*
+ * BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK,
+ * and arrange for error cleanup if it fails
+ */
+static void
+BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode)
+{
+ Assert(StrongLockInProgress == NULL);
+ Assert(locallock->holdsStrongLockCount == false);
+
+ /*
+ * Adding to a memory location is not atomic, so we take a spinlock to
+ * ensure we don't collide with someone else trying to bump the count at
+ * the same time.
+ *
+ * XXX: It might be worth considering using an atomic fetch-and-add
+ * instruction here, on architectures where that is supported.
+ */
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ locallock->holdsStrongLockCount = true;
+ StrongLockInProgress = locallock;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * FinishStrongLockAcquire - cancel pending cleanup for a strong lock
+ * acquisition once it's no longer needed
+ */
+static void
+FinishStrongLockAcquire(void)
+{
+ StrongLockInProgress = NULL;
+}
+
+/*
+ * AbortStrongLockAcquire - undo strong lock state changes performed by
+ * BeginStrongLockAcquire.
+ */
+void
+AbortStrongLockAcquire(void)
+{
+ uint32 fasthashcode;
+ LOCALLOCK *locallock = StrongLockInProgress;
+
+ if (locallock == NULL)
+ return;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+ Assert(locallock->holdsStrongLockCount == true);
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ StrongLockInProgress = NULL;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing
+ * WaitOnLock on.
+ *
+ * proc.c needs this for the case where we are booted off the lock by
+ * timeout, but discover that someone granted us the lock anyway.
+ *
+ * We could just export GrantLockLocal, but that would require including
+ * resowner.h in lock.h, which creates circularity.
+ */
+void
+GrantAwaitedLock(void)
+{
+ GrantLockLocal(awaitedLock, awaitedOwner);
+}
+
+/*
+ * MarkLockClear -- mark an acquired lock as "clear"
+ *
+ * This means that we know we have absorbed all sinval messages that other
+ * sessions generated before we acquired this lock, and so we can confidently
+ * assume we know about any catalog changes protected by this lock.
+ */
+void
+MarkLockClear(LOCALLOCK *locallock)
+{
+ Assert(locallock->nLocks > 0);
+ locallock->lockCleared = true;
+}
+
+/*
+ * WaitOnLock -- wait to acquire a lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process.
+ *
+ * The appropriate partition lock must be held at entry.
+ */
+static void
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
+ LockMethod lockMethodTable = LockMethods[lockmethodid];
+ char *volatile new_status = NULL;
+
+ LOCK_PRINT("WaitOnLock: sleeping on lock",
+ locallock->lock, locallock->tag.mode);
+
+ /* Report change to waiting status */
+ if (update_process_title)
+ {
+ const char *old_status;
+ int len;
+
+ old_status = get_ps_display(&len);
+ new_status = (char *) palloc(len + 8 + 1);
+ memcpy(new_status, old_status, len);
+ strcpy(new_status + len, " waiting");
+ set_ps_display(new_status);
+ new_status[len] = '\0'; /* truncate off " waiting" */
+ }
+
+ awaitedLock = locallock;
+ awaitedOwner = owner;
+
+ /*
+ * NOTE: Think not to put any shared-state cleanup after the call to
+ * ProcSleep, in either the normal or failure path. The lock state must
+ * be fully set by the lock grantor, or by CheckDeadLock if we give up
+ * waiting for the lock. This is necessary because of the possibility
+ * that a cancel/die interrupt will interrupt ProcSleep after someone else
+ * grants us the lock, but before we've noticed it. Hence, after granting,
+ * the locktable state must fully reflect the fact that we own the lock;
+ * we can't do additional work on return.
+ *
+ * We can and do use a PG_TRY block to try to clean up after failure, but
+ * this still has a major limitation: elog(FATAL) can occur while waiting
+ * (eg, a "die" interrupt), and then control won't come back here. So all
+ * cleanup of essential state should happen in LockErrorCleanup, not here.
+ * We can use PG_TRY to clear the "waiting" status flags, since doing that
+ * is unimportant if the process exits.
+ */
+ PG_TRY();
+ {
+ if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK)
+ {
+ /*
+ * We failed as a result of a deadlock, see CheckDeadLock(). Quit
+ * now.
+ */
+ awaitedLock = NULL;
+ LOCK_PRINT("WaitOnLock: aborting on lock",
+ locallock->lock, locallock->tag.mode);
+ LWLockRelease(LockHashPartitionLock(locallock->hashcode));
+
+ /*
+ * Now that we aren't holding the partition lock, we can give an
+ * error report including details about the detected deadlock.
+ */
+ DeadLockReport();
+ /* not reached */
+ }
+ }
+ PG_CATCH();
+ {
+ /* In this path, awaitedLock remains set until LockErrorCleanup */
+
+ /* Report change to non-waiting status */
+ if (update_process_title)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+
+ /* and propagate the error */
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ awaitedLock = NULL;
+
+ /* Report change to non-waiting status */
+ if (update_process_title)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+
+ LOCK_PRINT("WaitOnLock: wakeup on lock",
+ locallock->lock, locallock->tag.mode);
+}
+
+/*
+ * Remove a proc from the wait-queue it is on (caller must know it is on one).
+ * This is only used when the proc has failed to get the lock, so we set its
+ * waitStatus to PROC_WAIT_STATUS_ERROR.
+ *
+ * Appropriate partition lock must be held by caller. Also, caller is
+ * responsible for signaling the proc if needed.
+ *
+ * NB: this does not clean up any locallock object that may exist for the lock.
+ */
+void
+RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode)
+{
+ LOCK *waitLock = proc->waitLock;
+ PROCLOCK *proclock = proc->waitProcLock;
+ LOCKMODE lockmode = proc->waitLockMode;
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock);
+
+ /* Make sure proc is waiting */
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+ Assert(proc->links.next != NULL);
+ Assert(waitLock);
+ Assert(waitLock->waitProcs.size > 0);
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+
+ /* Remove proc from lock's wait queue */
+ SHMQueueDelete(&(proc->links));
+ waitLock->waitProcs.size--;
+
+ /* Undo increments of request counts by waiting process */
+ Assert(waitLock->nRequested > 0);
+ Assert(waitLock->nRequested > proc->waitLock->nGranted);
+ waitLock->nRequested--;
+ Assert(waitLock->requested[lockmode] > 0);
+ waitLock->requested[lockmode]--;
+ /* don't forget to clear waitMask bit if appropriate */
+ if (waitLock->granted[lockmode] == waitLock->requested[lockmode])
+ waitLock->waitMask &= LOCKBIT_OFF(lockmode);
+
+ /* Clean up the proc's own state, and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = PROC_WAIT_STATUS_ERROR;
+
+ /*
+ * Delete the proclock immediately if it represents no already-held locks.
+ * (This must happen now because if the owner of the lock decides to
+ * release it, and the requested/granted counts then go to zero,
+ * LockRelease expects there to be no remaining proclocks.) Then see if
+ * any other waiters for the lock can be woken up now.
+ */
+ CleanUpLock(waitLock, proclock,
+ LockMethods[lockmethodid], hashcode,
+ true);
+}
+
+/*
+ * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it.
+ * Release a session lock if 'sessionLock' is true, else release a
+ * regular transaction lock.
+ *
+ * Side Effects: find any waiting processes that are now wakable,
+ * grant them their requested locks and awaken them.
+ * (We have to grant the lock here to avoid a race between
+ * the waking process and any new process to
+ * come along and request the lock.)
+ */
+bool
+LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockRelease: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ (void *) &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * Let the caller print its own error message, too; do not ereport(ERROR)
+ * here. We just issue a WARNING below and return false.
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Decrease the count for the resource owner.
+ */
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ ResourceOwner owner;
+ int i;
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (--lockOwners[i].nLocks == 0)
+ {
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ break;
+ }
+ }
+ if (i < 0)
+ {
+ /* don't release a lock belonging to another owner */
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+ }
+
+ /*
+ * Decrease the total local count. If we're still holding the lock, we're
+ * done.
+ */
+ locallock->nLocks--;
+
+ if (locallock->nLocks > 0)
+ return true;
+
+ /*
+ * At this point we can no longer suppose we are clear of invalidation
+ * messages related to this lock. Although we'll delete the LOCALLOCK
+ * object before any intentional return from this routine, it seems worth
+ * the trouble to explicitly reset lockCleared right now, just in case
+ * some error prevents us from deleting the LOCALLOCK.
+ */
+ locallock->lockCleared = false;
+
+ /* Attempt fast release of any lock eligible for the fast path. */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount > 0)
+ {
+ bool released;
+
+ /*
+ * We might not find the lock here, even if we originally entered it
+ * here. Another backend may have moved it to the main table.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ released = FastPathUnGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (released)
+ {
+ RemoveLocalLock(locallock);
+ return true;
+ }
+ }
+
+ /*
+ * Otherwise we've got to mess with the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Normally, we don't need to re-find the lock or proclock, since we kept
+ * their addresses in the locallock table, and they couldn't have been
+ * removed while we were holding a lock on them. But it's possible that
+ * the lock was taken fast-path and has since been moved to the main hash
+ * table by another backend, in which case we will need to look up the
+ * objects here. We assume the lock field is NULL if so.
+ */
+ lock = locallock->lock;
+ if (!lock)
+ {
+ PROCLOCKTAG proclocktag;
+
+ Assert(EligibleForRelationFastPath(locktag, lockmode));
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+ locallock->lock = lock;
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+ locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ HASH_FIND,
+ NULL);
+ if (!locallock->proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ }
+ LOCK_PRINT("LockRelease: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockRelease: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, locallock->hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ RemoveLocalLock(locallock);
+ return true;
+}
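+
+/*
+ * Illustrative only -- not part of lock.c.  A typical caller pairs
+ * LockAcquire with LockRelease on the same locktag and mode, roughly:
+ *
+ *     LOCKTAG     tag;
+ *
+ *     SET_LOCKTAG_RELATION(tag, MyDatabaseId, reloid);
+ *     (void) LockAcquire(&tag, AccessShareLock, false, false);
+ *     ... do work ...
+ *     LockRelease(&tag, AccessShareLock, false);
+ *
+ * Most backend code reaches these routines through the wrappers in lmgr.c
+ * (LockRelationOid and friends) rather than calling them directly.
+ */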
+
+/*
+ * LockReleaseAll -- Release all locks of the specified lock method that
+ * are held by the current process.
+ *
+ * Well, not necessarily *all* locks. The available behaviors are:
+ * allLocks == true: release all locks including session locks.
+ * allLocks == false: release all non-session locks.
+ */
+void
+LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
+{
+ HASH_SEQ_STATUS status;
+ LockMethod lockMethodTable;
+ int i,
+ numLockModes;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ int partition;
+ bool have_fast_path_lwlock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid);
+#endif
+
+ /*
+ * Get rid of our fast-path VXID lock, if appropriate. Note that this is
+ * the only way that the lock we hold on our own VXID can ever get
+ * released: it is always and only released when a toplevel transaction
+ * ends.
+ */
+ if (lockmethodid == DEFAULT_LOCKMETHOD)
+ VirtualXactLockTableCleanup();
+
+ numLockModes = lockMethodTable->numLockModes;
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and get rid of those. We
+ * do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers. Fast-path locks are cleaned up during the locallock table
+ * scan, though.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /*
+ * If the LOCALLOCK entry is unused, we must've run out of shared
+ * memory while trying to set up this lock. Just forget the local
+ * entry.
+ */
+ if (locallock->nLocks == 0)
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ /*
+ * If we are asked to release all locks, we can just zap the entry.
+ * Otherwise, must scan to see if there are session locks. We assume
+ * there is at most one lockOwners entry for session locks.
+ */
+ if (!allLocks)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+
+ /* If session lock is above array position 0, move it down to 0 */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == NULL)
+ lockOwners[0] = lockOwners[i];
+ else
+ ResourceOwnerForgetLock(lockOwners[i].owner, locallock);
+ }
+
+ if (locallock->numLockOwners > 0 &&
+ lockOwners[0].owner == NULL &&
+ lockOwners[0].nLocks > 0)
+ {
+ /* Fix the locallock to show just the session locks */
+ locallock->nLocks = lockOwners[0].nLocks;
+ locallock->numLockOwners = 1;
+ /* We aren't deleting this locallock, so done */
+ continue;
+ }
+ else
+ locallock->numLockOwners = 0;
+ }
+
+ /*
+ * If the lock or proclock pointers are NULL, this lock was taken via
+ * the relation fast-path (and is not known to have been transferred).
+ */
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ LOCKMODE lockmode = locallock->tag.mode;
+ Oid relid;
+
+ /* Verify that a fast-path lock is what we've got. */
+ if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode))
+ elog(PANIC, "locallock table corrupted");
+
+ /*
+ * If we don't currently hold the LWLock that protects our
+ * fast-path data structures, we must acquire it before attempting
+ * to release the lock via the fast-path. We will continue to
+ * hold the LWLock until we're done scanning the locallock table,
+ * unless we hit a transferred fast-path lock. (XXX is this
+ * really such a good idea? There could be a lot of entries ...)
+ */
+ if (!have_fast_path_lwlock)
+ {
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ have_fast_path_lwlock = true;
+ }
+
+ /* Attempt fast-path release. */
+ relid = locallock->tag.lock.locktag_field2;
+ if (FastPathUnGrantRelationLock(relid, lockmode))
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /*
+ * Our lock, originally taken via the fast path, has been
+ * transferred to the main lock table. That's going to require
+ * some extra work, so release our fast-path lock before starting.
+ */
+ LWLockRelease(&MyProc->fpInfoLock);
+ have_fast_path_lwlock = false;
+
+ /*
+ * Now dump the lock. We haven't got a pointer to the LOCK or
+ * PROCLOCK in this case, so we have to handle this a bit
+ * differently than a normal lock release. Unfortunately, this
+ * requires an extra LWLock acquire-and-release cycle on the
+ * partitionLock, but hopefully it shouldn't happen often.
+ */
+ LockRefindAndRelease(lockMethodTable, MyProc,
+ &locallock->tag.lock, lockmode, false);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /* Done with the fast-path data structures */
+ if (have_fast_path_lwlock)
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]);
+ PROCLOCK *nextplock;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is trickier than
+ * it looks, because another backend could be in process of adding
+ * something to our proclock list due to promoting one of our
+ * fast-path locks. However, any such lock must be one that we
+ * decided not to delete above, so it's okay to skip it again now;
+ * we'd just decide not to delete it again. We must, however, be
+ * careful to re-fetch the list header once we've acquired the
+ * partition lock, to be sure we have a valid, up-to-date pointer.
+ * (There is probably no significant risk if pointer fetch/store is
+ * atomic, but we don't wish to assume that.)
+ *
+ * XXX This argument assumes that the locallock table correctly
+ * represents all of our fast-path locks. While allLocks mode
+ * guarantees to clean up all of our normal locks regardless of the
+ * locallock situation, we lose that guarantee for fast-path locks.
+ * This is not ideal.
+ */
+ if (SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink)) == NULL)
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+ proclock;
+ proclock = nextplock)
+ {
+ bool wakeupNeeded = false;
+
+ /* Get link first, since we may unlink/delete this proclock */
+ nextplock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+ continue;
+
+ /*
+ * In allLocks mode, force release of all locks even if locallock
+ * table had problems
+ */
+ if (allLocks)
+ proclock->releaseMask = proclock->holdMask;
+ else
+ Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+
+ /*
+ * Ignore items that have nothing to be released, unless they have
+ * holdMask == 0 and are therefore recyclable
+ */
+ if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+ continue;
+
+ PROCLOCK_PRINT("LockReleaseAll", proclock);
+ LOCK_PRINT("LockReleaseAll", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /*
+ * Release the previously-marked lock modes
+ */
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if (proclock->releaseMask & LOCKBIT_ON(i))
+ wakeupNeeded |= UnGrantLock(lock, i, proclock,
+ lockMethodTable);
+ }
+ Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+
+ proclock->releaseMask = 0;
+
+ /* CleanUpLock will wake up waiters if needed. */
+ CleanUpLock(lock, proclock,
+ lockMethodTable,
+ LockTagHashCode(&lock->tag),
+ wakeupNeeded);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll done");
+#endif
+}
+
+/*
+ * LockReleaseSession -- Release all session locks of the specified lock method
+ * that are held by the current process.
+ */
+void
+LockReleaseSession(LOCKMETHODID lockmethodid)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /* Ignore items that are not of the specified lock method */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ ReleaseLockIfHeld(locallock, true);
+ }
+}
+
+/*
+ * LockReleaseCurrentOwner
+ * Release all locks belonging to CurrentResourceOwner
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly when a lot of locks are held.
+ * Otherwise, pass NULL for locallocks, and we'll traverse through our hash
+ * table to find them.
+ */
+void
+LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ ReleaseLockIfHeld(locallock, false);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ ReleaseLockIfHeld(locallocks[i], false);
+ }
+}
+
+/*
+ * ReleaseLockIfHeld
+ * Release any session-level locks on this lockable object if sessionLock
+ * is true; else, release any locks held by CurrentResourceOwner.
+ *
+ * It is tempting to pass this a ResourceOwner pointer (or NULL for session
+ * locks), but without refactoring LockRelease() we cannot support releasing
+ * locks belonging to resource owners other than CurrentResourceOwner.
+ * If we were to refactor, it'd be a good idea to fix it so we don't have to
+ * do a hashtable lookup of the locallock, too. However, currently this
+ * function isn't used heavily enough to justify refactoring for its
+ * convenience.
+ */
+static void
+ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock)
+{
+ ResourceOwner owner;
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+
+ /* Identify owner for lock (must match LockRelease!) */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /* Scan to see if there are any locks belonging to the target owner */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (lockOwners[i].nLocks < locallock->nLocks)
+ {
+ /*
+ * We will still hold this lock after forgetting this
+ * ResourceOwner.
+ */
+ locallock->nLocks -= lockOwners[i].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ else
+ {
+ Assert(lockOwners[i].nLocks == locallock->nLocks);
+ /* We want to call LockRelease just once */
+ lockOwners[i].nLocks = 1;
+ locallock->nLocks = 1;
+ if (!LockRelease(&locallock->tag.lock,
+ locallock->tag.mode,
+ sessionLock))
+ elog(WARNING, "ReleaseLockIfHeld: failed??");
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * LockReassignCurrentOwner
+ * Reassign all locks belonging to CurrentResourceOwner to belong
+ * to its parent resource owner.
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly when a lot of locks are held
+ * (e.g. pg_dump with a large schema). Otherwise, pass NULL for locallocks,
+ * and we'll traverse through our hash table to find them.
+ */
+void
+LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner);
+
+ Assert(parent != NULL);
+
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ LockReassignOwner(locallock, parent);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ LockReassignOwner(locallocks[i], parent);
+ }
+}
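+
+/*
+ * Illustrative note (a sketch of the usual call path, not authoritative):
+ * resowner.c invokes LockReassignCurrentOwner at subtransaction commit, so
+ * that the subtransaction's locks are handed up to the parent resource owner
+ * rather than released; actual release happens at subtransaction abort or at
+ * top-level transaction end via LockReleaseCurrentOwner/LockReleaseAll.
+ */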
+
+/*
+ * Subroutine of LockReassignCurrentOwner. Reassigns a given lock belonging to
+ * CurrentResourceOwner to its parent.
+ */
+static void
+LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent)
+{
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+ int ic = -1;
+ int ip = -1;
+
+ /*
+ * Scan to see if there are any locks belonging to current owner or its
+ * parent
+ */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == CurrentResourceOwner)
+ ic = i;
+ else if (lockOwners[i].owner == parent)
+ ip = i;
+ }
+
+ if (ic < 0)
+ return; /* no current locks */
+
+ if (ip < 0)
+ {
+ /* Parent has no slot, so just give it the child's slot */
+ lockOwners[ic].owner = parent;
+ ResourceOwnerRememberLock(parent, locallock);
+ }
+ else
+ {
+ /* Merge child's count with parent's */
+ lockOwners[ip].nLocks += lockOwners[ic].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (ic < locallock->numLockOwners)
+ lockOwners[ic] = lockOwners[locallock->numLockOwners];
+ }
+ ResourceOwnerForgetLock(CurrentResourceOwner, locallock);
+}
+
+/*
+ * FastPathGrantRelationLock
+ * Grant lock using per-backend fast-path array, if there is space.
+ */
+static bool
+FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND;
+
+ /* Scan for existing entry for this relid, remembering empty slot. */
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (FAST_PATH_GET_BITS(MyProc, f) == 0)
+ unused_slot = f;
+ else if (MyProc->fpRelId[f] == relid)
+ {
+ Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode));
+ FAST_PATH_SET_LOCKMODE(MyProc, f, lockmode);
+ return true;
+ }
+ }
+
+ /* If no existing entry, use any empty slot. */
+ if (unused_slot < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ MyProc->fpRelId[unused_slot] = relid;
+ FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode);
+ ++FastPathLocalUseCount;
+ return true;
+ }
+
+ /* No existing entry, and no empty slot. */
+ return false;
+}
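+
+/*
+ * Illustrative sketch only (the real macros are defined earlier in this
+ * file): each backend advertises up to FP_LOCK_SLOTS_PER_BACKEND relations in
+ * fpRelId[], with FAST_PATH_BITS_PER_SLOT lockmode bits per slot starting at
+ * FAST_PATH_LOCKNUMBER_OFFSET.  Assuming the weak relation lock modes are the
+ * eligible ones (see EligibleForRelationFastPath), the per-slot bits decode
+ * roughly as:
+ *
+ *     bits = FAST_PATH_GET_BITS(MyProc, f);
+ *     (bits & 1) != 0     => AccessShareLock held
+ *     (bits & 2) != 0     => RowShareLock held
+ *     (bits & 4) != 0     => RowExclusiveLock held
+ */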
+
+/*
+ * FastPathUnGrantRelationLock
+ * Release fast-path lock, if present. Update backend-private local
+ * use count, while we're at it.
+ */
+static bool
+FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ bool result = false;
+
+ FastPathLocalUseCount = 0;
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (MyProc->fpRelId[f] == relid
+ && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ {
+ Assert(!result);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+ result = true;
+ /* we continue iterating so as to update FastPathLocalUseCount */
+ }
+ if (FAST_PATH_GET_BITS(MyProc, f) != 0)
+ ++FastPathLocalUseCount;
+ }
+ return result;
+}
+
+/*
+ * FastPathTransferRelationLocks
+ * Transfer locks matching the given lock tag from per-backend fast-path
+ * arrays to the shared hash table.
+ *
+ * Returns true if successful, false if ran out of shared memory.
+ */
+static bool
+FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag,
+ uint32 hashcode)
+{
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 i;
+
+ /*
+ * Every PGPROC that can potentially hold a fast-path lock is present in
+ * ProcGlobal->allProcs. Prepared transactions are not, but any
+ * outstanding fast-path locks held by prepared transactions are
+ * transferred to the main lock table.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ /*
+ * If the target backend isn't referencing the same database as the
+ * lock, then we needn't examine the individual relation IDs at all;
+ * none of them can be relevant.
+ *
+ * proc->databaseId is set at backend startup time and never changes
+ * thereafter, so it might be safe to perform this test before
+ * acquiring &proc->fpInfoLock. In particular, it's certainly safe to
+ * assume that if the target backend holds any fast-path locks, it
+ * must have performed a memory-fencing operation (in particular, an
+ * LWLock acquisition) since setting proc->databaseId. However, it's
+ * less clear that our backend is certain to have performed a memory
+ * fencing operation since the other backend set proc->databaseId. So
+ * for now, we test it after acquiring the LWLock just to be safe.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0)
+ continue;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET;
+ lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT;
+ ++lockmode)
+ {
+ PROCLOCK *proclock;
+
+ if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode))
+ continue;
+ proclock = SetupLockInTable(lockMethodTable, proc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode);
+ }
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ return true;
+}
+
+/*
+ * FastPathGetRelationLockEntry
+ * Return the PROCLOCK for a lock originally taken via the fast-path,
+ * transferring it to the primary lock table if necessary.
+ *
+ * Note: caller takes care of updating the locallock object.
+ */
+static PROCLOCK *
+FastPathGetRelationLockEntry(LOCALLOCK *locallock)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+ LOCKTAG *locktag = &locallock->tag.lock;
+ PROCLOCK *proclock = NULL;
+ LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 f;
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0)
+ continue;
+
+ /* If we don't have a lock of the given mode, forget it! */
+ lockmode = locallock->tag.mode;
+ if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ break;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ locallock->hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&MyProc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /* Lock may have already been transferred by some other backend. */
+ if (proclock == NULL)
+ {
+ LOCK *lock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode);
+ proclock = (PROCLOCK *)
+ hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ LWLockRelease(partitionLock);
+ }
+
+ return proclock;
+}
+
+/*
+ * GetLockConflicts
+ * Get an array of VirtualTransactionIds of xacts currently holding locks
+ * that would conflict with the specified lock/lockmode.
+ * xacts merely awaiting such a lock are NOT reported.
+ *
+ * The result array is palloc'd and is terminated with an invalid VXID.
+ * *countp, if not null, is updated to the number of items set.
+ *
+ * Of course, the result could be out of date by the time it's returned, so
+ * use of this function has to be thought about carefully. Similarly, a
+ * PGPROC with no "lxid" will be considered non-conflicting regardless of any
+ * lock it holds. Existing callers don't care about a locker after that
+ * locker's pg_xact updates complete. CommitTransaction() clears "lxid" after
+ * pg_xact updates and before releasing locks.
+ *
+ * Note we never include the current xact's vxid in the result array,
+ * since an xact never blocks itself.
+ */
+VirtualTransactionId *
+GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
+{
+ static VirtualTransactionId *vxids;
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCK *lock;
+ LOCKMASK conflictMask;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int count = 0;
+ int fast_count = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ /*
+ * Allocate memory to store results, and fill with InvalidVXID. We only
+ * need enough space for MaxBackends + max_prepared_xacts + a terminator.
+ * When InHotStandby, allocate the array just once in TopMemoryContext.
+ */
+ if (InHotStandby)
+ {
+ if (vxids == NULL)
+ vxids = (VirtualTransactionId *)
+ MemoryContextAlloc(TopMemoryContext,
+ sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+ }
+ else
+ vxids = (VirtualTransactionId *)
+ palloc0(sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+
+ /* Compute hash code and partition lock, and look up conflicting modes. */
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ conflictMask = lockMethodTable->conflictTab[lockmode];
+
+ /*
+ * Fast path locks might not have been entered in the primary lock table.
+ * If the lock we're dealing with could conflict with such a lock, we must
+ * examine each backend's fast-path array for conflicts.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ int i;
+ Oid relid = locktag->locktag_field2;
+ VirtualTransactionId vxid;
+
+ /*
+ * Iterate over relevant PGPROCs. Anything held by a prepared
+ * transaction will have been transferred to the primary lock table,
+ * so we need not worry about those. This is all a bit fuzzy, because
+ * new locks could be taken after we've visited a particular
+ * partition, but the callers had better be prepared to deal with that
+ * anyway, since the locks could equally well be taken between the
+ * time we return the value and the time the caller does something
+ * with it.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ /* A backend never blocks itself */
+ if (proc == MyProc)
+ continue;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ /*
+ * If the target backend isn't referencing the same database as
+ * the lock, then we needn't examine the individual relation IDs
+ * at all; none of them can be relevant.
+ *
+ * See FastPathTransferRelationLocks() for discussion of why we do
+ * this test after acquiring the lock.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmask;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f])
+ continue;
+ lockmask = FAST_PATH_GET_BITS(proc, f);
+ if (!lockmask)
+ continue;
+ lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET;
+
+ /*
+ * There can only be one entry per relation, so if we found it
+ * and it doesn't conflict, we can skip the rest of the slots.
+ */
+ if ((lockmask & conflictMask) == 0)
+ break;
+
+ /* Conflict! */
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ /* else, xact already committed or aborted */
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ }
+
+ /* Remember how many fast-path conflicts we found. */
+ fast_count = count;
+
+ /*
+ * Look up the lock object matching the tag.
+ */
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ {
+ /*
+ * If the lock object doesn't exist, there is nothing holding a lock
+ * on this lockable object.
+ */
+ LWLockRelease(partitionLock);
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+ }
+
+ /*
+ * Examine each existing holder (or awaiter) of the lock.
+ */
+
+ procLocks = &(lock->procLocks);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ if (conflictMask & proclock->holdMask)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+
+ /* A backend never blocks itself */
+ if (proc != MyProc)
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ {
+ int i;
+
+ /* Avoid duplicate entries. */
+ for (i = 0; i < fast_count; ++i)
+ if (VirtualTransactionIdEquals(vxids[i], vxid))
+ break;
+ if (i >= fast_count)
+ vxids[count++] = vxid;
+ }
+ /* else, xact already committed or aborted */
+ }
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (count > MaxBackends + max_prepared_xacts) /* should never happen */
+ elog(PANIC, "too many conflicting locks found");
+
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+}
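+
+/*
+ * Illustrative only: callers typically walk the returned array up to the
+ * invalid-VXID terminator, waiting out each conflicting transaction, roughly:
+ *
+ *     VirtualTransactionId *vxids = GetLockConflicts(&tag, lockmode, NULL);
+ *
+ *     while (VirtualTransactionIdIsValid(*vxids))
+ *         VirtualXactLock(*vxids++, true);
+ *
+ * which is essentially what WaitForLockers() in lmgr.c does.
+ */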
+
+/*
+ * Find a lock in the shared lock table and release it. It is the caller's
+ * responsibility to verify that this is a sane thing to do. (For example, it
+ * would be bad to release a lock here if there might still be a LOCALLOCK
+ * object with pointers to it.)
+ *
+ * We currently use this in two situations: first, to release locks held by
+ * prepared transactions on commit (see lock_twophase_postcommit); and second,
+ * to release locks taken via the fast-path, transferred to the main hash
+ * table, and then released (see LockReleaseAll).
+ */
+static void
+LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-find the lock object (it had better be there).
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(PANIC, "failed to re-find shared lock object");
+
+ /*
+ * Re-find the proclock object (ditto).
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(PANIC, "failed to re-find shared proclock object");
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Decrement strong lock count. This logic is needed only for 2PC.
+ */
+ if (decrement_strong_lock_count
+ && ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+}
+
+/*
+ * CheckForSessionAndXactLocks
+ * Check to see if transaction holds both session-level and xact-level
+ * locks on the same object; if so, throw an error.
+ *
+ * If we have both session- and transaction-level locks on the same object,
+ * PREPARE TRANSACTION must fail. This should never happen with regular
+ * locks, since we only take those at session level in some special operations
+ * like VACUUM. It's possible to hit this with advisory locks, though.
+ *
+ * It would be nice if we could keep the session hold and give away the
+ * transactional hold to the prepared xact. However, that would require two
+ * PROCLOCK objects, and we cannot be sure that another PROCLOCK will be
+ * available when it comes time for PostPrepare_Locks to do the deed.
+ * So for now, we error out while we can still do so safely.
+ *
+ * Since the LOCALLOCK table stores a separate entry for each lockmode,
+ * we can't implement this check by examining LOCALLOCK entries in isolation.
+ * We must build a transient hashtable that is indexed by locktag only.
+ */
+static void
+CheckForSessionAndXactLocks(void)
+{
+ typedef struct
+ {
+ LOCKTAG lock; /* identifies the lockable object */
+ bool sessLock; /* is any lockmode held at session level? */
+ bool xactLock; /* is any lockmode held at xact level? */
+ } PerLockTagEntry;
+
+ HASHCTL hash_ctl;
+ HTAB *lockhtab;
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* Create a local hash table keyed by LOCKTAG only */
+ hash_ctl.keysize = sizeof(LOCKTAG);
+ hash_ctl.entrysize = sizeof(PerLockTagEntry);
+ hash_ctl.hcxt = CurrentMemoryContext;
+
+ lockhtab = hash_create("CheckForSessionAndXactLocks table",
+ 256, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* Scan local lock table to find entries for each LOCKTAG */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ PerLockTagEntry *hentry;
+ bool found;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Otherwise, find or make an entry in lockhtab */
+ hentry = (PerLockTagEntry *) hash_search(lockhtab,
+ (void *) &locallock->tag.lock,
+ HASH_ENTER, &found);
+ if (!found) /* initialize, if newly created */
+ hentry->sessLock = hentry->xactLock = false;
+
+ /* Scan to see if we hold lock at session or xact level or both */
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ hentry->sessLock = true;
+ else
+ hentry->xactLock = true;
+ }
+
+ /*
+ * We can throw error immediately when we see both types of locks; no
+ * need to wait around to see if there are more violations.
+ */
+ if (hentry->sessLock && hentry->xactLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+ }
+
+ /* Success, so clean up */
+ hash_destroy(lockhtab);
+}
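+
+/*
+ * Illustrative only: one way to trip this check from SQL is to mix session-
+ * level and transaction-level advisory locks on the same key within one
+ * transaction, e.g. roughly:
+ *
+ *     BEGIN;
+ *     SELECT pg_advisory_lock(42);        -- session-level hold
+ *     SELECT pg_advisory_xact_lock(42);   -- xact-level hold on the same tag
+ *     PREPARE TRANSACTION 'demo';         -- fails with the error above
+ *
+ * (assuming max_prepared_transactions is nonzero so PREPARE is allowed).
+ */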
+
+/*
+ * AtPrepare_Locks
+ * Do the preparatory work for a PREPARE: make 2PC state file records
+ * for all locks currently held.
+ *
+ * Session-level locks are ignored, as are VXID locks.
+ *
+ * For the most part, we don't need to touch shared memory for this ---
+ * all the necessary state information is in the locallock table.
+ * Fast-path locks are an exception, however: we move any such locks to
+ * the main table before allowing PREPARE TRANSACTION to succeed.
+ */
+void
+AtPrepare_Locks(void)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* First, verify there aren't locks of both xact and session level */
+ CheckForSessionAndXactLocks();
+
+ /* Now do the per-locallock cleanup work */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ TwoPhaseLockRecord record;
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /*
+ * If the local lock was taken via the fast-path, we need to move it
+ * to the primary lock table, or just get a pointer to the existing
+ * primary lock table entry if by chance it's already been
+ * transferred.
+ */
+ if (locallock->proclock == NULL)
+ {
+ locallock->proclock = FastPathGetRelationLockEntry(locallock);
+ locallock->lock = locallock->proclock->tag.myLock;
+ }
+
+ /*
+ * Arrange to not release any strong lock count held by this lock
+ * entry. We must retain the count until the prepared transaction is
+ * committed or rolled back.
+ */
+ locallock->holdsStrongLockCount = false;
+
+ /*
+ * Create a 2PC record.
+ */
+ memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG));
+ record.lockmode = locallock->tag.mode;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+ &record, sizeof(TwoPhaseLockRecord));
+ }
+}
+
+/*
+ * PostPrepare_Locks
+ * Clean up after successful PREPARE
+ *
+ * Here, we want to transfer ownership of our locks to a dummy PGPROC
+ * that's now associated with the prepared transaction, and we want to
+ * clean out the corresponding entries in the LOCALLOCK table.
+ *
+ * Note: by removing the LOCALLOCK entries, we are leaving dangling
+ * pointers in the transaction's resource owner. This is OK at the
+ * moment since resowner.c doesn't try to free locks retail at a toplevel
+ * transaction commit or abort. We could alternatively zero out nLocks
+ * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll,
+ * but that probably costs more cycles.
+ */
+void
+PostPrepare_Locks(TransactionId xid)
+{
+ PGPROC *newproc = TwoPhaseGetDummyProc(xid, false);
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ int partition;
+
+ /* Can't prepare a lock group follower. */
+ Assert(MyProc->lockGroupLeader == NULL ||
+ MyProc->lockGroupLeader == MyProc);
+
+ /* This is a critical section: any error means big trouble */
+ START_CRIT_SECTION();
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and transfer them to the
+ * target proc.
+ *
+ * We do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ /*
+ * We must've run out of shared memory while trying to set up this
+ * lock. Just forget the local entry.
+ */
+ Assert(locallock->nLocks == 0);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore VXID locks */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(PANIC,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]);
+ PROCLOCK *nextplock;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is safer than the
+ * situation in LockReleaseAll, because we got rid of any fast-path
+ * locks during AtPrepare_Locks, so there cannot be any case where
+ * another backend is adding something to our lists now. For safety,
+ * though, we code this the same way as in LockReleaseAll.
+ */
+ if (SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink)) == NULL)
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+ proclock;
+ proclock = nextplock)
+ {
+ /* Get link first, since we may unlink/relink this proclock */
+ nextplock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore VXID locks */
+ if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+ LOCK_PRINT("PostPrepare_Locks", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /* Ignore it if nothing to release (must be a session lock) */
+ if (proclock->releaseMask == 0)
+ continue;
+
+ /* Else we should be releasing all locks */
+ if (proclock->releaseMask != proclock->holdMask)
+ elog(PANIC, "we seem to have dropped a bit somewhere");
+
+ /*
+ * We cannot simply modify proclock->tag.myProc to reassign
+ * ownership of the lock, because that's part of the hash key and
+ * the proclock would then be in the wrong hash chain. Instead
+ * use hash_update_hash_key. (We used to create a new hash entry,
+ * but that risks out-of-memory failure if other processes are
+ * busy making proclocks too.) We must unlink the proclock from
+ * our procLink chain and put it into the new proc's chain, too.
+ *
+ * Note: the updated proclock hash key will still belong to the
+ * same hash partition, cf proclock_hash(). So the partition lock
+ * we already hold is sufficient for this.
+ */
+ SHMQueueDelete(&proclock->procLink);
+
+ /*
+ * Create the new hash key for the proclock.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = newproc;
+
+ /*
+ * Update groupLeader pointer to point to the new proc. (We'd
+ * better not be a member of somebody else's lock group!)
+ */
+ Assert(proclock->groupLeader == proclock->tag.myProc);
+ proclock->groupLeader = newproc;
+
+ /*
+ * Update the proclock. We should not find any existing entry for
+ * the same hash key, since there can be only one entry for any
+ * given lock with my own proc.
+ */
+ if (!hash_update_hash_key(LockMethodProcLockHash,
+ (void *) proclock,
+ (void *) &proclocktag))
+ elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks");
+
+ /* Re-link into the new proc's proclock list */
+ SHMQueueInsertBefore(&(newproc->myProcLocks[partition]),
+ &proclock->procLink);
+
+ PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+ END_CRIT_SECTION();
+}
+
+
+/*
+ * Estimate shared-memory space used for lock tables
+ */
+Size
+LockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* lock hash table */
+ max_table_size = NLOCKENTS();
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK)));
+
+ /* proclock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
+
+ /*
+ * Since NLOCKENTS is only an estimate, add 10% safety margin.
+ */
+ size = add_size(size, size / 10);
+
+ return size;
+}
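+
+/*
+ * Illustrative arithmetic only, assuming NLOCKENTS() works out to
+ * max_locks_per_transaction * (MaxBackends + max_prepared_xacts): with the
+ * default max_locks_per_transaction = 64 and, say, roughly 100 backends and
+ * no prepared transactions, we'd size the LOCK hash for about 6400 entries
+ * and the PROCLOCK hash for about 12800, plus the 10% safety margin.
+ */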
+
+/*
+ * GetLockStatusData - Return a summary of the lock manager's internal
+ * status, for use in a user-level reporting function.
+ *
+ * The return data consists of an array of LockInstanceData objects,
+ * which are a lightly abstracted version of the PROCLOCK data structures,
+ * i.e. there is one entry for each unique lock and interested PGPROC.
+ * It is the caller's responsibility to match up related items (such as
+ * references to the same lockable object or PGPROC) if wanted.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+LockData *
+GetLockStatusData(void)
+{
+ LockData *data;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int els;
+ int el;
+ int i;
+
+ data = (LockData *) palloc(sizeof(LockData));
+
+ /* Guess how much space we'll need. */
+ els = MaxBackends;
+ el = 0;
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els);
+
+ /*
+ * First, we iterate through the per-backend fast-path arrays, locking
+ * them one at a time. This might produce an inconsistent picture of the
+ * system state, but taking all of those LWLocks at the same time seems
+ * impractical (in particular, note MAX_SIMUL_LWLOCKS). It shouldn't
+ * matter too much, because none of these locks can be involved in lock
+ * conflicts anyway - any lock that could conflict must be present in the main lock
+ * table. (For the same reason, we don't sweat about making leaderPid
+ * completely valid. We cannot safely dereference another backend's
+ * lockGroupLeader field without holding all lock partition locks, and
+ * it's not worth that.)
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; ++i)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f)
+ {
+ LockInstanceData *instance;
+ uint32 lockbits = FAST_PATH_GET_BITS(proc, f);
+
+ /* Skip unallocated slots. */
+ if (!lockbits)
+ continue;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId,
+ proc->fpRelId[f]);
+ instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET;
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+
+ /*
+ * Successfully taking fast path lock means there were no
+ * conflicting locks.
+ */
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ if (proc->fpVXIDLock)
+ {
+ VirtualTransactionId vxid;
+ LockInstanceData *instance;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ vxid.backendId = proc->backendId;
+ vxid.localTransactionId = proc->fpLocalTransactionId;
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid);
+ instance->holdMask = LOCKBIT_ON(ExclusiveLock);
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+
+ /*
+ * Next, acquire lock on the entire shared lock data structure. We do
+ * this so that, at least for locks in the primary lock table, the state
+ * will be self-consistent.
+ *
+ * Since this is a read-only operation, we take shared instead of
+ * exclusive lock. There's not a whole lot of point to this, because all
+ * the normal operations require exclusive lock, but it doesn't hurt
+ * anything either. It will at least allow two backends to do
+ * GetLockStatusData in parallel.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ data->nelements = el + hash_get_num_entries(LockMethodProcLockHash);
+ if (data->nelements > els)
+ {
+ els = data->nelements;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance = &data->locks[el];
+
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == proclock->tag.myLock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart);
+
+ el++;
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(el == data->nelements);
+
+ return data;
+}
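+
+/*
+ * Illustrative note: this function feeds the SQL-callable pg_lock_status()
+ * (together with the predicate lock data), which underlies the pg_locks
+ * view; rows with fastpath = true come from the per-backend arrays scanned
+ * above, while the rest come from the shared hash tables.
+ */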
+
+/*
+ * GetBlockerStatusData - Return a summary of the lock manager's state
+ * concerning locks that are blocking the specified PID or any member of
+ * the PID's lock group, for use in a user-level reporting function.
+ *
+ * For each PID within the lock group that is awaiting some heavyweight lock,
+ * the return data includes an array of LockInstanceData objects, which are
+ * the same data structure used by GetLockStatusData; but unlike that function,
+ * this one reports only the PROCLOCKs associated with the lock that that PID
+ * is blocked on. (Hence, all the locktags should be the same for any one
+ * blocked PID.) In addition, we return an array of the PIDs of those backends
+ * that are ahead of the blocked PID in the lock's wait queue. These can be
+ * compared with the PIDs in the LockInstanceData objects to determine which
+ * waiters are ahead of or behind the blocked PID in the queue.
+ *
+ * If blocked_pid isn't a valid backend PID or nothing in its lock group is
+ * waiting on any heavyweight lock, return empty arrays.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+BlockedProcsData *
+GetBlockerStatusData(int blocked_pid)
+{
+ BlockedProcsData *data;
+ PGPROC *proc;
+ int i;
+
+ data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData));
+
+ /*
+ * Guess how much space we'll need, and preallocate. Most of the time
+ * this will avoid needing to do repalloc while holding the LWLocks. (We
+ * assume, but check with an Assert, that MaxBackends is enough entries
+ * for the procs[] array; the other two could need enlargement, though.)
+ */
+ data->nprocs = data->nlocks = data->npids = 0;
+ data->maxprocs = data->maxlocks = data->maxpids = MaxBackends;
+ data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs);
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks);
+ data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids);
+
+ /*
+ * In order to search the ProcArray for blocked_pid and assume that that
+ * entry won't immediately disappear under us, we must hold ProcArrayLock.
+ * In addition, to examine the lock grouping fields of any other backend,
+ * we must hold all the hash partition locks. (Only one of those locks is
+ * actually relevant for any one lock group, but we can't know which one
+ * ahead of time.) It's fairly annoying to hold all those locks
+ * throughout this, but it's no worse than GetLockStatusData(), and it
+ * does have the advantage that we're guaranteed to return a
+ * self-consistent instantaneous state.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ proc = BackendPidGetProcWithLock(blocked_pid);
+
+ /* Nothing to do if it's gone */
+ if (proc != NULL)
+ {
+ /*
+ * Acquire lock on the entire shared lock data structure. See notes
+ * in GetLockStatusData().
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Easy case, proc is not a lock group member */
+ GetSingleProcBlockerStatusData(proc, data);
+ }
+ else
+ {
+ /* Examine all procs in proc's lock group */
+ dlist_iter iter;
+
+ dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+ GetSingleProcBlockerStatusData(memberProc, data);
+ }
+ }
+
+ /*
+ * And release locks. See notes in GetLockStatusData().
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(data->nprocs <= data->maxprocs);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return data;
+}
+
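+/*
+ * Editorial sketch (not part of the original patch): one way a caller could
+ * walk the BlockedProcsData returned by GetBlockerStatusData().  The real
+ * consumer is pg_blocking_pids() in lockfuncs.c, which does considerably
+ * more; this only shows how each per-proc slice indexes into the flat
+ * locks[] and waiter_pids[] arrays.
+ *
+ *	BlockedProcsData *data = GetBlockerStatusData(blocked_pid);
+ *
+ *	for (int p = 0; p < data->nprocs; p++)
+ *	{
+ *		BlockedProcData *bproc = &data->procs[p];
+ *		LockInstanceData *locks = &data->locks[bproc->first_lock];
+ *		int		   *waiters = &data->waiter_pids[bproc->first_waiter];
+ *
+ *		for (int l = 0; l < bproc->num_locks; l++)
+ *			elog(DEBUG1, "pid %d sees pid %d on the contended lock",
+ *				 bproc->pid, locks[l].pid);
+ *		for (int w = 0; w < bproc->num_waiters; w++)
+ *			elog(DEBUG1, "pid %d is queued ahead of pid %d",
+ *				 waiters[w], bproc->pid);
+ *	}
+ */
+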
+/* Accumulate data about one possibly-blocked proc for GetBlockerStatusData */
+static void
+GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data)
+{
+ LOCK *theLock = blocked_proc->waitLock;
+ BlockedProcData *bproc;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ PROC_QUEUE *waitQueue;
+ PGPROC *proc;
+ int queue_size;
+ int i;
+
+ /* Nothing to do if this proc is not blocked */
+ if (theLock == NULL)
+ return;
+
+ /* Set up a procs[] element */
+ bproc = &data->procs[data->nprocs++];
+ bproc->pid = blocked_proc->pid;
+ bproc->first_lock = data->nlocks;
+ bproc->first_waiter = data->npids;
+
+ /*
+ * We may ignore the proc's fast-path arrays, since nothing in those could
+ * be related to a contended lock.
+ */
+
+ /* Collect all PROCLOCKs associated with theLock */
+ procLocks = &(theLock->procLocks);
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+ while (proclock)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance;
+
+ if (data->nlocks >= data->maxlocks)
+ {
+ data->maxlocks += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks);
+ }
+
+ instance = &data->locks[data->nlocks];
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == lock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ data->nlocks++;
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */
+ waitQueue = &(theLock->waitProcs);
+ queue_size = waitQueue->size;
+
+ if (queue_size > data->maxpids - data->npids)
+ {
+ data->maxpids = Max(data->maxpids + MaxBackends,
+ data->npids + queue_size);
+ data->waiter_pids = (int *) repalloc(data->waiter_pids,
+ sizeof(int) * data->maxpids);
+ }
+
+ /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < queue_size; i++)
+ {
+ if (proc == blocked_proc)
+ break;
+ data->waiter_pids[data->npids++] = proc->pid;
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ bproc->num_locks = data->nlocks - bproc->first_lock;
+ bproc->num_waiters = data->npids - bproc->first_waiter;
+}
+
+/*
+ * Returns a list of currently held AccessExclusiveLocks, for use by
+ * LogStandbySnapshot(). The result is a palloc'd array,
+ * with the number of elements returned into *nlocks.
+ *
+ * XXX This currently takes a lock on all partitions of the lock table,
+ * but it's possible to do better. By reference counting locks and storing
+ * the value in the ProcArray entry for each backend we could tell if any
+ * locks need recording without having to acquire the partition locks and
+ * scan the lock table. Whether that's worth the additional overhead
+ * is pretty dubious though.
+ */
+xl_standby_lock *
+GetRunningTransactionLocks(int *nlocks)
+{
+ xl_standby_lock *accessExclusiveLocks;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int i;
+ int index;
+ int els;
+
+ /*
+ * Acquire lock on the entire shared lock data structure.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ els = hash_get_num_entries(LockMethodProcLockHash);
+
+ /*
+ * Allocating enough space for all locks in the lock table is overkill,
+ * but it's more convenient and faster than having to enlarge the array.
+ */
+ accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock));
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ /*
+ * If lock is a currently granted AccessExclusiveLock then it will have
+ * just one proclock holder, so locks are never accessed twice in this
+ * particular case. Don't copy this code for use elsewhere because in the
+ * general case this will give you duplicate locks when looking at
+ * non-exclusive lock types.
+ */
+ index = 0;
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ /* make sure this definition matches the one used in LockAcquire */
+ if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) &&
+ proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ TransactionId xid = proc->xid;
+
+ /*
+			 * Don't record locks for transactions if we know they have
+			 * already issued their WAL record for commit but have not yet
+			 * released the lock. It is still possible that we see locks held
+			 * by already-complete transactions, if they haven't yet zeroed
+			 * their xids.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ accessExclusiveLocks[index].xid = xid;
+ accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1;
+ accessExclusiveLocks[index].relOid = lock->tag.locktag_field2;
+
+ index++;
+ }
+ }
+
+ Assert(index <= els);
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ *nlocks = index;
+ return accessExclusiveLocks;
+}
+
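+/*
+ * Editorial sketch: the intended consumer of GetRunningTransactionLocks()
+ * is LogStandbySnapshot() in standby.c, which does roughly the following
+ * (exact details may differ; this only illustrates the contract):
+ *
+ *	int			nlocks;
+ *	xl_standby_lock *locks = GetRunningTransactionLocks(&nlocks);
+ *
+ *	if (nlocks > 0)
+ *		LogAccessExclusiveLocks(nlocks, locks);
+ *	pfree(locks);
+ */
+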
+/* Provide the textual name of any lock mode */
+const char *
+GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
+{
+ Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods));
+ Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes);
+ return LockMethods[lockmethodid]->lockModeNames[mode];
+}
+
+#ifdef LOCK_DEBUG
+/*
+ * Dump all locks in the given proc's myProcLocks lists.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpLocks(PGPROC *proc)
+{
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ LOCK *lock;
+ int i;
+
+ if (proc == NULL)
+ return;
+
+ if (proc->waitLock)
+ LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
+
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ {
+ procLocks = &(proc->myProcLocks[i]);
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, procLink));
+
+ while (proclock)
+ {
+ Assert(proclock->tag.myProc == proc);
+
+ lock = proclock->tag.myLock;
+
+ PROCLOCK_PRINT("DumpLocks", proclock);
+ LOCK_PRINT("DumpLocks", lock, 0);
+
+ proclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &proclock->procLink,
+ offsetof(PROCLOCK, procLink));
+ }
+ }
+}
+
+/*
+ * Dump all lmgr locks.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpAllLocks(void)
+{
+ PGPROC *proc;
+ PROCLOCK *proclock;
+ LOCK *lock;
+ HASH_SEQ_STATUS status;
+
+ proc = MyProc;
+
+ if (proc && proc->waitLock)
+ LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
+
+ hash_seq_init(&status, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+ lock = proclock->tag.myLock;
+ if (lock)
+ LOCK_PRINT("DumpAllLocks", lock, 0);
+ else
+ elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+ }
+}
+#endif /* LOCK_DEBUG */
+
+/*
+ * LOCK 2PC resource manager's routines
+ */
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared.
+ *
+ * Because this function is run at db startup, re-acquiring the locks should
+ * never conflict with running transactions because there are none. We
+ * assume that the lock state represented by the stored 2PC files is legal.
+ *
+ * When switching from Hot Standby mode to normal operation, the locks will
+ * already be held by the startup process. The locks are acquired for the new
+ * procs without checking for conflicts, so we don't get a conflict between the
+ * startup process and the dummy procs, even though we will momentarily have
+ * a situation where two procs are holding the same AccessExclusiveLock,
+ * which isn't normally possible because of the conflict. If we're in standby
+ * mode, but a recovery snapshot hasn't been established yet, it's possible
+ * that some but not all of the locks are already held by the startup process.
+ *
+ * This approach is simple, but also a bit dangerous, because if there isn't
+ * enough shared memory to acquire the locks, an error will be thrown, which
+ * is promoted to FATAL and recovery will abort, bringing down postmaster.
+ * A safer approach would be to transfer the locks like we do in
+ * AtPrepare_Locks, but then again, in hot standby mode it's possible for
+ * read-only backends to use up all the shared lock memory anyway, so that
+ * replaying the WAL record that needs to acquire a lock will throw an error
+ * and PANIC anyway.
+ */
+void
+lock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, false);
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ bool found;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ int partition;
+ LWLock *partitionLock;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ hashcode = LockTagHashCode(locktag);
+ partition = LockHashPartition(hashcode);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (void *) locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ {
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ SHMQueueInit(&(lock->procLocks));
+ ProcQueueInit(&(lock->waitProcs));
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("lock_twophase_recover: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("lock_twophase_recover: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ (void *) &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(SHMQueueEmpty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ (void *) &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ Assert(proc->lockGroupLeader == NULL);
+ proclock->groupLeader = proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink);
+ SHMQueueInsertBefore(&(proc->myProcLocks[partition]),
+ &proclock->procLink);
+ PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("lock_twophase_recover: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ /*
+ * We ignore any possible conflicts and just grant ourselves the lock. Not
+ * only because we don't bother, but also to avoid deadlocks when
+ * switching from standby to normal mode. See function comment.
+ */
+ GrantLock(lock, proclock, lockmode);
+
+ /*
+ * Bump strong lock count, to make sure any fast-path lock requests won't
+ * be granted without consulting the primary lock table.
+ */
+ if (ConflictsWithRelationFastPath(&lock->tag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ LWLockRelease(partitionLock);
+}
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared, when
+ * starting up into hot standby mode.
+ */
+void
+lock_twophase_standby_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ if (lockmode == AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION)
+ {
+ StandbyAcquireAccessExclusiveLock(xid,
+ locktag->locktag_field1 /* dboid */ ,
+ locktag->locktag_field2 /* reloid */ );
+ }
+}
+
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Find and release the lock indicated by the 2PC record.
+ */
+void
+lock_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, true);
+ LOCKTAG *locktag;
+ LOCKMETHODID lockmethodid;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ LockRefindAndRelease(lockMethodTable, proc, locktag, rec->lockmode, true);
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * This is actually just the same as the COMMIT case.
+ */
+void
+lock_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ lock_twophase_postcommit(xid, info, recdata, len);
+}
+
+/*
+ * VirtualXactLockTableInsert
+ *
+ * Take vxid lock via the fast-path. There can't be any pre-existing
+ * lockers, as we haven't advertised this vxid via the ProcArray yet.
+ *
+ * Since MyProc->fpLocalTransactionId will normally contain the same data
+ * as MyProc->lxid, you might wonder if we really need both. The
+ * difference is that MyProc->lxid is set and cleared unlocked, and
+ * examined by procarray.c, while fpLocalTransactionId is protected by
+ * fpInfoLock and is used only by the locking subsystem. Doing it this
+ * way makes it easier to verify that there are no funny race conditions.
+ *
+ * We don't bother recording this lock in the local lock table, since it's
+ * only ever released at the end of a transaction. Instead,
+ * LockReleaseAll() calls VirtualXactLockTableCleanup().
+ */
+void
+VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ Assert(MyProc->backendId == vxid.backendId);
+ Assert(MyProc->fpLocalTransactionId == InvalidLocalTransactionId);
+ Assert(MyProc->fpVXIDLock == false);
+
+ MyProc->fpVXIDLock = true;
+ MyProc->fpLocalTransactionId = vxid.localTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+}
+
+/*
+ * VirtualXactLockTableCleanup
+ *
+ * Check whether a VXID lock has been materialized; if so, release it,
+ * unblocking waiters.
+ */
+void
+VirtualXactLockTableCleanup(void)
+{
+ bool fastpath;
+ LocalTransactionId lxid;
+
+ Assert(MyProc->backendId != InvalidBackendId);
+
+ /*
+ * Clean up shared memory state.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ fastpath = MyProc->fpVXIDLock;
+ lxid = MyProc->fpLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * If fpVXIDLock has been cleared without touching fpLocalTransactionId,
+ * that means someone transferred the lock to the main lock table.
+ */
+ if (!fastpath && LocalTransactionIdIsValid(lxid))
+ {
+ VirtualTransactionId vxid;
+ LOCKTAG locktag;
+
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = lxid;
+ SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid);
+
+ LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc,
+ &locktag, ExclusiveLock, false);
+ }
+}
+
+/*
+ * XactLockForVirtualXact
+ *
+ * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid,
+ * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid). Unlike those
+ * functions, it assumes "xid" is never a subtransaction and that "xid" is
+ * prepared, committed, or aborted.
+ *
+ * If !TransactionIdIsValid(xid), this locks every prepared XID having been
+ * known as "vxid" before its PREPARE TRANSACTION.
+ */
+static bool
+XactLockForVirtualXact(VirtualTransactionId vxid,
+ TransactionId xid, bool wait)
+{
+ bool more = false;
+
+ /* There is no point to wait for 2PCs if you have no 2PCs. */
+ if (max_prepared_xacts == 0)
+ return true;
+
+ do
+ {
+ LockAcquireResult lar;
+ LOCKTAG tag;
+
+ /* Clear state from previous iterations. */
+ if (more)
+ {
+ xid = InvalidTransactionId;
+ more = false;
+ }
+
+ /* If we have no xid, try to find one. */
+ if (!TransactionIdIsValid(xid))
+ xid = TwoPhaseGetXidByVirtualXID(vxid, &more);
+ if (!TransactionIdIsValid(xid))
+ {
+ Assert(!more);
+ return true;
+ }
+
+ /* Check or wait for XID completion. */
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+ lar = LockAcquire(&tag, ShareLock, false, !wait);
+ if (lar == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+ LockRelease(&tag, ShareLock, false);
+ } while (more);
+
+ return true;
+}
+
+/*
+ * VirtualXactLock
+ *
+ * If wait = true, wait as long as the given VXID or any XID acquired by the
+ * same transaction is still running. Then, return true.
+ *
+ * If wait = false, just check whether that VXID or one of those XIDs is still
+ * running, and return true or false.
+ */
+bool
+VirtualXactLock(VirtualTransactionId vxid, bool wait)
+{
+ LOCKTAG tag;
+ PGPROC *proc;
+ TransactionId xid = InvalidTransactionId;
+
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ if (VirtualTransactionIdIsRecoveredPreparedXact(vxid))
+ /* no vxid lock; localTransactionId is a normal, locked XID */
+ return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait);
+
+ SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+ /*
+ * If a lock table entry must be made, this is the PGPROC on whose behalf
+ * it must be done. Note that the transaction might end or the PGPROC
+ * might be reassigned to a new backend before we get around to examining
+ * it, but it doesn't matter. If we find upon examination that the
+ * relevant lxid is no longer running here, that's enough to prove that
+ * it's no longer running anywhere.
+ */
+ proc = BackendIdGetProc(vxid.backendId);
+ if (proc == NULL)
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+
+ /*
+ * We must acquire this lock before checking the backendId and lxid
+ * against the ones we're waiting for. The target backend will only set
+ * or clear lxid while holding this lock.
+ */
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ if (proc->backendId != vxid.backendId
+ || proc->fpLocalTransactionId != vxid.localTransactionId)
+ {
+ /* VXID ended */
+ LWLockRelease(&proc->fpInfoLock);
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+ }
+
+ /*
+ * If we aren't asked to wait, there's no need to set up a lock table
+ * entry. The transaction is still in progress, so just return false.
+ */
+ if (!wait)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+
+ /*
+ * OK, we're going to need to sleep on the VXID. But first, we must set
+ * up the primary lock table entry, if needed (ie, convert the proc's
+ * fast-path lock on its VXID to a regular lock).
+ */
+ if (proc->fpVXIDLock)
+ {
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+
+ hashcode = LockTagHashCode(&tag);
+
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc,
+ &tag, hashcode, ExclusiveLock);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_locks_per_transaction.")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, ExclusiveLock);
+
+ LWLockRelease(partitionLock);
+
+ proc->fpVXIDLock = false;
+ }
+
+ /*
+ * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID()
+ * search. The proc might have assigned this XID but not yet locked it,
+ * in which case the proc will lock this XID before releasing the VXID.
+ * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(),
+ * so we won't save an XID of a different VXID. It doesn't matter whether
+ * we save this before or after setting up the primary lock table entry.
+ */
+ xid = proc->xid;
+
+ /* Done with proc->fpLockBits */
+ LWLockRelease(&proc->fpInfoLock);
+
+ /* Time to wait. */
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+ return XactLockForVirtualXact(vxid, xid, wait);
+}
+
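+/*
+ * Editorial sketch (illustrative, not in the original patch): a typical
+ * caller of VirtualXactLock() is DDL that must wait out conflicting
+ * transactions.  GetLockConflicts() returns a palloc'd array of
+ * VirtualTransactionIds terminated by an invalid entry, and waiting for
+ * each of them looks roughly like this (this mirrors what
+ * WaitForLockersMultiple() in lmgr.c does, minus progress reporting):
+ *
+ *	VirtualTransactionId *vxids;
+ *
+ *	vxids = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+ *	while (VirtualTransactionIdIsValid(*vxids))
+ *	{
+ *		VirtualXactLock(*vxids, true);
+ *		vxids++;
+ *	}
+ */
+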
+/*
+ * LockWaiterCount
+ *
+ * Find the number of lock requesters on this locktag
+ */
+int
+LockWaiterCount(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LOCK *lock;
+ bool found;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int waiters = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ (const void *) locktag,
+ hashcode,
+ HASH_FIND,
+ &found);
+ if (found)
+ {
+ Assert(lock != NULL);
+ waiters = lock->nRequested;
+ }
+ LWLockRelease(partitionLock);
+
+ return waiters;
+}
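+
+/*
+ * Editorial usage sketch (hypothetical): a typical caller builds a locktag
+ * and asks how many backends are interested in it; the relation-extension
+ * code does roughly
+ *
+ *	LOCKTAG		tag;
+ *
+ *	SET_LOCKTAG_RELATION_EXTEND(tag,
+ *								rel->rd_lockInfo.lockRelId.dbId,
+ *								rel->rd_lockInfo.lockRelId.relId);
+ *	nwaiters = LockWaiterCount(&tag);
+ *
+ * Note that the value returned is lock->nRequested, so it counts granted
+ * holders as well as waiters.
+ */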
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..07eb6f6
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1977 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ * Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures. Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object). There are few other frammishes. User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value. The variable is initially not set
+ * when the lock is acquired with LWLockAcquire, i.e. it remains set to the
+ * value it was set to when the lock was released last, and can be updated
+ * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar
+ * waits for the variable to be updated, or until the lock is free. When
+ * releasing the lock with LWLockReleaseClearVar() the value can be set to an
+ * appropriate value for a free lock. The meaning of the variable is up to
+ * the caller, the lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently. Often we were spinning in the (obviously exclusive) spinlock,
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use atomic
+ * operations to acquire the lock. That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable. For exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement. If the
+ * new value is zero (we get that atomically), we know we can/have to release
+ * waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately by the time we have finished
+ * queuing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * To mitigate those races we use a two phased attempt at locking:
+ * Phase 1: Try to do it atomically, if we succeed, nice
+ * Phase 2: Add ourselves to the waitqueue of the lock
+ * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem from above, as nobody can release too
+ * quickly, before we're queued, since after Phase 2 we're already queued.
+ * -------------------------------------------------------------------------
+ */
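+
+/*
+ * Editorial sketch of the phased protocol described above, in pseudo-C
+ * (the real logic lives in LWLockAcquire() below; this is illustrative
+ * only):
+ *
+ *	for (;;)
+ *	{
+ *		if (!LWLockAttemptLock(lock, mode))
+ *			break;						-- Phase 1: got it outright
+ *		LWLockQueueSelf(lock, mode);	-- Phase 2: enqueue
+ *		if (!LWLockAttemptLock(lock, mode))
+ *		{
+ *			LWLockDequeueSelf(lock);	-- Phase 3: got it after all
+ *			break;
+ *		}
+ *		... sleep on MyProc->sem ...	-- Phase 4: wait, then retry
+ *	}
+ */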
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30)
+#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29)
+#define LW_FLAG_LOCKED ((uint32) 1 << 28)
+
+#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24)
+#define LW_VAL_SHARED 1
+
+#define LW_LOCK_MASK ((uint32) ((1 << 25)-1))
+/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */
+#define LW_SHARED_MASK ((uint32) ((1 << 24)-1))
+
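+/*
+ * Editorial note: a worked example of the state-word layout defined above
+ * (values assume the bit assignments given here).  Three shared holders
+ * make the low bits count to 3, so (state & LW_SHARED_MASK) == 3 and the
+ * exclusive bit is clear; an exclusive holder instead sets bit 24, so
+ * (state & LW_VAL_EXCLUSIVE) != 0:
+ *
+ *	three shared holders:			state = 0x00000003
+ *	one exclusive holder:			state = 0x01000000
+ *	exclusive holder plus waiters:	state = 0x41000000 (LW_FLAG_HAS_WAITERS set)
+ */
+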
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche. The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche. The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
+ */
+extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */
+
+static const char *const BuiltinTrancheNames[] = {
+ /* LWTRANCHE_XACT_BUFFER: */
+ "XactBuffer",
+ /* LWTRANCHE_COMMITTS_BUFFER: */
+ "CommitTSBuffer",
+ /* LWTRANCHE_SUBTRANS_BUFFER: */
+ "SubtransBuffer",
+ /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
+ "MultiXactOffsetBuffer",
+ /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
+ "MultiXactMemberBuffer",
+ /* LWTRANCHE_NOTIFY_BUFFER: */
+ "NotifyBuffer",
+ /* LWTRANCHE_SERIAL_BUFFER: */
+ "SerialBuffer",
+ /* LWTRANCHE_WAL_INSERT: */
+ "WALInsert",
+ /* LWTRANCHE_BUFFER_CONTENT: */
+ "BufferContent",
+ /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
+ "ReplicationOriginState",
+ /* LWTRANCHE_REPLICATION_SLOT_IO: */
+ "ReplicationSlotIO",
+ /* LWTRANCHE_LOCK_FASTPATH: */
+ "LockFastPath",
+ /* LWTRANCHE_BUFFER_MAPPING: */
+ "BufferMapping",
+ /* LWTRANCHE_LOCK_MANAGER: */
+ "LockManager",
+ /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */
+ "PredicateLockManager",
+ /* LWTRANCHE_PARALLEL_HASH_JOIN: */
+ "ParallelHashJoin",
+ /* LWTRANCHE_PARALLEL_QUERY_DSA: */
+ "ParallelQueryDSA",
+ /* LWTRANCHE_PER_SESSION_DSA: */
+ "PerSessionDSA",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */
+ "PerSessionRecordType",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
+ "PerSessionRecordTypmod",
+ /* LWTRANCHE_SHARED_TUPLESTORE: */
+ "SharedTupleStore",
+ /* LWTRANCHE_SHARED_TIDBITMAP: */
+ "SharedTidBitmap",
+ /* LWTRANCHE_PARALLEL_APPEND: */
+ "ParallelAppend",
+ /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
+ "PerXactPredicateList"
+};
+
+StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
+ LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS,
+ "missing entries in BuiltinTrancheNames[]");
+
+/*
+ * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and
+ * stores the names of all dynamically-created tranches known to the current
+ * process. Any unused entries in the array will contain NULL.
+ */
+static const char **LWLockTrancheNames = NULL;
+static int LWLockTrancheNamesAllocated = 0;
+
+/*
+ * This points to the main array of LWLocks in shared memory. Backends inherit
+ * the pointer by fork from the postmaster (except in the EXEC_BACKEND case,
+ * where we have special measures to pass it down).
+ */
+LWLockPadded *MainLWLockArray = NULL;
+
+/*
+ * We use this structure to keep track of locked LWLocks for release
+ * during error recovery. Normally, only a few will be held at once, but
+ * occasionally the number can be much higher; for example, the pg_buffercache
+ * extension locks all buffer partitions simultaneously.
+ */
+#define MAX_SIMUL_LWLOCKS 200
+
+/* struct representing the LWLocks we're holding */
+typedef struct LWLockHandle
+{
+ LWLock *lock;
+ LWLockMode mode;
+} LWLockHandle;
+
+static int num_held_lwlocks = 0;
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
+
+/* struct representing the LWLock tranche request for named tranche */
+typedef struct NamedLWLockTrancheRequest
+{
+ char tranche_name[NAMEDATALEN];
+ int num_lwlocks;
+} NamedLWLockTrancheRequest;
+
+static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL;
+static int NamedLWLockTrancheRequestsAllocated = 0;
+
+/*
+ * NamedLWLockTrancheRequests is both the valid length of the request array,
+ * and the length of the shared-memory NamedLWLockTrancheArray later on.
+ * This variable and NamedLWLockTrancheArray are non-static so that
+ * postmaster.c can copy them to child processes in EXEC_BACKEND builds.
+ */
+int NamedLWLockTrancheRequests = 0;
+
+/* points to data in shared memory: */
+NamedLWLockTranche *NamedLWLockTrancheArray = NULL;
+
+static bool lock_named_request_allowed = true;
+
+static void InitializeLWLocks(void);
+static inline void LWLockReportWaitStart(LWLock *lock);
+static inline void LWLockReportWaitEnd(void);
+static const char *GetLWTrancheName(uint16 trancheId);
+
+#define T_NAME(lock) \
+ GetLWTrancheName((lock)->tranche)
+
+#ifdef LWLOCK_STATS
+typedef struct lwlock_stats_key
+{
+ int tranche;
+ void *instance;
+} lwlock_stats_key;
+
+typedef struct lwlock_stats
+{
+ lwlock_stats_key key;
+ int sh_acquire_count;
+ int ex_acquire_count;
+ int block_count;
+ int dequeue_self_count;
+ int spin_delay_count;
+} lwlock_stats;
+
+static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
+#endif
+
+#ifdef LOCK_DEBUG
+bool Trace_lwlocks = false;
+
+inline static void
+PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ uint32 state = pg_atomic_read_u32(&lock->state);
+
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
+ MyProcPid,
+ where, T_NAME(lock), lock,
+ (state & LW_VAL_EXCLUSIVE) != 0,
+ state & LW_SHARED_MASK,
+ (state & LW_FLAG_HAS_WAITERS) != 0,
+ pg_atomic_read_u32(&lock->nwaiters),
+ (state & LW_FLAG_RELEASE_OK) != 0)));
+ }
+}
+
+inline static void
+LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%s(%s %p): %s", where,
+ T_NAME(lock), lock, msg)));
+ }
+}
+
+#else /* not LOCK_DEBUG */
+#define PRINT_LWDEBUG(a,b,c) ((void)0)
+#define LOG_LWDEBUG(a,b,c) ((void)0)
+#endif /* LOCK_DEBUG */
+
+#ifdef LWLOCK_STATS
+
+static void init_lwlock_stats(void);
+static void print_lwlock_stats(int code, Datum arg);
+static lwlock_stats * get_lwlock_stats_entry(LWLock *lock);
+
+static void
+init_lwlock_stats(void)
+{
+ HASHCTL ctl;
+ static MemoryContext lwlock_stats_cxt = NULL;
+ static bool exit_registered = false;
+
+ if (lwlock_stats_cxt != NULL)
+ MemoryContextDelete(lwlock_stats_cxt);
+
+ /*
+ * The LWLock stats will be updated within a critical section, which
+ * requires allocating new hash entries. Allocations within a critical
+ * section are normally not allowed because running out of memory would
+ * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally
+ * turned on in production, so that's an acceptable risk. The hash entries
+ * are small, so the risk of running out of memory is minimal in practice.
+ */
+ lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext,
+ "LWLock stats",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true);
+
+ ctl.keysize = sizeof(lwlock_stats_key);
+ ctl.entrysize = sizeof(lwlock_stats);
+ ctl.hcxt = lwlock_stats_cxt;
+ lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ if (!exit_registered)
+ {
+ on_shmem_exit(print_lwlock_stats, 0);
+ exit_registered = true;
+ }
+}
+
+static void
+print_lwlock_stats(int code, Datum arg)
+{
+ HASH_SEQ_STATUS scan;
+ lwlock_stats *lwstats;
+
+ hash_seq_init(&scan, lwlock_stats_htab);
+
+ /* Grab an LWLock to keep different backends from mixing reports */
+ LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE);
+
+ while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
+ {
+ fprintf(stderr,
+ "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+ MyProcPid, GetLWTrancheName(lwstats->key.tranche),
+ lwstats->key.instance, lwstats->sh_acquire_count,
+ lwstats->ex_acquire_count, lwstats->block_count,
+ lwstats->spin_delay_count, lwstats->dequeue_self_count);
+ }
+
+ LWLockRelease(&MainLWLockArray[0].lock);
+}
+
+static lwlock_stats *
+get_lwlock_stats_entry(LWLock *lock)
+{
+ lwlock_stats_key key;
+ lwlock_stats *lwstats;
+ bool found;
+
+ /*
+ * During shared memory initialization, the hash table doesn't exist yet.
+ * Stats of that phase aren't very interesting, so just collect operations
+ * on all locks in a single dummy entry.
+ */
+ if (lwlock_stats_htab == NULL)
+ return &lwlock_stats_dummy;
+
+ /* Fetch or create the entry. */
+ MemSet(&key, 0, sizeof(key));
+ key.tranche = lock->tranche;
+ key.instance = lock;
+ lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ lwstats->sh_acquire_count = 0;
+ lwstats->ex_acquire_count = 0;
+ lwstats->block_count = 0;
+ lwstats->dequeue_self_count = 0;
+ lwstats->spin_delay_count = 0;
+ }
+ return lwstats;
+}
+#endif /* LWLOCK_STATS */
+
+
+/*
+ * Compute number of LWLocks required by named tranches. These will be
+ * allocated in the main array.
+ */
+static int
+NumLWLocksForNamedTranches(void)
+{
+ int numLocks = 0;
+ int i;
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+
+ return numLocks;
+}
+
+/*
+ * Compute shmem space needed for LWLocks and named tranches.
+ */
+Size
+LWLockShmemSize(void)
+{
+ Size size;
+ int i;
+ int numLocks = NUM_FIXED_LWLOCKS;
+
+ /* Calculate total number of locks needed in the main array. */
+ numLocks += NumLWLocksForNamedTranches();
+
+ /* Space for the LWLock array. */
+ size = mul_size(numLocks, sizeof(LWLockPadded));
+
+ /* Space for dynamic allocation counter, plus room for alignment. */
+ size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE);
+
+ /* space for named tranches. */
+ size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche)));
+
+ /* space for name of each tranche. */
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1);
+
+ /* Disallow adding any more named tranches. */
+ lock_named_request_allowed = false;
+
+ return size;
+}
+
+/*
+ * Allocate shmem space for the main LWLock array and all tranches and
+ * initialize it. We also register extension LWLock tranches here.
+ */
+void
+CreateLWLocks(void)
+{
+ StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
+ "MAX_BACKENDS too big for lwlock.c");
+
+ StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+ "Miscalculated LWLock padding");
+
+ if (!IsUnderPostmaster)
+ {
+ Size spaceLocks = LWLockShmemSize();
+ int *LWLockCounter;
+ char *ptr;
+
+ /* Allocate space */
+ ptr = (char *) ShmemAlloc(spaceLocks);
+
+ /* Leave room for dynamic allocation of tranches */
+ ptr += sizeof(int);
+
+ /* Ensure desired alignment of LWLock array */
+ ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE;
+
+ MainLWLockArray = (LWLockPadded *) ptr;
+
+ /*
+ * Initialize the dynamic-allocation counter for tranches, which is
+ * stored just before the first LWLock.
+ */
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* Initialize all LWLocks */
+ InitializeLWLocks();
+ }
+
+ /* Register named extension LWLock tranches in the current process. */
+ for (int i = 0; i < NamedLWLockTrancheRequests; i++)
+ LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId,
+ NamedLWLockTrancheArray[i].trancheName);
+}
+
+/*
+ * Initialize LWLocks that are fixed and those belonging to named tranches.
+ */
+static void
+InitializeLWLocks(void)
+{
+ int numNamedLocks = NumLWLocksForNamedTranches();
+ int id;
+ int i;
+ int j;
+ LWLockPadded *lock;
+
+ /* Initialize all individual LWLocks in main array */
+ for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++)
+ LWLockInitialize(&lock->lock, id);
+
+ /* Initialize buffer mapping LWLocks in main array */
+ lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING);
+
+ /* Initialize lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER);
+
+ /* Initialize predicate lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER);
+
+ /*
+ * Copy the info about any named tranches into shared memory (so that
+ * other processes can see it), and initialize the requested LWLocks.
+ */
+ if (NamedLWLockTrancheRequests > 0)
+ {
+ char *trancheNames;
+
+ NamedLWLockTrancheArray = (NamedLWLockTranche *)
+ &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks];
+
+ trancheNames = (char *) NamedLWLockTrancheArray +
+ (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche));
+ lock = &MainLWLockArray[NUM_FIXED_LWLOCKS];
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ NamedLWLockTrancheRequest *request;
+ NamedLWLockTranche *tranche;
+ char *name;
+
+ request = &NamedLWLockTrancheRequestArray[i];
+ tranche = &NamedLWLockTrancheArray[i];
+
+ name = trancheNames;
+ trancheNames += strlen(request->tranche_name) + 1;
+ strcpy(name, request->tranche_name);
+ tranche->trancheId = LWLockNewTrancheId();
+ tranche->trancheName = name;
+
+ for (j = 0; j < request->num_lwlocks; j++, lock++)
+ LWLockInitialize(&lock->lock, tranche->trancheId);
+ }
+ }
+}
+
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+ init_lwlock_stats();
+#endif
+}
+
+/*
+ * GetNamedLWLockTranche - returns the base address of the LWLocks belonging
+ * to the specified tranche.
+ *
+ * The caller retrieves the requested number of LWLocks starting from the
+ * base lock address returned by this function. This can be used for
+ * tranches that were requested via the RequestNamedLWLockTranche() API.
+ */
+LWLockPadded *
+GetNamedLWLockTranche(const char *tranche_name)
+{
+ int lock_pos;
+ int i;
+
+ /*
+ * Obtain the position of base address of LWLock belonging to requested
+ * tranche_name in MainLWLockArray. LWLocks for named tranches are placed
+ * in MainLWLockArray after fixed locks.
+ */
+ lock_pos = NUM_FIXED_LWLOCKS;
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name,
+ tranche_name) == 0)
+ return &MainLWLockArray[lock_pos];
+
+ lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+ }
+
+ elog(ERROR, "requested tranche is not registered");
+
+ /* just to keep compiler quiet */
+ return NULL;
+}
+
+/*
+ * Allocate a new tranche ID.
+ */
+int
+LWLockNewTrancheId(void)
+{
+ int result;
+ int *LWLockCounter;
+
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ SpinLockAcquire(ShmemLock);
+ result = (*LWLockCounter)++;
+ SpinLockRelease(ShmemLock);
+
+ return result;
+}
+
+/*
+ * Register a dynamic tranche name in the lookup table of the current process.
+ *
+ * This routine will save a pointer to the tranche name passed as an argument,
+ * so the name should be allocated in a backend-lifetime context
+ * (shared memory, TopMemoryContext, static constant, or similar).
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+LWLockRegisterTranche(int tranche_id, const char *tranche_name)
+{
+ /* This should only be called for user-defined tranches. */
+ if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED)
+ return;
+
+ /* Convert to array index. */
+ tranche_id -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* If necessary, create or enlarge array. */
+ if (tranche_id >= LWLockTrancheNamesAllocated)
+ {
+ int newalloc;
+
+ newalloc = Max(LWLockTrancheNamesAllocated, 8);
+ while (newalloc <= tranche_id)
+ newalloc *= 2;
+
+ if (LWLockTrancheNames == NULL)
+ LWLockTrancheNames = (const char **)
+ MemoryContextAllocZero(TopMemoryContext,
+ newalloc * sizeof(char *));
+ else
+ {
+ LWLockTrancheNames = (const char **)
+ repalloc(LWLockTrancheNames, newalloc * sizeof(char *));
+ memset(LWLockTrancheNames + LWLockTrancheNamesAllocated,
+ 0,
+ (newalloc - LWLockTrancheNamesAllocated) * sizeof(char *));
+ }
+ LWLockTrancheNamesAllocated = newalloc;
+ }
+
+ LWLockTrancheNames[tranche_id] = tranche_name;
+}
+
+/*
+ * RequestNamedLWLockTranche
+ * Request that extra LWLocks be allocated during postmaster
+ * startup.
+ *
+ * This is only useful for extensions if called from the _PG_init hook
+ * of a library that is loaded into the postmaster via
+ * shared_preload_libraries. Once shared memory has been allocated, calls
+ * will be ignored. (We could raise an error, but it seems better to make
+ * it a no-op, so that libraries containing such calls can be reloaded if
+ * needed.)
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
+{
+ NamedLWLockTrancheRequest *request;
+
+ if (IsUnderPostmaster || !lock_named_request_allowed)
+ return; /* too late */
+
+ if (NamedLWLockTrancheRequestArray == NULL)
+ {
+ NamedLWLockTrancheRequestsAllocated = 16;
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ MemoryContextAlloc(TopMemoryContext,
+ NamedLWLockTrancheRequestsAllocated
+ * sizeof(NamedLWLockTrancheRequest));
+ }
+
+ if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+ {
+ int i = NamedLWLockTrancheRequestsAllocated;
+
+ while (i <= NamedLWLockTrancheRequests)
+ i *= 2;
+
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ repalloc(NamedLWLockTrancheRequestArray,
+ i * sizeof(NamedLWLockTrancheRequest));
+ NamedLWLockTrancheRequestsAllocated = i;
+ }
+
+ request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
+ Assert(strlen(tranche_name) + 1 <= NAMEDATALEN);
+ strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);
+ request->num_lwlocks = num_lwlocks;
+ NamedLWLockTrancheRequests++;
+}
+
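+/*
+ * Editorial usage sketch (hypothetical extension code, not part of this
+ * file): an extension loaded via shared_preload_libraries would typically
+ * request its locks from _PG_init() and fetch them once shared memory
+ * exists, e.g.
+ *
+ *	void
+ *	_PG_init(void)
+ *	{
+ *		RequestNamedLWLockTranche("my_extension", 4);
+ *	}
+ *
+ *	... later, in the extension's shared-memory startup code ...
+ *
+ *	LWLockPadded *locks = GetNamedLWLockTranche("my_extension");
+ *
+ *	LWLockAcquire(&locks[0].lock, LW_EXCLUSIVE);
+ *	... touch shared state ...
+ *	LWLockRelease(&locks[0].lock);
+ *
+ * "my_extension" and the count of 4 are made-up values for illustration.
+ */
+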
+/*
+ * LWLockInitialize - initialize a new lwlock; it's initially unlocked
+ */
+void
+LWLockInitialize(LWLock *lock, int tranche_id)
+{
+ pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
+#ifdef LOCK_DEBUG
+ pg_atomic_init_u32(&lock->nwaiters, 0);
+#endif
+ lock->tranche = tranche_id;
+ proclist_init(&lock->waiters);
+}
+
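+/*
+ * Editorial sketch (hypothetical caller): code that places an LWLock inside
+ * its own shared memory area, rather than using a named tranche in the main
+ * array, would pair LWLockNewTrancheId()/LWLockRegisterTranche() with this
+ * function, roughly:
+ *
+ *	int		tranche_id = LWLockNewTrancheId();
+ *
+ *	LWLockRegisterTranche(tranche_id, "my_shared_area");
+ *	LWLockInitialize(&my_shared_struct->lock, tranche_id);
+ *
+ * Any other process that later acquires the lock should also register the
+ * tranche name, or its waits will be reported as "extension".
+ * "my_shared_area" and my_shared_struct are illustrative names only.
+ */
+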
+/*
+ * Report start of wait event for light-weight locks.
+ *
+ * This function will be used by all the light-weight lock calls that
+ * need to wait to acquire the lock. This function distinguishes the wait
+ * event based on tranche and lock id.
+ */
+static inline void
+LWLockReportWaitStart(LWLock *lock)
+{
+ pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche);
+}
+
+/*
+ * Report end of wait event for light-weight locks.
+ */
+static inline void
+LWLockReportWaitEnd(void)
+{
+ pgstat_report_wait_end();
+}
+
+/*
+ * Return the name of an LWLock tranche.
+ */
+static const char *
+GetLWTrancheName(uint16 trancheId)
+{
+ /* Individual LWLock? */
+ if (trancheId < NUM_INDIVIDUAL_LWLOCKS)
+ return IndividualLWLockNames[trancheId];
+
+ /* Built-in tranche? */
+ if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+ return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+ /*
+ * It's an extension tranche, so look in LWLockTrancheNames[]. However,
+ * it's possible that the tranche has never been registered in the current
+ * process, in which case give up and return "extension".
+ */
+ trancheId -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ if (trancheId >= LWLockTrancheNamesAllocated ||
+ LWLockTrancheNames[trancheId] == NULL)
+ return "extension";
+
+ return LWLockTrancheNames[trancheId];
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+ Assert(classId == PG_WAIT_LWLOCK);
+ /* The event IDs are just tranche numbers. */
+ return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * callers job.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+ uint32 old_state;
+
+ AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+ /*
+ * Read once outside the loop, later iterations will get the newer value
+ * via compare & exchange.
+ */
+ old_state = pg_atomic_read_u32(&lock->state);
+
+ /* loop until we've determined whether we could acquire the lock or not */
+ while (true)
+ {
+ uint32 desired_state;
+ bool lock_free;
+
+ desired_state = old_state;
+
+ if (mode == LW_EXCLUSIVE)
+ {
+ lock_free = (old_state & LW_LOCK_MASK) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_EXCLUSIVE;
+ }
+ else
+ {
+ lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_SHARED;
+ }
+
+ /*
+		 * Attempt to swap in the state we are expecting. If we didn't see
+		 * the lock as free, that's just the old value. If we saw it as free,
+		 * we'll attempt to mark it acquired. The reason that we always swap
+		 * in the value is that this doubles as a memory barrier. We could try
+		 * to be smarter and only swap in values if we saw the lock as free,
+		 * but benchmarks haven't shown that to be beneficial so far.
+ *
+ * Retry if the value changed since we last looked at it.
+ */
+ if (pg_atomic_compare_exchange_u32(&lock->state,
+ &old_state, desired_state))
+ {
+ if (lock_free)
+ {
+ /* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+ if (mode == LW_EXCLUSIVE)
+ lock->owner = MyProc;
+#endif
+ return false;
+ }
+ else
+ return true; /* somebody else has the lock */
+ }
+ }
+ pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * Time spent holding mutex should be short!
+ */
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+ uint32 old_state;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+ uint32 delays = 0;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ while (true)
+ {
+ /* always try once to acquire lock directly */
+ old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+ if (!(old_state & LW_FLAG_LOCKED))
+ break; /* got lock */
+
+ /* and then spin without atomic operations until lock is released */
+ {
+ SpinDelayStatus delayStatus;
+
+ init_local_spin_delay(&delayStatus);
+
+ while (old_state & LW_FLAG_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ old_state = pg_atomic_read_u32(&lock->state);
+ }
+#ifdef LWLOCK_STATS
+ delays += delayStatus.delays;
+#endif
+ finish_spin_delay(&delayStatus);
+ }
+
+ /*
+		 * Retry. Obviously, the lock might already have been re-acquired by
+		 * the time we attempt to get it again.
+ */
+ }
+
+#ifdef LWLOCK_STATS
+ lwstats->spin_delay_count += delays;
+#endif
+}
+
+/*
+ * Unlock the LWLock's wait list.
+ *
+ * Note that it can be more efficient to manipulate flags and release the
+ * locks in a single atomic operation.
+ */
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+ uint32 old_state PG_USED_FOR_ASSERTS_ONLY;
+
+ old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+ Assert(old_state & LW_FLAG_LOCKED);
+}
+
+/*
+ * Wakeup all the lockers that currently have a chance to acquire the lock.
+ */
+static void
+LWLockWakeup(LWLock *lock)
+{
+ bool new_release_ok;
+ bool wokeup_somebody = false;
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ proclist_init(&wakeup);
+
+ new_release_ok = true;
+
+ /* lock wait list while collecting backends to wake up */
+ LWLockWaitListLock(lock);
+
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+ continue;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ {
+ /*
+ * Prevent additional wakeups until retryer gets to run. Backends
+ * that are just waiting for the lock to become free don't retry
+ * automatically.
+ */
+ new_release_ok = false;
+
+ /*
+ * Don't wakeup (further) exclusive locks.
+ */
+ wokeup_somebody = true;
+ }
+
+ /*
+ * Once we've woken up an exclusive lock, there's no point in waking
+ * up anybody else.
+ */
+ if (waiter->lwWaitMode == LW_EXCLUSIVE)
+ break;
+ }
+
+ Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
+
+ /* unset required flags, and release lock, in one fell swoop */
+ {
+ uint32 old_state;
+ uint32 desired_state;
+
+ old_state = pg_atomic_read_u32(&lock->state);
+ while (true)
+ {
+ desired_state = old_state;
+
+ /* compute desired flags */
+
+ if (new_release_ok)
+ desired_state |= LW_FLAG_RELEASE_OK;
+ else
+ desired_state &= ~LW_FLAG_RELEASE_OK;
+
+ if (proclist_is_empty(&wakeup))
+ desired_state &= ~LW_FLAG_HAS_WAITERS;
+
+ desired_state &= ~LW_FLAG_LOCKED; /* release lock */
+
+ if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+ desired_state))
+ break;
+ }
+ }
+
+ /* Awaken any waiters I removed from the queue. */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+		 * Guarantee that lwWaiting being unset only becomes visible once the
+		 * unlink from the wait list has completed. Otherwise the target
+		 * backend could be woken up for some other reason and enqueue for a new lock - if
+ * that happens before the list unlink happens, the list would end up
+ * being corrupted.
+ *
+ * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+ * another lock.
+ */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+ /*
+ * If we don't have a PGPROC structure, there's no way to wait. This
+ * should never occur, since MyProc should only be null during shared
+ * memory initialization.
+ */
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ if (MyProc->lwWaiting)
+ elog(PANIC, "queueing for lock while waiting on another one");
+
+ LWLockWaitListLock(lock);
+
+ /* setting the flag is protected by the spinlock */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+ MyProc->lwWaiting = true;
+ MyProc->lwWaitMode = mode;
+
+ /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+ if (mode == LW_WAIT_UNTIL_FREE)
+ proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+ else
+ proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+ /* Can release the mutex now */
+ LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+ pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+ bool found = false;
+ proclist_mutable_iter iter;
+
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+
+ lwstats->dequeue_self_count++;
+#endif
+
+ LWLockWaitListLock(lock);
+
+ /*
+ * We can't just remove ourselves from the list; we need to iterate over
+ * all entries, as somebody else could already have dequeued us.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ if (iter.cur == MyProc->pgprocno)
+ {
+ found = true;
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ break;
+ }
+ }
+
+ if (proclist_is_empty(&lock->waiters) &&
+ (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+ {
+ pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+ }
+
+ /* XXX: combine with fetch_and above? */
+ LWLockWaitListUnlock(lock);
+
+ /* clear waiting state again, nice for debugging */
+ if (found)
+ MyProc->lwWaiting = false;
+ else
+ {
+ int extraWaits = 0;
+
+ /*
+ * Somebody else dequeued us and either has already woken us up or will
+ * do so shortly. Deal with the superfluous absorption of a wakeup.
+ */
+
+ /*
+ * Reset RELEASE_OK flag if somebody woke us before we removed
+ * ourselves - they'll have set it to false.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+ * get reset at some inconvenient point later. Most of the time this
+ * will immediately return.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (!MyProc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(MyProc->sem);
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+}
+
+/*
+ * LWLockAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is. Returns true if the lock
+ * was available immediately, false if we had to sleep.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockAcquire(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool result = true;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquire", lock, mode);
+
+#ifdef LWLOCK_STATS
+ /* Count lock acquisition attempts */
+ if (mode == LW_EXCLUSIVE)
+ lwstats->ex_acquire_count++;
+ else
+ lwstats->sh_acquire_count++;
+#endif /* LWLOCK_STATS */
+
+ /*
+ * We can't wait if we haven't got a PGPROC. This should only occur
+ * during bootstrap or shared memory initialization. Put an Assert here
+ * to catch unsafe coding practices.
+ */
+ Assert(!(proc == NULL && IsUnderPostmaster));
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to try to acquire lock after each time we are signaled by
+ * LWLockRelease.
+ *
+ * NOTE: it might seem better to have LWLockRelease actually grant us the
+ * lock, rather than retrying and possibly having to go back to sleep. But
+ * in practice that is no good because it means a process swap for every
+ * lock acquisition when two or more processes are contending for the same
+ * lock. Since LWLocks are normally used to protect not-very-long
+ * sections of computation, a process needs to be able to acquire and
+ * release the same lock many times during a single CPU time slice, even
+ * in the presence of contention. The efficiency of being able to do that
+ * outweighs the inefficiency of sometimes wasting a process dispatch
+ * cycle because the lock is not free when a released waiter finally gets
+ * to run. See pgsql-hackers archives for 29-Dec-01.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ /*
+ * Try to grab the lock the first time; we're not in the wait queue
+ * yet/anymore.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
+ break; /* got the lock */
+ }
+
+ /*
+ * Ok, at this point we couldn't grab the lock on the first try. We
+ * cannot simply queue ourselves to the end of the list and wait to be
+ * woken up, because by now the lock could long since have been released.
+ * Instead, add ourselves to the queue and try to grab the lock again. If
+ * we succeed, we need to revert the queuing and be happy; otherwise we
+ * recheck the lock. If we still couldn't grab it, we know that the
+ * other locker will see our queue entry when releasing, since it
+ * existed before we checked for the lock.
+ */
+
+ /* add to the queue */
+ LWLockQueueSelf(lock, mode);
+
+ /* we're now guaranteed to be woken up if necessary */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ /* ok, grabbed the lock the second time round, need to undo queueing */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /* Retrying, allow LWLockRelease to release waiters again. */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquire", lock, "awakened");
+
+ /* Now loop back and try to acquire lock again. */
+ result = false;
+ }
+
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);
+
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ return result;
+}
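+
+/*
+ * Typical caller pattern, as a minimal sketch ("MyCounterLock" and the
+ * protected counter are hypothetical, for illustration only):
+ *
+ *     LWLockAcquire(MyCounterLock, LW_EXCLUSIVE);
+ *     MySharedState->counter++;
+ *     LWLockRelease(MyCounterLock);
+ *
+ * Readers that merely inspect the counter would acquire LW_SHARED instead.
+ */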
+
+/*
+ * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, return false with no side-effects.
+ *
+ * If successful, cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
+{
+ bool mustwait;
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /* Check for the lock */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+
+ LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode);
+ }
+ return !mustwait;
+}
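+
+/*
+ * Usage sketch (illustrative only; "SomeLock" is a hypothetical lock): a
+ * caller with useful fallback work can avoid blocking:
+ *
+ *     if (LWLockConditionalAcquire(SomeLock, LW_EXCLUSIVE))
+ *     {
+ *         ... manipulate the protected structure ...
+ *         LWLockRelease(SomeLock);
+ *     }
+ *     else
+ *         ... skip the optional work, or retry later ...
+ */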
+
+/*
+ * LWLockAcquireOrWait - Acquire lock, or wait until it's free
+ *
+ * The semantics of this function are a bit funky. If the lock is currently
+ * free, it is acquired in the given mode, and the function returns true. If
+ * the lock isn't immediately free, the function waits until it is released
+ * and returns false, but does not acquire the lock.
+ *
+ * This is currently used for WALWriteLock: when a backend flushes the WAL,
+ * holding WALWriteLock, it can flush the commit records of many other
+ * backends as a side-effect. Those other backends need to wait until the
+ * flush finishes, but don't need to acquire the lock anymore. They can just
+ * wake up, observe that their records have already been flushed, and return.
+ */
+bool
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool mustwait;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * NB: We're using nearly the same twice-in-a-row lock acquisition
+ * protocol as LWLockAcquire(). Check its comments for details.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /*
+ * Wait until awakened. Like in LWLockAcquire, be prepared for
+ * bogus wakeups.
+ */
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+
+ /*
+ * Got the lock on the second attempt, so undo queueing. We need to treat
+ * this as having successfully acquired the lock, otherwise we'd
+ * not necessarily wake up people we've prevented from acquiring
+ * the lock.
+ */
+ LWLockDequeueSelf(lock);
+ }
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+ }
+
+ return !mustwait;
+}
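+
+/*
+ * Caller pattern sketch, loosely modelled on the WAL flush use case
+ * described above (details simplified for illustration):
+ *
+ *     if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+ *     {
+ *         ... flush WAL up to the requested location ...
+ *         LWLockRelease(WALWriteLock);
+ *     }
+ *     else
+ *     {
+ *         ... recheck whether someone else already flushed far enough ...
+ *     }
+ */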
+
+/*
+ * Does the lwlock in its current state need to wait for the variable value to
+ * change?
+ *
+ * If we don't need to wait, and it's because the value of the variable has
+ * changed, store the current value in *newval.
+ *
+ * *result is set to true if the lock was free, and false otherwise.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+ uint64 *valptr, uint64 oldval, uint64 *newval,
+ bool *result)
+{
+ bool mustwait;
+ uint64 value;
+
+ /*
+ * Test first to see if the lock is free right now.
+ *
+ * XXX: the caller uses a spinlock before this, so we don't need a memory
+ * barrier here as far as the current usage is concerned. But that might
+ * not be safe in general.
+ */
+ mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
+
+ if (!mustwait)
+ {
+ *result = true;
+ return false;
+ }
+
+ *result = false;
+
+ /*
+ * Read the value while holding the lwlock's wait list lock, as we can't
+ * generally rely on atomic 64 bit reads/stores. TODO: on platforms with a
+ * way to do atomic 64 bit reads/writes the wait list lock should be
+ * optimized away.
+ */
+ LWLockWaitListLock(lock);
+ value = *valptr;
+ LWLockWaitListUnlock(lock);
+
+ if (value != oldval)
+ {
+ mustwait = false;
+ *newval = value;
+ }
+ else
+ {
+ mustwait = true;
+ }
+
+ return mustwait;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar. If the lock is free on exit (immediately or after
+ * waiting), returns true. If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+ PGPROC *proc = MyProc;
+ int extraWaits = 0;
+ bool result = false;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Lock out cancel/die interrupts while we sleep on the lock. There is no
+ * cleanup mechanism to remove us from the wait queue if we got
+ * interrupted.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to check the lock's status after each time we are signaled.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ if (!mustwait)
+ break; /* the lock was free or value didn't match */
+
+ /*
+ * Add myself to the wait queue. Note that this is racy; somebody else
+ * could wake up before we're finished queuing. NB: We're using nearly
+ * the same twice-in-a-row lock acquisition protocol as
+ * LWLockAcquire(). Check its comments for details. The only
+ * difference is that we also have to check the variable's value when
+ * checking the state of the lock.
+ */
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Set RELEASE_OK flag, to make sure we get woken up as soon as the
+ * lock is released.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * We're now guaranteed to be woken up if necessary. Recheck the lock
+ * and the variable's state.
+ */
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ /* Ok, no conflict after we queued ourselves. Undo queueing. */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
+
+ /* Now loop back and check the status of the lock again. */
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+
+ return result;
+}
+
+
+/*
+ * LWLockUpdateVar - Update a variable and wake up waiters atomically
+ *
+ * Sets *valptr to 'val', and wakes up all processes waiting for us with
+ * LWLockWaitForVar(). Setting the value and waking up the processes happen
+ * atomically so that any process calling LWLockWaitForVar() on the same lock
+ * is guaranteed to see the new value, and act accordingly.
+ *
+ * The caller must be holding the lock in exclusive mode.
+ */
+void
+LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
+
+ proclist_init(&wakeup);
+
+ LWLockWaitListLock(lock);
+
+ Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
+
+ /* Update the lock's value */
+ *valptr = val;
+
+ /*
+ * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
+ * up. They are always in the front of the queue.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ break;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+ }
+
+ /* We are done updating shared state of the lock itself. */
+ LWLockWaitListUnlock(lock);
+
+ /*
+ * Awaken any waiters I removed from the queue.
+ */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+ /* check comment in LWLockWakeup() about this barrier */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
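+
+/*
+ * Pairing sketch (illustrative only; "insertingAt" and "seenVal" are
+ * hypothetical): the exclusive holder advertises progress with
+ *
+ *     LWLockUpdateVar(lock, &insertingAt, newValue);
+ *
+ * while other backends poll it with
+ *
+ *     while (!LWLockWaitForVar(lock, &insertingAt, seenVal, &seenVal))
+ *         ... lock still held but the variable advanced; recheck ...
+ *
+ * so a waiter learns, after each wakeup, either that the lock has been
+ * released or what value the variable has advanced to.
+ */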
+
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+ LWLockMode mode;
+ uint32 oldstate;
+ bool check_waiters;
+ int i;
+
+ /*
+ * Remove lock from list of locks held. Usually, but not always, it will
+ * be the latest-acquired lock; so search array backwards.
+ */
+ for (i = num_held_lwlocks; --i >= 0;)
+ if (lock == held_lwlocks[i].lock)
+ break;
+
+ if (i < 0)
+ elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+ mode = held_lwlocks[i].mode;
+
+ num_held_lwlocks--;
+ for (; i < num_held_lwlocks; i++)
+ held_lwlocks[i] = held_lwlocks[i + 1];
+
+ PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+ /*
+ * Release my hold on the lock; after that it can immediately be acquired
+ * by others, even if we still have to wake up other waiters.
+ */
+ if (mode == LW_EXCLUSIVE)
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+ else
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+ /* nobody else can have that kind of lock */
+ Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+ if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+ /*
+ * Only wake up waiters if there are any, the lock is now entirely free,
+ * and RELEASE_OK is set. If RELEASE_OK is clear, we're still waiting for
+ * previously-woken backends to get scheduled, so don't wake them up
+ * again.
+ */
+ if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+ (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+ (oldstate & LW_LOCK_MASK) == 0)
+ check_waiters = true;
+ else
+ check_waiters = false;
+
+ /*
+ * As waking up waiters requires the spinlock to be acquired, only do so
+ * if necessary.
+ */
+ if (check_waiters)
+ {
+ /* XXX: remove before commit? */
+ LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+ LWLockWakeup(lock);
+ }
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, reset variable
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ LWLockWaitListLock(lock);
+
+ /*
+ * Set the variable's value before releasing the lock; that prevents a
+ * race condition wherein a new locker acquires the lock but hasn't yet
+ * set the variable's value.
+ */
+ *valptr = val;
+ LWLockWaitListUnlock(lock);
+
+ LWLockRelease(lock);
+}
+
+
+/*
+ * LWLockReleaseAll - release all currently-held locks
+ *
+ * Used to clean up after ereport(ERROR). An important difference between this
+ * function and retail LWLockRelease calls is that InterruptHoldoffCount is
+ * unchanged by this operation. This is necessary since InterruptHoldoffCount
+ * has been set to an appropriate level earlier in error recovery. We could
+ * decrement it below zero if we allow it to drop for each released lock!
+ */
+void
+LWLockReleaseAll(void)
+{
+ while (num_held_lwlocks > 0)
+ {
+ HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
+
+ LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
+ }
+}
+
+
+/*
+ * LWLockHeldByMe - test whether my process holds a lock in any mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *l)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *l, int nlocks, size_t stride)
+{
+ char *held_lock_addr;
+ char *begin;
+ char *end;
+ int i;
+
+ begin = (char *) l;
+ end = begin + nlocks * stride;
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ held_lock_addr = (char *) held_lwlocks[i].lock;
+ if (held_lock_addr >= begin &&
+ held_lock_addr < end &&
+ (held_lock_addr - begin) % stride == 0)
+ return true;
+ }
+ return false;
+}
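+
+/*
+ * Usage sketch (illustrative only; "MyArray" is a hypothetical array of
+ * structs that each embed an LWLock): pass the first lock's address, the
+ * element count, and the struct size as the stride:
+ *
+ *     LWLockAnyHeldByMe(&MyArray[0].lock, nelements, sizeof(MyArray[0]));
+ */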
+
+/*
+ * LWLockHeldByMeInMode - test whether my process holds a lock in given mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
+ return true;
+ }
+ return false;
+}
diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c
new file mode 100644
index 0000000..65f7c5b
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.c
@@ -0,0 +1,52 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+
+const char *const IndividualLWLockNames[] = {
+ "<unassigned:0>",
+ "ShmemIndex",
+ "OidGen",
+ "XidGen",
+ "ProcArray",
+ "SInvalRead",
+ "SInvalWrite",
+ "WALBufMapping",
+ "WALWrite",
+ "ControlFile",
+ "<unassigned:10>",
+ "XactSLRU",
+ "SubtransSLRU",
+ "MultiXactGen",
+ "MultiXactOffsetSLRU",
+ "MultiXactMemberSLRU",
+ "RelCacheInit",
+ "CheckpointerComm",
+ "TwoPhaseState",
+ "TablespaceCreate",
+ "BtreeVacuum",
+ "AddinShmemInit",
+ "Autovacuum",
+ "AutovacuumSchedule",
+ "SyncScan",
+ "RelationMapping",
+ "NotifySLRU",
+ "NotifyQueue",
+ "SerializableXactHash",
+ "SerializableFinishedList",
+ "SerializablePredicateList",
+ "SerialSLRU",
+ "SyncRep",
+ "BackgroundWorker",
+ "DynamicSharedMemoryControl",
+ "AutoFile",
+ "ReplicationSlotAllocation",
+ "ReplicationSlotControl",
+ "CommitTsSLRU",
+ "CommitTs",
+ "ReplicationOrigin",
+ "MultiXactTruncation",
+ "OldSnapshotTimeMap",
+ "LogicalRepWorker",
+ "XactTruncation",
+ "<unassigned:45>",
+ "WrapLimitsVacuum",
+ "NotifyQueueTail"
+};
diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h
new file mode 100644
index 0000000..e279f72
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.h
@@ -0,0 +1,50 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+/* there is deliberately not an #ifndef LWLOCKNAMES_H here */
+
+#define ShmemIndexLock (&MainLWLockArray[1].lock)
+#define OidGenLock (&MainLWLockArray[2].lock)
+#define XidGenLock (&MainLWLockArray[3].lock)
+#define ProcArrayLock (&MainLWLockArray[4].lock)
+#define SInvalReadLock (&MainLWLockArray[5].lock)
+#define SInvalWriteLock (&MainLWLockArray[6].lock)
+#define WALBufMappingLock (&MainLWLockArray[7].lock)
+#define WALWriteLock (&MainLWLockArray[8].lock)
+#define ControlFileLock (&MainLWLockArray[9].lock)
+#define XactSLRULock (&MainLWLockArray[11].lock)
+#define SubtransSLRULock (&MainLWLockArray[12].lock)
+#define MultiXactGenLock (&MainLWLockArray[13].lock)
+#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock)
+#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock)
+#define RelCacheInitLock (&MainLWLockArray[16].lock)
+#define CheckpointerCommLock (&MainLWLockArray[17].lock)
+#define TwoPhaseStateLock (&MainLWLockArray[18].lock)
+#define TablespaceCreateLock (&MainLWLockArray[19].lock)
+#define BtreeVacuumLock (&MainLWLockArray[20].lock)
+#define AddinShmemInitLock (&MainLWLockArray[21].lock)
+#define AutovacuumLock (&MainLWLockArray[22].lock)
+#define AutovacuumScheduleLock (&MainLWLockArray[23].lock)
+#define SyncScanLock (&MainLWLockArray[24].lock)
+#define RelationMappingLock (&MainLWLockArray[25].lock)
+#define NotifySLRULock (&MainLWLockArray[26].lock)
+#define NotifyQueueLock (&MainLWLockArray[27].lock)
+#define SerializableXactHashLock (&MainLWLockArray[28].lock)
+#define SerializableFinishedListLock (&MainLWLockArray[29].lock)
+#define SerializablePredicateListLock (&MainLWLockArray[30].lock)
+#define SerialSLRULock (&MainLWLockArray[31].lock)
+#define SyncRepLock (&MainLWLockArray[32].lock)
+#define BackgroundWorkerLock (&MainLWLockArray[33].lock)
+#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock)
+#define AutoFileLock (&MainLWLockArray[35].lock)
+#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock)
+#define ReplicationSlotControlLock (&MainLWLockArray[37].lock)
+#define CommitTsSLRULock (&MainLWLockArray[38].lock)
+#define CommitTsLock (&MainLWLockArray[39].lock)
+#define ReplicationOriginLock (&MainLWLockArray[40].lock)
+#define MultiXactTruncationLock (&MainLWLockArray[41].lock)
+#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock)
+#define LogicalRepWorkerLock (&MainLWLockArray[43].lock)
+#define XactTruncationLock (&MainLWLockArray[44].lock)
+#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock)
+#define NotifyQueueTailLock (&MainLWLockArray[47].lock)
+
+#define NUM_INDIVIDUAL_LWLOCKS 48
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
new file mode 100644
index 0000000..6c7cf6c
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -0,0 +1,55 @@
+# Some commonly-used locks have predefined positions within MainLWLockArray;
+# these are defined here. If you add a lock, add it to the end to avoid
+# renumbering the existing locks; if you remove a lock, consider leaving a gap
+# in the numbering sequence for the benefit of DTrace and other external
+# debugging scripts. Also, do not forget to update the list of wait events
+# in the user documentation.
+
+# 0 is available; was formerly BufFreelistLock
+ShmemIndexLock 1
+OidGenLock 2
+XidGenLock 3
+ProcArrayLock 4
+SInvalReadLock 5
+SInvalWriteLock 6
+WALBufMappingLock 7
+WALWriteLock 8
+ControlFileLock 9
+# 10 was CheckpointLock
+XactSLRULock 11
+SubtransSLRULock 12
+MultiXactGenLock 13
+MultiXactOffsetSLRULock 14
+MultiXactMemberSLRULock 15
+RelCacheInitLock 16
+CheckpointerCommLock 17
+TwoPhaseStateLock 18
+TablespaceCreateLock 19
+BtreeVacuumLock 20
+AddinShmemInitLock 21
+AutovacuumLock 22
+AutovacuumScheduleLock 23
+SyncScanLock 24
+RelationMappingLock 25
+NotifySLRULock 26
+NotifyQueueLock 27
+SerializableXactHashLock 28
+SerializableFinishedListLock 29
+SerializablePredicateListLock 30
+SerialSLRULock 31
+SyncRepLock 32
+BackgroundWorkerLock 33
+DynamicSharedMemoryControlLock 34
+AutoFileLock 35
+ReplicationSlotAllocationLock 36
+ReplicationSlotControlLock 37
+CommitTsSLRULock 38
+CommitTsLock 39
+ReplicationOriginLock 40
+MultiXactTruncationLock 41
+OldSnapshotTimeMapLock 42
+LogicalRepWorkerLock 43
+XactTruncationLock 44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
new file mode 100644
index 0000000..d493aee
--- /dev/null
+++ b/src/backend/storage/lmgr/predicate.c
@@ -0,0 +1,5203 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ * POSTGRES predicate locking
+ * to support full serializable transaction isolation
+ *
+ *
+ * The approach taken is to implement Serializable Snapshot Isolation (SSI)
+ * as initially described in this paper:
+ *
+ * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ * Serializable isolation for snapshot databases.
+ * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ * international conference on Management of data,
+ * pages 729-738, New York, NY, USA. ACM.
+ * http://doi.acm.org/10.1145/1376616.1376690
+ *
+ * and further elaborated in Cahill's doctoral thesis:
+ *
+ * Michael James Cahill. 2009.
+ * Serializable Isolation for Snapshot Databases.
+ * Sydney Digital Theses.
+ * University of Sydney, School of Information Technologies.
+ * http://hdl.handle.net/2123/5353
+ *
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them. They are needed to detect
+ * rw-conflicts when the read happens before the write. (When the write
+ * occurs first, the reading transaction can check for a conflict by
+ * examining the MVCC data.)
+ *
+ * (1) Besides tuples actually read, they must cover ranges of tuples
+ * which would have been read based on the predicate. This will
+ * require modelling the predicates through locks against database
+ * objects such as pages, index ranges, or entire tables.
+ *
+ * (2) They must be kept in RAM for quick access. Because of this, it
+ * isn't possible to always maintain tuple-level granularity -- when
+ * the space allocated to store these approaches exhaustion, a
+ * request for a lock may need to scan for situations where a single
+ * transaction holds many fine-grained locks which can be coalesced
+ * into a single coarser-grained lock.
+ *
+ * (3) They never block anything; they are more like flags than locks
+ * in that regard, although they refer to database objects and are
+ * used to identify rw-conflicts with normal write locks.
+ *
+ * (4) While they are associated with a transaction, they must survive
+ * a successful COMMIT of that transaction, and remain until all
+ * overlapping transactions complete. This even means that they
+ * must survive termination of the transaction's process. If a
+ * top level transaction is rolled back, however, it is immediately
+ * flagged so that it can be ignored, and its SIREAD locks can be
+ * released any time after that.
+ *
+ * (5) The only transactions which create SIREAD locks or check for
+ * conflicts with them are serializable transactions.
+ *
+ * (6) When a write lock for a top level transaction is found to cover
+ * an existing SIREAD lock for the same transaction, the SIREAD lock
+ * can be deleted.
+ *
+ * (7) A write from a serializable transaction must ensure that an xact
+ * record exists for the transaction, with the same lifespan (until
+ * all concurrent transactions complete or the transaction is rolled
+ * back) so that rw-dependencies to that transaction can be
+ * detected.
+ *
+ * We use an optimization for read-only transactions. Under certain
+ * circumstances, a read-only transaction's snapshot can be shown to
+ * never have conflicts with other transactions. This is referred to
+ * as a "safe" snapshot (and one known not to be is "unsafe").
+ * However, it can't be determined whether a snapshot is safe until
+ * all concurrent read/write transactions complete.
+ *
+ * Once a read-only transaction is known to have a safe snapshot, it
+ * can release its predicate locks and exempt itself from further
+ * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
+ * on safe snapshots, waiting as necessary for one to be available.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ * SerializableFinishedListLock
+ * - Protects the list of transactions which have completed but which
+ * may yet matter because they overlap still-active transactions.
+ *
+ * SerializablePredicateListLock
+ * - Protects the linked list of locks held by a transaction. Note
+ * that the locks themselves are also covered by the partition
+ * locks of their respective lock targets; this lock only affects
+ * the linked list connecting the locks related to a transaction.
+ * - All transactions share this single lock (with no partitioning).
+ * - There is never a need for a process other than the one running
+ * an active transaction to walk the list of locks held by that
+ * transaction, except parallel query workers sharing the leader's
+ * transaction. In the parallel case, an extra per-sxact lock is
+ * taken; see below.
+ * - It is relatively infrequent that another process needs to
+ * modify the list for a transaction, but it does happen for such
+ * things as index page splits for pages with predicate locks and
+ * freeing of predicate locked pages by a vacuum process. When
+ * removing a lock in such cases, the lock itself contains the
+ * pointers needed to remove it from the list. When adding a
+ * lock in such cases, the lock can be added using the anchor in
+ * the transaction structure. Neither requires walking the list.
+ * - Cleaning up the list for a terminated transaction is sometimes
+ * not done on a retail basis, in which case no lock is required.
+ * - Due to the above, a process accessing its active transaction's
+ * list always uses a shared lock, regardless of whether it is
+ * walking or maintaining the list. This improves concurrency
+ * for the common access patterns.
+ * - A process which needs to alter the list of a transaction other
+ * than its own active transaction must acquire an exclusive
+ * lock.
+ *
+ * SERIALIZABLEXACT's member 'perXactPredicateListLock'
+ * - Protects the linked list of predicate locks held by a transaction.
+ * Only needed for parallel mode, where multiple backends share the
+ * same SERIALIZABLEXACT object. Not needed if
+ * SerializablePredicateListLock is held exclusively.
+ *
+ * PredicateLockHashPartitionLock(hashcode)
+ * - The same lock protects a target, all locks on that target, and
+ * the linked list of locks on the target.
+ * - When more than one is needed, acquire in ascending address order.
+ * - When all are needed (rare), acquire in ascending index order with
+ * PredicateLockHashPartitionLockByIndex(index).
+ *
+ * SerializableXactHashLock
+ * - Protects both PredXact and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/predicate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ * InitPredicateLocks(void)
+ * PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ * GetPredicateLockStatusData(void)
+ * PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+ *
+ * predicate lock maintenance
+ * GetSerializableTransactionSnapshot(Snapshot snapshot)
+ * SetSerializableTransactionSnapshot(Snapshot snapshot,
+ * VirtualTransactionId *sourcevxid)
+ * RegisterPredicateLockingXid(void)
+ * PredicateLockRelation(Relation relation, Snapshot snapshot)
+ * PredicateLockPage(Relation relation, BlockNumber blkno,
+ * Snapshot snapshot)
+ * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ * TransactionId insert_xid)
+ * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * TransferPredicateLocksToHeapRelation(Relation relation)
+ * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+ *
+ * conflict detection (may also trigger rollback)
+ * CheckForSerializableConflictOut(Relation relation, TransactionId xid,
+ * Snapshot snapshot)
+ * CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
+ * BlockNumber blkno)
+ * CheckTableForSerializableConflictIn(Relation relation)
+ *
+ * final rollback checking
+ * PreCommit_CheckForSerializationFailure(void)
+ *
+ * two-phase commit support
+ * AtPrepare_PredicateLocks(void);
+ * PostPrepare_PredicateLocks(TransactionId xid);
+ * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+ * predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ * void *recdata, uint32 len);
+ */
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "storage/predicate_internals.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+/* Uncomment the next line to test the graceful degradation code. */
+/* #define TEST_SUMMARIZE_SERIAL */
+
+/*
+ * Test the most selective fields first, for performance.
+ *
+ * a is covered by b if all of the following hold:
+ * 1) a.database = b.database
+ * 2) a.relation = b.relation
+ * 3) b.offset is invalid (b is page-granularity or higher)
+ * 4) either of the following:
+ * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+ * or 4b) a.offset is invalid and b.page is invalid (a is
+ * page-granularity and b is relation-granularity)
+ */
+#define TargetTagIsCoveredBy(covered_target, covering_target) \
+ ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
+ GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
+ && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
+ InvalidOffsetNumber) /* (3) */ \
+ && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
+ InvalidOffsetNumber) /* (4a) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
+ || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ InvalidBlockNumber) /* (4b) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
+ != InvalidBlockNumber))) \
+ && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
+ GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
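+
+/*
+ * Worked example (tag values hypothetical): the tuple-level tag
+ * {db=1, rel=100, page=5, offset=3} is covered by the page-level tag
+ * {db=1, rel=100, page=5, offset=Invalid} via condition (4a), and that
+ * page-level tag is in turn covered by the relation-level tag
+ * {db=1, rel=100, page=Invalid, offset=Invalid} via condition (4b).  Note
+ * that the macro checks only one granularity step: a tuple-level tag does
+ * not test as covered by a relation-level tag.
+ */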
+
+/*
+ * The predicate locking target and lock shared hash tables are partitioned to
+ * reduce contention. To determine which partition a given target belongs to,
+ * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+ * apply one of these macros.
+ * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+ */
+#define PredicateLockHashPartition(hashcode) \
+ ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+#define PredicateLockHashPartitionLock(hashcode) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
+ PredicateLockHashPartition(hashcode)].lock)
+#define PredicateLockHashPartitionLockByIndex(i) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
+
+#define NPREDICATELOCKTARGETENTS() \
+ mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
+
+#define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink)))
+
+/*
+ * Note that a sxact is marked "prepared" once it has passed
+ * PreCommit_CheckForSerializationFailure, even if it isn't using
+ * 2PC. This is the point at which it can no longer be aborted.
+ *
+ * The PREPARED flag remains set after commit, so SxactIsCommitted
+ * implies SxactIsPrepared.
+ */
+#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
+#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
+#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
+#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
+#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
+#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
+#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
+/*
+ * The following macro actually means that the specified transaction has a
+ * conflict out *to a transaction which committed ahead of it*. It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
+#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
+#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
+#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
+#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+#define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+ get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
+
+/*
+ * Given a predicate lock tag, and the hash for its target,
+ * compute the lock hash.
+ *
+ * To make the hash code also depend on the transaction, we xor the sxact
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+ ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
+ << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+/*
+ * The SLRU buffer area through which we access the old xids.
+ */
+static SlruCtlData SerialSlruCtlData;
+
+#define SerialSlruCtl (&SerialSlruCtlData)
+
+#define SERIAL_PAGESIZE BLCKSZ
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+
+/*
+ * Set maximum pages based on the number needed to track all transactions.
+ */
+#define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE)
+
+#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
+
+#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
+ (SerialSlruCtl->shared->page_buffer[slotno] + \
+ ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
+
+#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
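+
+/*
+ * Example of the layout arithmetic (assuming the common BLCKSZ of 8192 and
+ * an 8-byte SerCommitSeqNo): SERIAL_ENTRIESPERPAGE is 1024, so xid 5000
+ * maps to page 5000 / 1024 = 4, entry 5000 % 1024 = 904 within that page.
+ */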
+
+typedef struct SerialControlData
+{
+ int headPage; /* newest initialized page */
+ TransactionId headXid; /* newest valid Xid in the SLRU */
+ TransactionId tailXid; /* oldest xmin we might be interested in */
+} SerialControlData;
+
+typedef struct SerialControlData *SerialControl;
+
+static SerialControl serialControl;
+
+/*
+ * When the oldest committed transaction on the "finished" list is moved to
+ * SLRU, its predicate locks will be moved to this "dummy" transaction,
+ * collapsing duplicate targets. When a duplicate is found, the later
+ * commitSeqNo is used.
+ */
+static SERIALIZABLEXACT *OldCommittedSxact;
+
+
+/*
+ * These configuration variables are used to set the predicate lock table size
+ * and to control promotion of predicate locks to coarser granularity, in an
+ * attempt to degrade gracefully (mostly by accepting false positive
+ * serialization failures) in the face of memory pressure.
+ */
+int max_predicate_locks_per_xact; /* set by guc.c */
+int max_predicate_locks_per_relation; /* set by guc.c */
+int max_predicate_locks_per_page; /* set by guc.c */
+
+/*
+ * This provides a list of objects in order to track transactions
+ * participating in predicate locking. Entries in the list are fixed size,
+ * and reside in shared memory. The memory address of an entry must remain
+ * fixed during its lifetime. The list will be protected from concurrent
+ * update externally; no provision is made in this code to manage that. The
+ * number of entries in the list, and the size allowed for each entry, are
+ * fixed upon creation.
+ */
+static PredXactList PredXact;
+
+/*
+ * This provides a pool of RWConflict data elements to use in conflict lists
+ * between transactions.
+ */
+static RWConflictPoolHeader RWConflictPool;
+
+/*
+ * The predicate locking hash tables are in shared memory.
+ * Each backend keeps pointers to them.
+ */
+static HTAB *SerializableXidHash;
+static HTAB *PredicateLockTargetHash;
+static HTAB *PredicateLockHash;
+static SHM_QUEUE *FinishedSerializableTransactions;
+
+/*
+ * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
+ * this entry, you can ensure that there's enough scratch space available for
+ * inserting one entry in the hash table. This is an otherwise-invalid tag.
+ */
+static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
+static uint32 ScratchTargetTagHash;
+static LWLock *ScratchPartitionLock;
+
+/*
+ * The local hash table used to determine when to combine multiple fine-
+ * grained locks into a single coarser-grained lock.
+ */
+static HTAB *LocalPredicateLockHash = NULL;
+
+/*
+ * Keep a pointer to the currently-running serializable transaction (if any)
+ * for quick reference. Also, remember if we have written anything that could
+ * cause a rw-conflict.
+ */
+static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+static bool MyXactDidWrite = false;
+
+/*
+ * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
+ * MySerializableXact early. If that happens in a parallel query, the leader
+ * needs to defer the destruction of the SERIALIZABLEXACT until end of
+ * transaction, because the workers still have a reference to it. In that
+ * case, the leader stores it here.
+ */
+static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact;
+
+/* local functions */
+
+static SERIALIZABLEXACT *CreatePredXact(void);
+static void ReleasePredXact(SERIALIZABLEXACT *sxact);
+static SERIALIZABLEXACT *FirstPredXact(void);
+static SERIALIZABLEXACT *NextPredXact(SERIALIZABLEXACT *sxact);
+
+static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
+static void ReleaseRWConflict(RWConflict conflict);
+static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
+
+static bool SerialPagePrecedesLogically(int page1, int page2);
+static void SerialInit(void);
+static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
+static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid);
+static void SerialSetActiveSerXmin(TransactionId xid);
+
+static uint32 predicatelock_hash(const void *key, Size keysize);
+static void SummarizeOldestCommittedSxact(void);
+static Snapshot GetSafeSnapshot(Snapshot snapshot);
+static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid);
+static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
+static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent);
+static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+static void RemoveScratchTarget(bool lockheld);
+static void RestoreScratchTarget(bool lockheld);
+static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
+ uint32 targettaghash);
+static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
+static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
+static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
+static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact);
+static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
+static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld);
+static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
+static void DropAllPredicateLocksFromTable(Relation relation,
+ bool transfer);
+static void SetNewSxactGlobalXmin(void);
+static void ClearOldPredicateLocks(void);
+static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize);
+static bool XidIsConcurrent(TransactionId xid);
+static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer);
+static void CreateLocalPredicateLockHash(void);
+static void ReleasePredicateLocksLocal(void);
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Does this relation participate in predicate locking? Temporary and system
+ * relations are exempt, as are materialized views.
+ */
+static inline bool
+PredicateLockingNeededForRelation(Relation relation)
+{
+ return !(relation->rd_id < FirstBootstrapObjectId ||
+ RelationUsesLocalBuffers(relation) ||
+ relation->rd_rel->relkind == RELKIND_MATVIEW);
+}
+
+/*
+ * When a public interface method is called for a read, this is the test to
+ * see if we should do a quick return.
+ *
+ * Note: this function has side-effects! If this transaction has been flagged
+ * as RO-safe since the last call, we release all predicate locks and reset
+ * MySerializableXact. That makes subsequent calls return quickly.
+ *
+ * This is marked as 'inline' to eliminate the function call overhead in the
+ * common case that serialization is not needed.
+ */
+static inline bool
+SerializationNeededForRead(Relation relation, Snapshot snapshot)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /*
+ * Don't acquire locks or check for conflicts when scanning with a special snapshot.
+ * This excludes things like CLUSTER and REINDEX. They use the wholesale
+ * functions TransferPredicateLocksToHeapRelation() and
+ * CheckTableForSerializableConflictIn() to participate in serialization,
+ * but the scans involved don't need serialization.
+ */
+ if (!IsMVCCSnapshot(snapshot))
+ return false;
+
+ /*
+ * Check if we have just become "RO-safe". If we have, immediately release
+ * all locks as they're not needed anymore. This also resets
+ * MySerializableXact, so that subsequent calls to this function can exit
+ * quickly.
+ *
+ * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
+ * commit without having conflicts out to an earlier snapshot, thus
+ * ensuring that no conflicts are possible for this transaction.
+ */
+ if (SxactIsROSafe(MySerializableXact))
+ {
+ ReleasePredicateLocks(false, true);
+ return false;
+ }
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+/*
+ * Like SerializationNeededForRead(), but called on writes.
+ * The logic is the same, but there is no snapshot and we can't be RO-safe.
+ */
+static inline bool
+SerializationNeededForWrite(Relation relation)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions are a simple implementation of a list for this specific
+ * type of struct. If there is ever a generalized shared memory list, we
+ * should probably switch to that.
+ */
+static SERIALIZABLEXACT *
+CreatePredXact(void)
+{
+ PredXactListElement ptle;
+
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->availableList,
+ &PredXact->availableList,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ SHMQueueDelete(&ptle->link);
+ SHMQueueInsertBefore(&PredXact->activeList, &ptle->link);
+ return &ptle->sxact;
+}
+
+static void
+ReleasePredXact(SERIALIZABLEXACT *sxact)
+{
+ PredXactListElement ptle;
+
+ Assert(ShmemAddrIsValid(sxact));
+
+ ptle = (PredXactListElement)
+ (((char *) sxact)
+ - offsetof(PredXactListElementData, sxact)
+ + offsetof(PredXactListElementData, link));
+ SHMQueueDelete(&ptle->link);
+ SHMQueueInsertBefore(&PredXact->availableList, &ptle->link);
+}
+
+static SERIALIZABLEXACT *
+FirstPredXact(void)
+{
+ PredXactListElement ptle;
+
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->activeList,
+ &PredXact->activeList,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ return &ptle->sxact;
+}
+
+static SERIALIZABLEXACT *
+NextPredXact(SERIALIZABLEXACT *sxact)
+{
+ PredXactListElement ptle;
+
+ Assert(ShmemAddrIsValid(sxact));
+
+ ptle = (PredXactListElement)
+ (((char *) sxact)
+ - offsetof(PredXactListElementData, sxact)
+ + offsetof(PredXactListElementData, link));
+ ptle = (PredXactListElement)
+ SHMQueueNext(&PredXact->activeList,
+ &ptle->link,
+ offsetof(PredXactListElementData, link));
+ if (!ptle)
+ return NULL;
+
+ return &ptle->sxact;
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions manage primitive access to the RWConflict pool and lists.
+ */
+static bool
+RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
+{
+ RWConflict conflict;
+
+ Assert(reader != writer);
+
+ /* Check the ends of the purported conflict first. */
+ if (SxactIsDoomed(reader)
+ || SxactIsDoomed(writer)
+ || SHMQueueEmpty(&reader->outConflicts)
+ || SHMQueueEmpty(&writer->inConflicts))
+ return false;
+
+ /* A conflict is possible; walk the list to find out. */
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->outConflicts,
+ &reader->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ if (conflict->sxactIn == writer)
+ return true;
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ }
+
+ /* No conflict found. */
+ return false;
+}
+
+static void
+SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ RWConflict conflict;
+
+ Assert(reader != writer);
+ Assert(!RWConflictExists(reader, writer));
+
+ conflict = (RWConflict)
+ SHMQueueNext(&RWConflictPool->availableList,
+ &RWConflictPool->availableList,
+ offsetof(RWConflictData, outLink));
+ if (!conflict)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ SHMQueueDelete(&conflict->outLink);
+
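+ /* Record the two ends and link into both transactions' conflict lists. */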
+ conflict->sxactOut = reader;
+ conflict->sxactIn = writer;
+ SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink);
+ SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink);
+}
+
+static void
+SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
+ SERIALIZABLEXACT *activeXact)
+{
+ RWConflict conflict;
+
+ Assert(roXact != activeXact);
+ Assert(SxactIsReadOnly(roXact));
+ Assert(!SxactIsReadOnly(activeXact));
+
+ conflict = (RWConflict)
+ SHMQueueNext(&RWConflictPool->availableList,
+ &RWConflictPool->availableList,
+ offsetof(RWConflictData, outLink));
+ if (!conflict)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ SHMQueueDelete(&conflict->outLink);
+
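+ /* The writable xact is the "out" side; the read-only xact is the "in" side. */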
+ conflict->sxactOut = activeXact;
+ conflict->sxactIn = roXact;
+ SHMQueueInsertBefore(&activeXact->possibleUnsafeConflicts,
+ &conflict->outLink);
+ SHMQueueInsertBefore(&roXact->possibleUnsafeConflicts,
+ &conflict->inLink);
+}
+
+static void
+ReleaseRWConflict(RWConflict conflict)
+{
+ SHMQueueDelete(&conflict->inLink);
+ SHMQueueDelete(&conflict->outLink);
+ SHMQueueInsertBefore(&RWConflictPool->availableList, &conflict->outLink);
+}
+
+static void
+FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
+{
+ RWConflict conflict,
+ nextConflict;
+
+ Assert(SxactIsReadOnly(sxact));
+ Assert(!SxactIsROSafe(sxact));
+
+ sxact->flags |= SXACT_FLAG_RO_UNSAFE;
+
+ /*
+ * We know this isn't a safe snapshot, so we can stop looking for other
+ * potential conflicts.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &sxact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ Assert(!SxactIsReadOnly(conflict->sxactOut));
+ Assert(sxact == conflict->sxactIn);
+
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Decide whether a Serial page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ */
+static bool
+SerialPagePrecedesLogically(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
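+ /*
+ * page1 is deemed older only if its representative xid precedes both the
+ * first and the last xid that page2 can hold; checking both ends keeps the
+ * answer correct across xid wraparound.
+ */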
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1));
+}
+
+#ifdef USE_ASSERT_CHECKING
+static void
+SerialPagePrecedesLogicallyUnitTests(void)
+{
+ int per_page = SERIAL_ENTRIESPERPAGE,
+ offset = per_page / 2;
+ int newestPage,
+ oldestPage,
+ headPage,
+ targetPage;
+ TransactionId newestXact,
+ oldestXact;
+
+ /* GetNewTransactionId() has assigned the last XID it can safely use. */
+ newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */
+ newestXact = newestPage * per_page + offset;
+ Assert(newestXact / per_page == newestPage);
+ oldestXact = newestXact + 1;
+ oldestXact -= 1U << 31;
+ oldestPage = oldestXact / per_page;
+
+ /*
+ * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs
+ * assigned. oldestXact finishes, ~2B XIDs having elapsed since it
+ * started. Further transactions cause us to summarize oldestXact to
+ * tailPage. Function must return false so SerialAdd() doesn't zero
+ * tailPage (which may contain entries for other old, recently-finished
+ * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in
+ * single-user mode, a negligible possibility.
+ */
+ headPage = newestPage;
+ targetPage = oldestPage;
+ Assert(!SerialPagePrecedesLogically(headPage, targetPage));
+
+ /*
+ * In this scenario, the SLRU headPage pertains to oldestXact. We're
+ * summarizing an XID near newestXact. (Assume few other XIDs used
+ * SERIALIZABLE, hence the minimal headPage advancement. Assume
+ * oldestXact was long-running and only recently reached the SLRU.)
+ * Function must return true to make SerialAdd() create targetPage.
+ *
+ * Today's implementation mishandles this case, but it doesn't matter
+ * enough to fix. Verify that the defect affects just one page by
+ * asserting correct treatment of its prior page. Reaching this case
+ * requires burning ~2B XIDs in single-user mode, a negligible
+ * possibility. Moreover, if it does happen, the consequence would be
+ * mild, namely a new transaction failing in SimpleLruReadPage().
+ */
+ headPage = oldestPage;
+ targetPage = newestPage;
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+#if 0
+ Assert(SerialPagePrecedesLogically(headPage, targetPage));
+#endif
+}
+#endif
+
+/*
+ * Initialize for the tracking of old serializable committed xids.
+ */
+static void
+SerialInit(void)
+{
+ bool found;
+
+ /*
+ * Set up SLRU management of the pg_serial data.
+ */
+ SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
+ SimpleLruInit(SerialSlruCtl, "Serial",
+ NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
+#ifdef USE_ASSERT_CHECKING
+ SerialPagePrecedesLogicallyUnitTests();
+#endif
+ SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE);
+
+ /*
+ * Create or attach to the SerialControl structure.
+ */
+ serialControl = (SerialControl)
+ ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found);
+
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ /*
+ * Set control information to reflect empty SLRU.
+ */
+ serialControl->headPage = -1;
+ serialControl->headXid = InvalidTransactionId;
+ serialControl->tailXid = InvalidTransactionId;
+ }
+}
+
+/*
+ * Record a committed read-write serializable xid and the minimum
+ * commitSeqNo of any transactions to which this xid had a rw-conflict out.
+ * An invalid commitSeqNo means that there were no conflicts out from xid.
+ */
+static void
+SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
+{
+ TransactionId tailXid;
+ int targetPage;
+ int slotno;
+ int firstZeroPage;
+ bool isNewPage;
+
+ Assert(TransactionIdIsValid(xid));
+
+ targetPage = SerialPage(xid);
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * If no serializable transactions are active, there shouldn't be anything
+ * to push out to the SLRU. Hitting this assert would mean there's
+ * something wrong with the earlier cleanup logic.
+ */
+ tailXid = serialControl->tailXid;
+ Assert(TransactionIdIsValid(tailXid));
+
+ /*
+ * If the SLRU is currently unused, zero out the whole active region from
+ * tailXid to headXid before taking it into use. Otherwise zero out only
+ * any new pages that enter the tailXid-headXid range as we advance
+ * headXid.
+ */
+ if (serialControl->headPage < 0)
+ {
+ firstZeroPage = SerialPage(tailXid);
+ isNewPage = true;
+ }
+ else
+ {
+ firstZeroPage = SerialNextPage(serialControl->headPage);
+ isNewPage = SerialPagePrecedesLogically(serialControl->headPage,
+ targetPage);
+ }
+
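+ /* Advance headXid (and headPage, if needed) to cover the xid being added. */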
+ if (!TransactionIdIsValid(serialControl->headXid)
+ || TransactionIdFollows(xid, serialControl->headXid))
+ serialControl->headXid = xid;
+ if (isNewPage)
+ serialControl->headPage = targetPage;
+
+ if (isNewPage)
+ {
+ /* Initialize intervening pages. */
+ while (firstZeroPage != targetPage)
+ {
+ (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage);
+ firstZeroPage = SerialNextPage(firstZeroPage);
+ }
+ slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage);
+ }
+ else
+ slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
+
+ SerialValue(slotno, xid) = minConflictCommitSeqNo;
+ SerialSlruCtl->shared->page_dirty[slotno] = true;
+
+ LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Get the minimum commitSeqNo for any conflict out for the given xid. For
+ * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
+ * will be returned.
+ */
+static SerCommitSeqNo
+SerialGetMinConflictCommitSeqNo(TransactionId xid)
+{
+ TransactionId headXid;
+ TransactionId tailXid;
+ SerCommitSeqNo val;
+ int slotno;
+
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerialSLRULock, LW_SHARED);
+ headXid = serialControl->headXid;
+ tailXid = serialControl->tailXid;
+ LWLockRelease(SerialSLRULock);
+
+ if (!TransactionIdIsValid(headXid))
+ return 0;
+
+ Assert(TransactionIdIsValid(tailXid));
+
+ if (TransactionIdPrecedes(xid, tailXid)
+ || TransactionIdFollows(xid, headXid))
+ return 0;
+
+ /*
+ * The following function must be called without holding SerialSLRULock,
+ * but will return with that lock held, which must then be released.
+ */
+ slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl,
+ SerialPage(xid), xid);
+ val = SerialValue(slotno, xid);
+ LWLockRelease(SerialSLRULock);
+ return val;
+}
+
+/*
+ * Call this whenever there is a new xmin for active serializable
+ * transactions. We don't need to keep information on transactions which
+ * precede that. InvalidTransactionId means none active, so everything in
+ * the SLRU can be discarded.
+ */
+static void
+SerialSetActiveSerXmin(TransactionId xid)
+{
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * When no sxacts are active, nothing overlaps; set the xid values to
+ * invalid to show that there are no valid entries. Don't clear headPage,
+ * though. A new xmin might still land on that page, and we don't want to
+ * repeatedly zero out the same page.
+ */
+ if (!TransactionIdIsValid(xid))
+ {
+ serialControl->tailXid = InvalidTransactionId;
+ serialControl->headXid = InvalidTransactionId;
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ /*
+ * When we're recovering prepared transactions, the global xmin might move
+ * backwards depending on the order in which they're recovered. Normally
+ * that's not
+ * OK, but during recovery no serializable transactions will commit, so
+ * the SLRU is empty and we can get away with it.
+ */
+ if (RecoveryInProgress())
+ {
+ Assert(serialControl->headPage < 0);
+ if (!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdPrecedes(xid, serialControl->tailXid))
+ {
+ serialControl->tailXid = xid;
+ }
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ Assert(!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdFollows(xid, serialControl->tailXid));
+
+ serialControl->tailXid = xid;
+
+ LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * We don't have any data that needs to survive a restart, but this is a
+ * convenient place to truncate the SLRU.
+ */
+void
+CheckPointPredicate(void)
+{
+ int tailPage;
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /* Exit quickly if the SLRU is currently not in use. */
+ if (serialControl->headPage < 0)
+ {
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ if (TransactionIdIsValid(serialControl->tailXid))
+ {
+ /* We can truncate the SLRU up to the page containing tailXid */
+ tailPage = SerialPage(serialControl->tailXid);
+ }
+ else
+ {
+ /*----------
+ * The SLRU is no longer needed. Truncate to head before we set head
+ * invalid.
+ *
+ * XXX: It's possible that the SLRU is not needed again until XID
+ * wrap-around has happened, so that the segment containing headPage
+ * that we leave behind will appear to be new again. In that case it
+ * won't be removed until XID horizon advances enough to make it
+ * current again.
+ *
+ * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
+ * Consider this scenario, starting from a system with no in-progress
+ * transactions and VACUUM FREEZE having maximized oldestXact:
+ * - Start a SERIALIZABLE transaction.
+ * - Start, finish, and summarize a SERIALIZABLE transaction, creating
+ * one SLRU page.
+ * - Consume XIDs to reach xidStopLimit.
+ * - Finish all transactions. Due to the long-running SERIALIZABLE
+ * transaction, earlier checkpoints did not touch headPage. The
+ * next checkpoint will change it, but that checkpoint happens after
+ * the end of the scenario.
+ * - VACUUM to advance XID limits.
+ * - Consume ~2M XIDs, crossing the former xidWrapLimit.
+ * - Start, finish, and summarize a SERIALIZABLE transaction.
+ * SerialAdd() declines to create the targetPage, because headPage
+ * is not regarded as in the past relative to that targetPage. The
+ * transaction instigating the summarize fails in
+ * SimpleLruReadPage().
+ */
+ tailPage = serialControl->headPage;
+ serialControl->headPage = -1;
+ }
+
+ LWLockRelease(SerialSLRULock);
+
+ /* Truncate away pages that are no longer required */
+ SimpleLruTruncate(SerialSlruCtl, tailPage);
+
+ /*
+ * Write dirty SLRU pages to disk
+ *
+ * This is not actually necessary from a correctness point of view. We do
+ * it merely as a debugging aid.
+ *
+ * We're doing this after the truncation to avoid writing pages right
+ * before deleting the file in which they sit, which would be completely
+ * pointless.
+ */
+ SimpleLruWriteAll(SerialSlruCtl, true);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * InitPredicateLocks -- Initialize the predicate locking data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here. Backends inherit the pointers
+ * to the shared tables via fork(). In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables.
+ */
+void
+InitPredicateLocks(void)
+{
+ HASHCTL info;
+ long max_table_size;
+ Size requestSize;
+ bool found;
+
+#ifndef EXEC_BACKEND
+ Assert(!IsUnderPostmaster);
+#endif
+
+ /*
+ * Compute size of predicate lock target hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+
+ /*
+ * Allocate hash table for PREDICATELOCKTARGET structs. This stores
+ * per-predicate-lock-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ info.entrysize = sizeof(PREDICATELOCKTARGET);
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Reserve a dummy entry in the hash table; we use it to make sure there's
+ * always one entry available when we need to split or combine a page,
+ * because running out of space there could mean aborting a
+ * non-serializable transaction.
+ */
+ if (!IsUnderPostmaster)
+ {
+ (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ }
+
+ /* Pre-calculate the hash and partition lock of the scratch entry */
+ ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
+ ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
+
+ /*
+ * Allocate hash table for PREDICATELOCK structs. This stores per
+ * xact-lock-of-a-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTAG);
+ info.entrysize = sizeof(PREDICATELOCK);
+ info.hash = predicatelock_hash;
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ /* Assume an average of 2 xacts per target */
+ max_table_size *= 2;
+
+ PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Compute size for serializable transaction hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = (MaxBackends + max_prepared_xacts);
+
+ /*
+ * Allocate a list to hold information on transactions participating in
+ * predicate locking.
+ *
+ * Assume an average of 10 predicate locking transactions per backend.
+ * This allows aggressive cleanup while detail is still present, before data
+ * must be summarized for storage in the SLRU and the "dummy" transaction.
+ */
+ max_table_size *= 10;
+
+ PredXact = ShmemInitStruct("PredXactList",
+ PredXactListDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ SHMQueueInit(&PredXact->availableList);
+ SHMQueueInit(&PredXact->activeList);
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+ PredXact->WritableSxactCount = 0;
+ PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
+ PredXact->CanPartialClearThrough = 0;
+ PredXact->HavePartialClearedThrough = 0;
+ requestSize = mul_size((Size) max_table_size,
+ PredXactListElementDataSize);
+ PredXact->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(PredXact->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ LWLockInitialize(&PredXact->element[i].sxact.perXactPredicateListLock,
+ LWTRANCHE_PER_XACT_PREDICATE_LIST);
+ SHMQueueInsertBefore(&(PredXact->availableList),
+ &(PredXact->element[i].link));
+ }
+ PredXact->OldCommittedSxact = CreatePredXact();
+ SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
+ PredXact->OldCommittedSxact->prepareSeqNo = 0;
+ PredXact->OldCommittedSxact->commitSeqNo = 0;
+ PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
+ SHMQueueInit(&PredXact->OldCommittedSxact->outConflicts);
+ SHMQueueInit(&PredXact->OldCommittedSxact->inConflicts);
+ SHMQueueInit(&PredXact->OldCommittedSxact->predicateLocks);
+ SHMQueueInit(&PredXact->OldCommittedSxact->finishedLink);
+ SHMQueueInit(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
+ PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
+ PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
+ PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
+ PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
+ PredXact->OldCommittedSxact->pid = 0;
+ }
+ /* This never changes, so let's keep a local copy. */
+ OldCommittedSxact = PredXact->OldCommittedSxact;
+
+ /*
+ * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
+ * information for serializable transactions which have accessed data.
+ */
+ info.keysize = sizeof(SERIALIZABLEXIDTAG);
+ info.entrysize = sizeof(SERIALIZABLEXID);
+
+ SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_FIXED_SIZE);
+
+ /*
+ * Allocate space for tracking rw-conflicts in lists attached to the
+ * transactions.
+ *
+ * Assume an average of 5 conflicts per transaction. Calculations suggest
+ * that this will prevent resource exhaustion in even the most pessimal
+ * loads up to max_connections = 200 with all 200 connections pounding the
+ * database with serializable transactions. Beyond that, there may be
+ * occasional transactions canceled when trying to flag conflicts. That's
+ * probably OK.
+ */
+ max_table_size *= 5;
+
+ RWConflictPool = ShmemInitStruct("RWConflictPool",
+ RWConflictPoolHeaderDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ SHMQueueInit(&RWConflictPool->availableList);
+ requestSize = mul_size((Size) max_table_size,
+ RWConflictDataSize);
+ RWConflictPool->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(RWConflictPool->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ SHMQueueInsertBefore(&(RWConflictPool->availableList),
+ &(RWConflictPool->element[i].outLink));
+ }
+ }
+
+ /*
+ * Create or attach to the header for the list of finished serializable
+ * transactions.
+ */
+ FinishedSerializableTransactions = (SHM_QUEUE *)
+ ShmemInitStruct("FinishedSerializableTransactions",
+ sizeof(SHM_QUEUE),
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ SHMQueueInit(FinishedSerializableTransactions);
+
+ /*
+ * Initialize the SLRU storage for old committed serializable
+ * transactions.
+ */
+ SerialInit();
+}
+
+/*
+ * Estimate shared-memory space used for predicate lock table
+ */
+Size
+PredicateLockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* predicate lock target hash table */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCKTARGET)));
+
+ /* predicate lock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCK)));
+
+ /*
+ * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
+ * margin.
+ */
+ size = add_size(size, size / 10);
+
+ /* transaction list */
+ max_table_size = MaxBackends + max_prepared_xacts;
+ max_table_size *= 10;
+ size = add_size(size, PredXactListDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ PredXactListElementDataSize));
+
+ /* transaction xid table */
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(SERIALIZABLEXID)));
+
+ /* rw-conflict pool */
+ max_table_size *= 5;
+ size = add_size(size, RWConflictPoolHeaderDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ RWConflictDataSize));
+
+ /* Head for list of finished serializable transactions. */
+ size = add_size(size, sizeof(SHM_QUEUE));
+
+ /* Shared memory structures for SLRU tracking of old committed xids. */
+ size = add_size(size, sizeof(SerialControlData));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+
+ return size;
+}
+
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
+ * that PREDICATELOCKs fall into the same partition number as their
+ * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number
+ * to be the low-order bits of the hash code, and therefore a
+ * PREDICATELOCKTAG's hash code must have the same low-order bits as the
+ * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this
+ * specialized hash function.
+ */
+static uint32
+predicatelock_hash(const void *key, Size keysize)
+{
+ const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
+ uint32 targethash;
+
+ Assert(keysize == sizeof(PREDICATELOCKTAG));
+
+ /* Look into the associated target object, and compute its hash code */
+ targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
+
+ return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
+}
+
+
+/*
+ * GetPredicateLockStatusData
+ * Return a table containing the internal state of the predicate
+ * lock manager for use in pg_lock_status.
+ *
+ * Like GetLockStatusData, this function tries to hold the partition LWLocks
+ * for as short a time as possible by returning two arrays that simply
+ * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
+ * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
+ * SERIALIZABLEXACT will likely appear.
+ */
+PredicateLockData *
+GetPredicateLockStatusData(void)
+{
+ PredicateLockData *data;
+ int i;
+ int els,
+ el;
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCK *predlock;
+
+ data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+
+ /*
+ * To ensure consistency, acquire all partition locks in ascending order,
+ * then SerializableXactHashLock.
+ */
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Get number of locks and allocate appropriately-sized arrays. */
+ els = hash_get_num_entries(PredicateLockHash);
+ data->nelements = els;
+ data->locktags = (PREDICATELOCKTARGETTAG *)
+ palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
+ data->xacts = (SERIALIZABLEXACT *)
+ palloc(sizeof(SERIALIZABLEXACT) * els);
+
+
+ /* Scan through PredicateLockHash and copy contents */
+ hash_seq_init(&seqstat, PredicateLockHash);
+
+ el = 0;
+
+ while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
+ {
+ data->locktags[el] = predlock->tag.myTarget->tag;
+ data->xacts[el] = *predlock->tag.myXact;
+ el++;
+ }
+
+ Assert(el == els);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+
+ return data;
+}
+
+/*
+ * Free up shared memory structures by pushing the oldest sxact (the one at
+ * the front of the FinishedSerializableTransactions list) into summary form.
+ * Each call will free exactly one SERIALIZABLEXACT structure and may also
+ * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
+ * PREDICATELOCKTARGET, RWConflictData.
+ */
+static void
+SummarizeOldestCommittedSxact(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /*
+ * This function is only called if there are no sxact slots available.
+ * Some of them must belong to old, already-finished transactions, so
+ * there should be something in the FinishedSerializableTransactions list that
+ * we can summarize. However, there's a race condition: while we were not
+ * holding any locks, a transaction might have ended and cleaned up all
+ * the finished sxact entries already, freeing up their sxact slots. In
+ * that case, we have nothing to do here. The caller will find one of the
+ * slots released by the other backend when it retries.
+ */
+ if (SHMQueueEmpty(FinishedSerializableTransactions))
+ {
+ LWLockRelease(SerializableFinishedListLock);
+ return;
+ }
+
+ /*
+ * Grab the first sxact off the finished list -- this will be the earliest
+ * commit. Remove it from the list.
+ */
+ sxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ SHMQueueDelete(&(sxact->finishedLink));
+
+ /* Add to SLRU summary information. */
+ if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
+ SerialAdd(sxact->topXid, SxactHasConflictOut(sxact)
+ ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo);
+
+ /* Summarize and release the detail. */
+ ReleaseOneSerializableXact(sxact, false, true);
+
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * GetSafeSnapshot
+ * Obtain and register a snapshot for a READ ONLY DEFERRABLE
+ * transaction. Ensures that the snapshot is "safe", i.e. a
+ * read-only transaction running on it can execute serializably
+ * without further checks. This requires waiting for concurrent
+ * transactions to complete, and retrying with a new snapshot if
+ * one of them could possibly create a conflict.
+ *
+ * As with GetSerializableTransactionSnapshot (which this is a subroutine
+ * for), the passed-in Snapshot pointer should reference a static data
+ * area that can safely be passed to GetSnapshotData.
+ */
+static Snapshot
+GetSafeSnapshot(Snapshot origSnapshot)
+{
+ Snapshot snapshot;
+
+ Assert(XactReadOnly && XactDeferrable);
+
+ while (true)
+ {
+ /*
+ * GetSerializableTransactionSnapshotInt is going to call
+ * GetSnapshotData, so we need to provide it the static snapshot area
+ * our caller passed to us. The pointer returned is actually the same
+ * one passed to it, but we avoid assuming that here.
+ */
+ snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
+ NULL, InvalidPid);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return snapshot; /* no concurrent r/w xacts; it's safe */
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Wait for concurrent transactions to finish. Stop early if one of
+ * them marked us as conflicted.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
+ while (!(SHMQueueEmpty(&MySerializableXact->possibleUnsafeConflicts) ||
+ SxactIsROUnsafe(MySerializableXact)))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
+
+ if (!SxactIsROUnsafe(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ break; /* success */
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ /* else, need to retry... */
+ ereport(DEBUG2,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
+ ReleasePredicateLocks(false, false);
+ }
+
+ /*
+ * Now we have a safe snapshot, so we don't need to do any further checks.
+ */
+ Assert(SxactIsROSafe(MySerializableXact));
+ ReleasePredicateLocks(false, true);
+
+ return snapshot;
+}
+
+/*
+ * GetSafeSnapshotBlockingPids
+ * If the specified process is currently blocked in GetSafeSnapshot,
+ * write the process IDs of all processes that it is blocked by
+ * into the caller-supplied buffer output[]. The list is truncated at
+ * output_size, and the number of PIDs written into the buffer is
+ * returned. Returns zero if the given PID is not currently blocked
+ * in GetSafeSnapshot.
+ */
+int
+GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
+{
+ int num_written = 0;
+ SERIALIZABLEXACT *sxact;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
+ for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
+ {
+ if (sxact->pid == blocked_pid)
+ break;
+ }
+
+ /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
+ if (sxact != NULL && SxactIsDeferrableWaiting(sxact))
+ {
+ RWConflict possibleUnsafeConflict;
+
+ /* Traverse the list of possible unsafe conflicts collecting PIDs. */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &sxact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+
+ while (possibleUnsafeConflict != NULL && num_written < output_size)
+ {
+ output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&sxact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ return num_written;
+}
+
+/*
+ * Acquire a snapshot that can be used for the current transaction.
+ *
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in PredXact.
+ *
+ * The passed-in Snapshot pointer should reference a static data area that
+ * can safely be passed to GetSnapshotData. The return value is actually
+ * always this same pointer; no new snapshot data structure is allocated
+ * within this function.
+ */
+Snapshot
+GetSerializableTransactionSnapshot(Snapshot snapshot)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * Can't use serializable mode while recovery is still active, as it is,
+ * for example, on a hot standby. We could get here despite the check in
+ * check_XactIsoLevel() if default_transaction_isolation is set to
+ * serializable, so phrase the hint accordingly.
+ */
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot use serializable mode in a hot standby"),
+ errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
+ errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
+
+ /*
+ * A special optimization is available for SERIALIZABLE READ ONLY
+ * DEFERRABLE transactions -- we can wait for a suitable snapshot and
+ * thereby avoid all SSI overhead once it's running.
+ */
+ if (XactReadOnly && XactDeferrable)
+ return GetSafeSnapshot(snapshot);
+
+ return GetSerializableTransactionSnapshotInt(snapshot,
+ NULL, InvalidPid);
+}
+
+/*
+ * Import a snapshot to be used for the current transaction.
+ *
+ * This is nearly the same as GetSerializableTransactionSnapshot, except that
+ * we don't take a new snapshot, but rather use the data we're handed.
+ *
+ * The caller must have verified that the snapshot came from a serializable
+ * transaction; and if we're read-write, the source transaction must not be
+ * read-only.
+ */
+void
+SetSerializableTransactionSnapshot(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * If this is called by parallel.c in a parallel worker, we don't want to
+ * create a SERIALIZABLEXACT just yet because the leader's
+ * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We
+ * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this
+ * case, because the leader has already determined that the snapshot it
+ * has passed us is safe. So there is nothing for us to do.
+ */
+ if (IsParallelWorker())
+ return;
+
+ /*
+ * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
+ * import snapshots, since there's no way to wait for a safe snapshot when
+ * we're using the snapshot we're told to use. (XXX instead of throwing an error,
+ * we could just ignore the XactDeferrable flag?)
+ */
+ if (XactReadOnly && XactDeferrable)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
+
+ (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
+ sourcepid);
+}
+
+/*
+ * Guts of GetSerializableTransactionSnapshot
+ *
+ * If sourcevxid is valid, this is actually an import operation and we should
+ * skip calling GetSnapshotData, because the snapshot contents are already
+ * loaded up. HOWEVER: to avoid race conditions, we must check that the
+ * source xact is still running after we acquire SerializableXactHashLock.
+ * We do that by calling ProcArrayInstallImportedXmin.
+ */
+static Snapshot
+GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ PGPROC *proc;
+ VirtualTransactionId vxid;
+ SERIALIZABLEXACT *sxact,
+ *othersxact;
+
+ /* We only do this for serializable transactions. Once. */
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Since all parts of a serializable transaction must use the same
+ * snapshot, it is too late to establish one after a parallel operation
+ * has begun.
+ */
+ if (IsInParallelMode())
+ elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
+
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ /*
+ * First we get the sxact structure, which may involve looping and access
+ * to the "finished" list to free a structure for use.
+ *
+ * We must hold SerializableXactHashLock when taking/checking the snapshot
+ * to avoid race conditions, for much the same reasons that
+ * GetSnapshotData takes the ProcArrayLock. Since we might have to
+ * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
+ * this means we have to create the sxact first, which is a bit annoying
+ * (in particular, an elog(ERROR) in procarray.c would cause us to leak
+ * the sxact). Consider refactoring to avoid this.
+ */
+#ifdef TEST_SUMMARIZE_SERIAL
+ SummarizeOldestCommittedSxact();
+#endif
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ do
+ {
+ sxact = CreatePredXact();
+ /* If null, push out committed sxact to SLRU summary & retry. */
+ if (!sxact)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ SummarizeOldestCommittedSxact();
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ } while (!sxact);
+
+ /* Get the snapshot, or check that it's safe to use */
+ if (!sourcevxid)
+ snapshot = GetSnapshotData(snapshot);
+ else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not import the requested snapshot"),
+ errdetail("The source process with PID %d is not running anymore.",
+ sourcepid)));
+ }
+
+ /*
+ * If there are no serializable transactions which are not read-only, we
+ * can "opt out" of predicate locking and conflict checking for a
+ * read-only transaction.
+ *
+ * The reason this is safe is that a read-only transaction can only become
+ * part of a dangerous structure if it overlaps a writable transaction
+ * which in turn overlaps a writable transaction which committed before
+ * the read-only transaction started. A new writable transaction can
+ * overlap this one, but it can't meet the other condition of overlapping
+ * a transaction which committed before this one started.
+ */
+ if (XactReadOnly && PredXact->WritableSxactCount == 0)
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ return snapshot;
+ }
+
+ /* Maintain serializable global xmin info. */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ {
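+ /* No serializable global xmin is tracked yet; establish it from our snapshot. */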
+ Assert(PredXact->SxactGlobalXminCount == 0);
+ PredXact->SxactGlobalXmin = snapshot->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(snapshot->xmin);
+ }
+ else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+ else
+ {
+ Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
+ }
+
+ /* Initialize the structure. */
+ sxact->vxid = vxid;
+ sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
+ sxact->prepareSeqNo = InvalidSerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ SHMQueueInit(&(sxact->outConflicts));
+ SHMQueueInit(&(sxact->inConflicts));
+ SHMQueueInit(&(sxact->possibleUnsafeConflicts));
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ sxact->pid = MyProcPid;
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueElemInit(&(sxact->finishedLink));
+ sxact->flags = 0;
+ if (XactReadOnly)
+ {
+ sxact->flags |= SXACT_FLAG_READ_ONLY;
+
+ /*
+ * Register all concurrent r/w transactions as possible conflicts; if
+ * all of them commit without any outgoing conflicts to earlier
+ * transactions then this snapshot can be deemed safe (and we can run
+ * without tracking predicate locks).
+ */
+ for (othersxact = FirstPredXact();
+ othersxact != NULL;
+ othersxact = NextPredXact(othersxact))
+ {
+ if (!SxactIsCommitted(othersxact)
+ && !SxactIsDoomed(othersxact)
+ && !SxactIsReadOnly(othersxact))
+ {
+ SetPossibleUnsafeConflict(sxact, othersxact);
+ }
+ }
+ }
+ else
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ MySerializableXact = sxact;
+ MyXactDidWrite = false; /* haven't written anything yet */
+
+ LWLockRelease(SerializableXactHashLock);
+
+ CreateLocalPredicateLockHash();
+
+ return snapshot;
+}
+
+static void
+CreateLocalPredicateLockHash(void)
+{
+ HASHCTL hash_ctl;
+
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * Register the top level XID in SerializableXidHash.
+ * Also store it for easy reference in MySerializableXact.
+ */
+void
+RegisterPredicateLockingXid(TransactionId xid)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ bool found;
+
+ /*
+ * If we're not tracking predicate lock data for this transaction, we
+ * should ignore the request and return quickly.
+ */
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* We should have a valid XID and be at the top level. */
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* This should only be done once per transaction. */
+ Assert(MySerializableXact->topXid == InvalidTransactionId);
+
+ MySerializableXact->topXid = xid;
+
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /* Initialize the structure. */
+ sxid->myXact = MySerializableXact;
+ LWLockRelease(SerializableXactHashLock);
+}
+
+
+/*
+ * Check whether there are any predicate locks held by any transaction
+ * for the page at the given block number.
+ *
+ * Note that the transaction may be completed but not yet subject to
+ * cleanup due to overlapping serializable transactions. This must
+ * return valid information regardless of transaction isolation level.
+ *
+ * Also note that this doesn't check for a conflicting relation lock,
+ * just a lock specifically on the given page.
+ *
+ * One use is to support proper behavior during GiST index vacuum.
+ */
+bool
+PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &targettag, targettaghash,
+ HASH_FIND, NULL);
+ LWLockRelease(partitionLock);
+
+ return (target != NULL);
+}
+
+
+/*
+ * Check whether a particular lock is held by this transaction.
+ *
+ * Important note: this function may return false even if the lock is
+ * being held, because it uses the local lock table which is not
+ * updated if another transaction modifies our lock list (e.g. to
+ * split an index page). However, it will never return true if the
+ * lock is not held. Be careful to only use this function in
+ * circumstances where such false negatives are acceptable.
+ */
+static bool
+PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+{
+ LOCALPREDICATELOCK *lock;
+
+ /* check local hash table */
+ lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ targettag,
+ HASH_FIND, NULL);
+
+ if (!lock)
+ return false;
+
+ /*
+ * Found entry in the table, but still need to check whether it's actually
+ * held -- it could just be a parent of some held lock.
+ */
+ return lock->held;
+}
+
+/*
+ * Return the parent lock tag in the lock hierarchy: the next coarser
+ * lock that covers the provided tag.
+ *
+ * Returns true and sets *parent to the parent tag if one exists,
+ * returns false if none exists.
+ */
+static bool
+GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
+ /* relation locks have no parent lock */
+ return false;
+
+ case PREDLOCKTAG_PAGE:
+ /* parent lock is relation lock */
+ SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+
+ return true;
+
+ case PREDLOCKTAG_TUPLE:
+ /* parent lock is page lock */
+ SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+ GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+ return true;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return false;
+}
+
+/*
+ * Check whether the lock we are considering is already covered by a
+ * coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
+ */
+static bool
+CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ parenttag;
+
+ targettag = *newtargettag;
+
+ /* check parents iteratively until no more */
+ while (GetParentPredicateLockTag(&targettag, &parenttag))
+ {
+ targettag = parenttag;
+ if (PredicateLockExists(&targettag))
+ return true;
+ }
+
+ /* no more parents to check; lock is not covered */
+ return false;
+}
+
+/*
+ * Remove the dummy entry from the predicate lock target hash, to free up some
+ * scratch space. The caller must be holding SerializablePredicateListLock,
+ * and must restore the entry with RestoreScratchTarget() before releasing the
+ * lock.
+ *
+ * If lockheld is true, the caller is already holding the partition lock
+ * of the partition containing the scratch entry.
+ */
+static void
+RemoveScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_REMOVE, &found);
+ Assert(found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * Re-insert the dummy entry in predicate lock target hash.
+ */
+static void
+RestoreScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * Check whether the list of related predicate locks is empty for a
+ * predicate lock target, and remove the target if it is.
+ */
+static void
+RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ /* Can't remove it until no locks at this target. */
+ if (!SHMQueueEmpty(&target->predicateLocks))
+ return;
+
+ /* Actually remove the target. */
+ rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &target->tag,
+ targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == target);
+}
+
+/*
+ * Delete child target locks owned by this process.
+ * This implementation assumes that the usage of each target tag field
+ * is uniform. No need to make this hard if we don't have to.
+ *
+ * We acquire an LWLock in the case of parallel mode, because worker
+ * backends have access to the leader's SERIALIZABLEXACT. Otherwise,
+ * we aren't acquiring LWLocks for the predicate lock or lock
+ * target structures associated with this transaction unless we're going
+ * to modify them, because no other process is permitted to modify our
+ * locks.
+ */
+static void
+DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ SERIALIZABLEXACT *sxact;
+ PREDICATELOCK *predlock;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ sxact = MySerializableXact;
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ SHM_QUEUE *predlocksxactlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG oldlocktag;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGETTAG oldtargettag;
+
+ predlocksxactlink = &(predlock->xactLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ predlocksxactlink,
+ offsetof(PREDICATELOCK, xactLink));
+
+ oldlocktag = predlock->tag;
+ Assert(oldlocktag.myXact == sxact);
+ oldtarget = oldlocktag.myTarget;
+ oldtargettag = oldtarget->tag;
+
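+ /* If the new (coarser) lock covers this one, it is now redundant; drop it. */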
+ if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+ {
+ uint32 oldtargettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(predlocksxactlink);
+ SHMQueueDelete(&(predlock->targetLink));
+ rmpredlock = hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
+ oldtargettaghash),
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+
+ LWLockRelease(partitionLock);
+
+ DecrementParentLocks(&oldtargettag);
+ }
+
+ predlock = nextpredlock;
+ }
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * Returns the promotion limit for a given predicate lock target. This is the
+ * max number of descendant locks allowed before promoting to the specified
+ * tag. Note that the limit includes non-direct descendants (e.g., both tuples
+ * and pages for a relation lock).
+ *
+ * Currently the default limit is 2 for a page lock, and half the value of
+ * max_pred_locks_per_transaction, minus 1, for a relation lock, to match the
+ * behavior of earlier releases when upgrading.
+ *
+ * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
+ * of page and tuple locks based on the pages in a relation, and the maximum
+ * ratio of tuple locks to tuples in a page. This would provide more
+ * generally "balanced" allocation of locks to where they are most useful,
+ * while still allowing the absolute numbers to prevent one relation from
+ * tying up all predicate lock resources.
+ */
+static int
+MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
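+ /*
+ * For example, with the default max_pred_locks_per_transaction = 64
+ * and max_pred_locks_per_relation = -2, this yields 64 / 2 - 1 = 31.
+ */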
+ return max_predicate_locks_per_relation < 0
+ ? (max_predicate_locks_per_xact
+ / (-max_predicate_locks_per_relation)) - 1
+ : max_predicate_locks_per_relation;
+
+ case PREDLOCKTAG_PAGE:
+ return max_predicate_locks_per_page;
+
+ case PREDLOCKTAG_TUPLE:
+
+ /*
+ * not reachable: nothing is finer-granularity than a tuple, so we
+ * should never try to promote to it.
+ */
+ Assert(false);
+ return 0;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return 0;
+}
+
+/*
+ * For all ancestors of a newly-acquired predicate lock, increment
+ * their child count in the parent hash table. If any of them have
+ * more descendants than their promotion threshold, acquire the
+ * coarsest such lock.
+ *
+ * Returns true if a parent lock was acquired and false otherwise.
+ */
+static bool
+CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ nexttag,
+ promotiontag;
+ LOCALPREDICATELOCK *parentlock;
+ bool found,
+ promote;
+
+ promote = false;
+
+ targettag = *reqtag;
+
+ /* check parents iteratively */
+ while (GetParentPredicateLockTag(&targettag, &nexttag))
+ {
+ targettag = nexttag;
+ parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ &targettag,
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ parentlock->held = false;
+ parentlock->childLocks = 1;
+ }
+ else
+ parentlock->childLocks++;
+
+ if (parentlock->childLocks >
+ MaxPredicateChildLocks(&targettag))
+ {
+ /*
+ * We should promote to this parent lock. Continue to check its
+ * ancestors, however, both to get their child counts right and to
+ * check whether we should just go ahead and promote to one of
+ * them.
+ */
+ promotiontag = targettag;
+ promote = true;
+ }
+ }
+
+ if (promote)
+ {
+ /* acquire coarsest ancestor eligible for promotion */
+ PredicateLockAcquire(&promotiontag);
+ return true;
+ }
+ else
+ return false;
+}
+
+/*
+ * When releasing a lock, decrement the child count on all ancestor
+ * locks.
+ *
+ * This is called only when releasing a lock via
+ * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+ * we've acquired its parent, possibly due to promotion) or when a new
+ * MVCC write lock makes the predicate lock unnecessary. There's no
+ * point in calling it when locks are released at transaction end, as
+ * this information is no longer needed.
+ */
+static void
+DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+{
+ PREDICATELOCKTARGETTAG parenttag,
+ nexttag;
+
+ parenttag = *targettag;
+
+ while (GetParentPredicateLockTag(&parenttag, &nexttag))
+ {
+ uint32 targettaghash;
+ LOCALPREDICATELOCK *parentlock,
+ *rmlock PG_USED_FOR_ASSERTS_ONLY;
+
+ parenttag = nexttag;
+ targettaghash = PredicateLockTargetTagHashCode(&parenttag);
+ parentlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_FIND, NULL);
+
+ /*
+ * There's a small chance the parent lock doesn't exist in the lock
+ * table. This can happen if we prematurely removed it because an
+ * index split caused the child refcount to be off.
+ */
+ if (parentlock == NULL)
+ continue;
+
+ parentlock->childLocks--;
+
+ /*
+ * Under similar circumstances the parent lock's child count might already
+ * be zero, in which case the decrement just made it negative. This only
+ * happens if we're holding that lock (otherwise we would have removed the
+ * entry), so simply clamp it back to zero.
+ */
+ if (parentlock->childLocks < 0)
+ {
+ Assert(parentlock->held);
+ parentlock->childLocks = 0;
+ }
+
+ if ((parentlock->childLocks == 0) && (!parentlock->held))
+ {
+ rmlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlock == parentlock);
+ }
+ }
+}
+
+/*
+ * Indicate that a predicate lock on the given target is held by the
+ * specified transaction. Has no effect if the lock is already held.
+ *
+ * This updates the lock table and the sxact's lock list, and creates
+ * the lock target if necessary, but does *not* do anything related to
+ * granularity promotion or the local lock table. See
+ * PredicateLockAcquire for that.
+ */
+static void
+CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact)
+{
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTAG locktag;
+ PREDICATELOCK *lock;
+ LWLock *partitionLock;
+ bool found;
+
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /* Make sure that the target is represented. */
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_ENTER_NULL, &found);
+ if (!target)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+ if (!found)
+ SHMQueueInit(&(target->predicateLocks));
+
+ /* We've got the sxact and target, make sure they're joined. */
+ locktag.myTarget = target;
+ locktag.myXact = sxact;
+ lock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash, &locktag,
+ PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!lock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink));
+ SHMQueueInsertBefore(&(sxact->predicateLocks),
+ &(lock->xactLink));
+ lock->commitSeqNo = InvalidSerCommitSeqNo;
+ }
+
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * Acquire a predicate lock on the specified target for the current
+ * connection if not already held. This updates the local lock table
+ * and uses it to implement granularity promotion. It will consolidate
+ * multiple locks into a coarser lock if warranted, and will release
+ * any finer-grained locks covered by the new one.
+ */
+static void
+PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ bool found;
+ LOCALPREDICATELOCK *locallock;
+
+ /* Do we have the lock already, or a covering lock? */
+ if (PredicateLockExists(targettag))
+ return;
+
+ if (CoarserLockCovers(targettag))
+ return;
+
+	/* The same hash and LW lock apply to the lock target and the local lock. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+
+ /* Acquire lock in local table */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_ENTER, &found);
+ locallock->held = true;
+ if (!found)
+ locallock->childLocks = 0;
+
+ /* Actually create the lock */
+ CreatePredicateLock(targettag, targettaghash, MySerializableXact);
+
+ /*
+ * Lock has been acquired. Check whether it should be promoted to a
+ * coarser granularity, or whether there are finer-granularity locks to
+ * clean up.
+ */
+ if (CheckAndPromotePredicateLockRequest(targettag))
+ {
+ /*
+ * Lock request was promoted to a coarser-granularity lock, and that
+ * lock was acquired. It will delete this lock and any of its
+ * children, so we're done.
+ */
+ }
+ else
+ {
+ /* Clean up any finer-granularity locks */
+ if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+ DeleteChildTargetLocks(targettag);
+ }
+}
+
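+/*
+ * A note on granularity: predicate lock targets form a hierarchy of
+ * relation -> page -> tuple.  PredicateLockAcquire consults the local lock
+ * table so that, roughly speaking, once enough tuples on a single page are
+ * locked the request is promoted to one page lock, and once enough pages of
+ * a relation are locked it is promoted to one relation lock, after which the
+ * now-redundant finer-grained locks are released.
+ */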
+
+/*
+ * PredicateLockRelation
+ *
+ * Gets a predicate lock at the relation level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockRelation(Relation relation, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ PredicateLockAcquire(&tag);
+}
+
+/*
+ * PredicateLockPage
+ *
+ * Gets a predicate lock at the page level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Skip if a coarser predicate lock already covers this page.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+ PredicateLockAcquire(&tag);
+}
+
+/*
+ * PredicateLockTID
+ *
+ * Gets a predicate lock at the tuple level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ */
+void
+PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TransactionId tuple_xid)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /*
+ * Return if this xact wrote it.
+ */
+ if (relation->rd_index == NULL)
+ {
+		/* If we wrote it, we already have a write lock. */
+ if (TransactionIdIsCurrentTransactionId(tuple_xid))
+ return;
+ }
+
+ /*
+ * Do quick-but-not-definitive test for a relation lock first. This will
+ * never cause a return when the relation is *not* locked, but will
+ * occasionally let the check continue when there really *is* a relation
+ * level lock.
+ */
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ if (PredicateLockExists(&tag))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+}
+
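+/*
+ * Taken together, PredicateLockRelation, PredicateLockPage and
+ * PredicateLockTID let callers register SIREAD locks at whatever granularity
+ * matches their access pattern: a whole-relation scan locks the relation, a
+ * scan of an individual page locks that page, and a fetch of a single
+ * visible tuple locks just that tuple.
+ */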
+
+/*
+ * DeleteLockTarget
+ *
+ * Remove a predicate lock target along with any locks held for it.
+ *
+ * Caller must hold SerializablePredicateListLock and the
+ * appropriate hash partition lock for the target.
+ */
+static void
+DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ PREDICATELOCK *predlock;
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ bool found;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+ Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (predlock)
+ {
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+
+ SHMQueueDelete(&(predlock->xactLink));
+ SHMQueueDelete(&(predlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &predlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
+ targettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /* Remove the target itself, if possible. */
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+}
+
+
+/*
+ * TransferPredicateLocksToNewTarget
+ *
+ * Move or copy all the predicate locks for a lock target, for use by
+ * index page splits/combines and other things that create or replace
+ * lock targets. If 'removeOld' is true, the old locks and the target
+ * will be removed.
+ *
+ * Returns true on success, or false if we ran out of shared memory to
+ * allocate the new target or locks. Guaranteed to always succeed if
+ * removeOld is set (by using the scratch entry in PredicateLockTargetHash
+ * for scratch space).
+ *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
+ * Caller must hold SerializablePredicateListLock exclusively.
+ */
+static bool
+TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld)
+{
+ uint32 oldtargettaghash;
+ LWLock *oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLock *newpartitionLock;
+ bool found;
+ bool outOfShmem = false;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+ if (removeOld)
+ {
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll
+ * be able to create the new lock target.
+ */
+ RemoveScratchTarget(false);
+ }
+
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and move or copy them to the new
+ * target.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ PREDICATELOCK *oldpredlock;
+ PREDICATELOCKTAG newpredlocktag;
+
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER_NULL, &found);
+
+ if (!newtarget)
+ {
+ /* Failed to allocate due to insufficient shmem */
+ outOfShmem = true;
+ goto exit;
+ }
+
+ /* If we created a new entry, initialize it */
+ if (!found)
+ SHMQueueInit(&(newtarget->predicateLocks));
+
+ newpredlocktag.myTarget = newtarget;
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ while (oldpredlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
+
+ predlocktargetlink = &(oldpredlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+ if (removeOld)
+ {
+ SHMQueueDelete(&(oldpredlock->xactLink));
+ SHMQueueDelete(&(oldpredlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldpredlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
+ oldtargettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+ }
+
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ newtargettaghash),
+ HASH_ENTER_NULL,
+ &found);
+ if (!newpredlock)
+ {
+ /* Out of shared memory. Undo what we've done so far. */
+ LWLockRelease(SerializableXactHashLock);
+ DeleteLockTarget(newtarget, newtargettaghash);
+ outOfShmem = true;
+ goto exit;
+ }
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(newtarget->predicateLocks),
+ &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+
+ oldpredlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ if (removeOld)
+ {
+ Assert(SHMQueueEmpty(&oldtarget->predicateLocks));
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+ }
+ }
+
+
+exit:
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+
+ if (removeOld)
+ {
+		/* We shouldn't run out of shared memory if we're moving locks */
+ Assert(!outOfShmem);
+
+ /* Put the scratch entry back */
+ RestoreScratchTarget(false);
+ }
+
+ return !outOfShmem;
+}
+
+/*
+ * Drop all predicate locks of any granularity from the specified relation,
+ * which can be a heap relation or an index relation. If 'transfer' is true,
+ * acquire a relation lock on the heap for any transactions with any lock(s)
+ * on the specified relation.
+ *
+ * This requires grabbing a lot of LW locks and scanning the entire lock
+ * target table for matches. That makes this more expensive than most
+ * predicate lock management functions, but it will only be called for DDL
+ * type commands that are expensive anyway, and there are fast returns when
+ * no serializable transactions are active or the relation is temporary.
+ *
+ * We don't use the TransferPredicateLocksToNewTarget function because it
+ * acquires its own locks on the partitions of the two targets involved,
+ * and we'll already be holding all partition locks.
+ *
+ * We can't throw an error from here, because the call could be from a
+ * transaction which is not serializable.
+ *
+ * NOTE: This is currently only called with transfer set to true, but that may
+ * change. If we decide to clean up the locks from a table on commit of a
+ * transaction which executed DROP TABLE, the false condition will be useful.
+ */
+static void
+DropAllPredicateLocksFromTable(Relation relation, bool transfer)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGET *heaptarget;
+ Oid dbId;
+ Oid relId;
+ Oid heapId;
+ int i;
+ bool isIndex;
+ bool found;
+ uint32 heaptargettaghash;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ dbId = relation->rd_node.dbNode;
+ relId = relation->rd_id;
+ if (relation->rd_index == NULL)
+ {
+ isIndex = false;
+ heapId = relId;
+ }
+ else
+ {
+ isIndex = true;
+ heapId = relation->rd_index->indrelid;
+ }
+ Assert(heapId != InvalidOid);
+ Assert(transfer || !isIndex); /* index OID only makes sense with
+ * transfer */
+
+ /* Retrieve first time needed, then keep. */
+ heaptargettaghash = 0;
+ heaptarget = NULL;
+
+ /* Acquire locks on all lock partitions */
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll be
+ * able to create the new lock target.
+ */
+ if (transfer)
+ RemoveScratchTarget(true);
+
+ /* Scan through target map */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ PREDICATELOCK *oldpredlock;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
+ continue; /* wrong database id */
+ if (transfer && !isIndex
+ && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
+ continue; /* already the right lock */
+
+ /*
+ * If we made it here, we have work to do. We make sure the heap
+ * relation lock exists, then we walk the list of predicate locks for
+ * the old target we found, moving all locks to the heap relation lock
+ * -- unless they already hold that.
+ */
+
+ /*
+ * First make sure we have the heap relation target. We only need to
+ * do this once.
+ */
+ if (transfer && heaptarget == NULL)
+ {
+ PREDICATELOCKTARGETTAG heaptargettag;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
+ heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
+ heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &heaptargettag,
+ heaptargettaghash,
+ HASH_ENTER, &found);
+ if (!found)
+ SHMQueueInit(&heaptarget->predicateLocks);
+ }
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ oldpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldtarget->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ while (oldpredlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo;
+ SERIALIZABLEXACT *oldXact;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(oldtarget->predicateLocks),
+ &(oldpredlock->targetLink),
+ offsetof(PREDICATELOCK, targetLink));
+
+ /*
+ * Remove the old lock first. This avoids the chance of running
+ * out of lock structure entries for the hash table.
+ */
+ oldCommitSeqNo = oldpredlock->commitSeqNo;
+ oldXact = oldpredlock->tag.myXact;
+
+ SHMQueueDelete(&(oldpredlock->xactLink));
+
+ /*
+ * No need for retail delete from oldtarget list, we're removing
+ * the whole target anyway.
+ */
+ hash_search(PredicateLockHash,
+ &oldpredlock->tag,
+ HASH_REMOVE, &found);
+ Assert(found);
+
+ if (transfer)
+ {
+ PREDICATELOCKTAG newpredlocktag;
+
+ newpredlocktag.myTarget = heaptarget;
+ newpredlocktag.myXact = oldXact;
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ heaptargettaghash),
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ SHMQueueInsertBefore(&(heaptarget->predicateLocks),
+ &(newpredlock->targetLink));
+ SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+ }
+
+ oldpredlock = nextpredlock;
+ }
+
+ hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
+ &found);
+ Assert(found);
+ }
+
+ /* Put the scratch entry back */
+ if (transfer)
+ RestoreScratchTarget(true);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * TransferPredicateLocksToHeapRelation
+ * For all transactions, transfer all predicate locks for the given
+ * relation to a single relation lock on the heap.
+ */
+void
+TransferPredicateLocksToHeapRelation(Relation relation)
+{
+ DropAllPredicateLocksFromTable(relation, true);
+}
+
+
+/*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock, which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the
+ * lock-holding processes coming back through this code and noticing.
+ */
+void
+PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ bool success;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ *
+ * It's safe to do this check without taking any additional locks. Even if
+ * a serializable transaction starts concurrently, we know it can't take
+ * any SIREAD locks on the page being split because the caller is holding
+ * the associated buffer page lock. Memory reordering isn't an issue; the
+ * memory barrier in the LWLock acquisition guarantees that this read
+ * occurs while the buffer page lock is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ newblkno);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+
+ /*
+ * Try copying the locks over to the new page's tag, creating it if
+ * necessary.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ false);
+
+ if (!success)
+ {
+ /*
+ * No more predicate lock entries are available. Failure isn't an
+ * option here, so promote the page lock to a relation lock.
+ */
+
+ /* Get the parent relation lock's lock tag */
+ success = GetParentPredicateLockTag(&oldtargettag,
+ &newtargettag);
+ Assert(success);
+
+ /*
+ * Move the locks to the parent. This shouldn't fail.
+ *
+ * Note that here we are removing locks held by other backends,
+ * leading to a possible inconsistency in their local lock hash table.
+ * This is OK because we're replacing it with a lock that covers the
+ * old one.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ true);
+ Assert(success);
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
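+
+/*
+ * PredicateLockPageSplit is typically invoked by an index AM (for example,
+ * btree) while it still holds the buffer locks involved in the split, which
+ * is what makes the quick SxactGlobalXmin check above safe.
+ */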
+
+/*
+ * PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page combine affects all serializable transactions, even if it
+ * occurs in the context of another transaction isolation level.
+ */
+void
+PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ /*
+ * Page combines differ from page splits in that we ought to be able to
+ * remove the locks on the old page after transferring them to the new
+ * page, instead of duplicating them. However, because we can't edit other
+ * backends' local lock tables, removing the old lock would leave them
+ * with an entry in their LocalPredicateLockHash for a lock they're not
+ * holding, which isn't acceptable. So we wind up having to do the same
+ * work as a page split, acquiring a lock on the new page and keeping the
+ * old page locked too. That can lead to some false positives, but should
+ * be rare in practice.
+ */
+ PredicateLockPageSplit(relation, oldblkno, newblkno);
+}
+
+/*
+ * Walk the list of in-progress serializable transactions and find the new
+ * xmin.
+ */
+static void
+SetNewSxactGlobalXmin(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+
+ for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact))
+ {
+ if (!SxactIsRolledBack(sxact)
+ && !SxactIsCommitted(sxact)
+ && sxact != OldCommittedSxact)
+ {
+ Assert(sxact->xmin != InvalidTransactionId);
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedes(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ }
+ else if (TransactionIdEquals(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ PredXact->SxactGlobalXminCount++;
+ }
+ }
+
+ SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
+}
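+
+/*
+ * SxactGlobalXmin is the oldest xmin among the still-active serializable
+ * transactions, and SxactGlobalXminCount tracks how many of them share that
+ * xmin.  ReleasePredicateLocks (below) decrements the count when such a
+ * transaction finishes, and once it reaches zero recomputes the value here
+ * and triggers ClearOldPredicateLocks to discard data that no remaining
+ * transaction can care about.
+ */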
+
+/*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current transaction,
+ * whether committed or rolled back. It can also be called for a read only
+ * transaction when it becomes impossible for the transaction to become
+ * part of a dangerous structure.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transactions still
+ * holding locks.
+ *
+ * If isReadOnlySafe is true, then predicate locks are being released before
+ * the end of the transaction because MySerializableXact has been determined
+ * to be RO_SAFE. In non-parallel mode we can release it completely, but it
+ * in parallel mode we partially release the SERIALIZABLEXACT and keep it
+ * around until the end of the transaction, allowing each backend to clear its
+ * MySerializableXact variable and benefit from the optimization in its own
+ * time.
+ */
+void
+ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+{
+ bool needToClear;
+ RWConflict conflict,
+ nextConflict,
+ possibleUnsafeConflict;
+ SERIALIZABLEXACT *roXact;
+
+ /*
+ * We can't trust XactReadOnly here, because a transaction which started
+ * as READ WRITE can show as READ ONLY later, e.g., within
+ * subtransactions. We want to flag a transaction as READ ONLY if it
+	 * commits without writing, so that de facto READ ONLY transactions get
+	 * the benefit of some RO optimizations; we therefore use this local
+	 * variable to get the cleanup logic right, which depends on whether the
+	 * transaction was declared READ ONLY at the top level.
+ */
+ bool topLevelIsDeclaredReadOnly;
+
+ /* We can't be both committing and releasing early due to RO_SAFE. */
+ Assert(!(isCommit && isReadOnlySafe));
+
+ /* Are we at the end of a transaction, that is, a commit or abort? */
+ if (!isReadOnlySafe)
+ {
+ /*
+ * Parallel workers mustn't release predicate locks at the end of
+ * their transaction. The leader will do that at the end of its
+ * transaction.
+ */
+ if (IsParallelWorker())
+ {
+ ReleasePredicateLocksLocal();
+ return;
+ }
+
+ /*
+ * By the time the leader in a parallel query reaches end of
+ * transaction, it has waited for all workers to exit.
+ */
+ Assert(!ParallelContextActive());
+
+ /*
+ * If the leader in a parallel query earlier stashed a partially
+ * released SERIALIZABLEXACT for final clean-up at end of transaction
+ * (because workers might still have been accessing it), then it's
+ * time to restore it.
+ */
+ if (SavedSerializableXact != InvalidSerializableXact)
+ {
+ Assert(MySerializableXact == InvalidSerializableXact);
+ MySerializableXact = SavedSerializableXact;
+ SavedSerializableXact = InvalidSerializableXact;
+ Assert(SxactIsPartiallyReleased(MySerializableXact));
+ }
+ }
+
+ if (MySerializableXact == InvalidSerializableXact)
+ {
+ Assert(LocalPredicateLockHash == NULL);
+ return;
+ }
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * If the transaction is committing, but it has been partially released
+	 * already, then treat this as a rollback; it was marked as rolled back
+	 * when it was partially released.
+ */
+ if (isCommit && SxactIsPartiallyReleased(MySerializableXact))
+ isCommit = false;
+
+ /*
+ * If we're called in the middle of a transaction because we discovered
+ * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
+ * it (that is, release the predicate locks and conflicts, but not the
+ * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
+ */
+ if (isReadOnlySafe && IsInParallelMode())
+ {
+ /*
+ * The leader needs to stash a pointer to it, so that it can
+ * completely release it at end-of-transaction.
+ */
+ if (!IsParallelWorker())
+ SavedSerializableXact = MySerializableXact;
+
+ /*
+ * The first backend to reach this condition will partially release
+ * the SERIALIZABLEXACT. All others will just clear their
+ * backend-local state so that they stop doing SSI checks for the rest
+ * of the transaction.
+ */
+ if (SxactIsPartiallyReleased(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ReleasePredicateLocksLocal();
+ return;
+ }
+ else
+ {
+ MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED;
+ /* ... and proceed to perform the partial release below. */
+ }
+ }
+ Assert(!isCommit || SxactIsPrepared(MySerializableXact));
+ Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
+ Assert(!SxactIsCommitted(MySerializableXact));
+ Assert(SxactIsPartiallyReleased(MySerializableXact)
+ || !SxactIsRolledBack(MySerializableXact));
+
+ /* may not be serializable during COMMIT/ROLLBACK PREPARED */
+ Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
+
+ /* We'd better not already be on the cleanup list. */
+ Assert(!SxactIsOnFinishedList(MySerializableXact));
+
+ topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
+
+ /*
+	 * We don't hold XidGenLock here, assuming that TransactionId is
+ * atomic!
+ *
+ * If this value is changing, we don't care that much whether we get the
+ * old or new value -- it is just used to determine how far
+ * SxactGlobalXmin must advance before this transaction can be fully
+ * cleaned up. The worst that could happen is we wait for one more
+ * transaction to complete before freeing some RAM; correctness of visible
+ * behavior is not affected.
+ */
+ MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If it's not a commit it's either a rollback or a read-only transaction
+ * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
+ */
+ if (isCommit)
+ {
+ MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
+ MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ /* Recognize implicit read-only transaction (commit without write). */
+ if (!MyXactDidWrite)
+ MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
+ }
+ else
+ {
+ /*
+ * The DOOMED flag indicates that we intend to roll back this
+ * transaction and so it should not cause serialization failures for
+ * other transactions that conflict with it. Note that this flag might
+ * already be set, if another backend marked this transaction for
+ * abort.
+ *
+ * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
+ * has been called, and so the SerializableXact is eligible for
+ * cleanup. This means it should not be considered when calculating
+ * SxactGlobalXmin.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DOOMED;
+ MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
+
+ /*
+ * If the transaction was previously prepared, but is now failing due
+ * to a ROLLBACK PREPARED or (hopefully very rare) error after the
+ * prepare, clear the prepared flag. This simplifies conflict
+ * checking.
+ */
+ MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ Assert(PredXact->WritableSxactCount > 0);
+ if (--(PredXact->WritableSxactCount) == 0)
+ {
+ /*
+ * Release predicate locks and rw-conflicts in for all committed
+ * transactions. There are no longer any transactions which might
+ * conflict with the locks and no chance for new transactions to
+ * overlap. Similarly, existing conflicts in can't cause pivots,
+ * and any conflicts in which could have completed a dangerous
+ * structure would already have caused a rollback, so any
+ * remaining ones must be benign.
+ */
+ PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
+ }
+ }
+ else
+ {
+ /*
+ * Read-only transactions: clear the list of transactions that might
+ * make us unsafe. Note that we use 'inLink' for the iteration as
+ * opposed to 'outLink' for the r/w xacts.
+ */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &MySerializableXact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, inLink));
+ while (possibleUnsafeConflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
+
+ ReleaseRWConflict(possibleUnsafeConflict);
+
+ possibleUnsafeConflict = nextConflict;
+ }
+ }
+
+ /* Check for conflict out to old committed transactions. */
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactHasSummaryConflictOut(MySerializableXact))
+ {
+ /*
+ * we don't know which old committed transaction we conflicted with,
+ * so be conservative and use FirstNormalSerCommitSeqNo here
+ */
+ MySerializableXact->SeqNo.earliestOutConflictCommit =
+ FirstNormalSerCommitSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ /*
+ * Release all outConflicts to committed transactions. If we're rolling
+	 * back, clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to
+ * previously committed transactions.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->outConflicts,
+ &MySerializableXact->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(conflict->sxactIn))
+ {
+ if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
+ || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
+ MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactIn)
+ || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+
+ /*
+ * Release all inConflicts from committed and read-only transactions. If
+ * we're rolling back, clear them all.
+ */
+ conflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &MySerializableXact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactOut)
+ || SxactIsReadOnly(conflict->sxactOut))
+ ReleaseRWConflict(conflict);
+
+ conflict = nextConflict;
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ /*
+ * Remove ourselves from the list of possible conflicts for concurrent
+ * READ ONLY transactions, flagging them as unsafe if we have a
+ * conflict out. If any are waiting DEFERRABLE transactions, wake them
+ * up if they are known safe or known unsafe.
+ */
+ possibleUnsafeConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &MySerializableXact->possibleUnsafeConflicts,
+ offsetof(RWConflictData, outLink));
+ while (possibleUnsafeConflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts,
+ &possibleUnsafeConflict->outLink,
+ offsetof(RWConflictData, outLink));
+
+ roXact = possibleUnsafeConflict->sxactIn;
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
+ Assert(SxactIsReadOnly(roXact));
+
+ /* Mark conflicted if necessary. */
+ if (isCommit
+ && MyXactDidWrite
+ && SxactHasConflictOut(MySerializableXact)
+ && (MySerializableXact->SeqNo.earliestOutConflictCommit
+ <= roXact->SeqNo.lastCommitBeforeSnapshot))
+ {
+ /*
+ * This releases possibleUnsafeConflict (as well as all other
+ * possible conflicts for roXact)
+ */
+ FlagSxactUnsafe(roXact);
+ }
+ else
+ {
+ ReleaseRWConflict(possibleUnsafeConflict);
+
+ /*
+ * If we were the last possible conflict, flag it safe. The
+ * transaction can now safely release its predicate locks (but
+ * that transaction's backend has to do that itself).
+ */
+ if (SHMQueueEmpty(&roXact->possibleUnsafeConflicts))
+ roXact->flags |= SXACT_FLAG_RO_SAFE;
+ }
+
+ /*
+ * Wake up the process for a waiting DEFERRABLE transaction if we
+ * now know it's either safe or conflicted.
+ */
+ if (SxactIsDeferrableWaiting(roXact) &&
+ (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
+ ProcSendSignal(roXact->pid);
+
+ possibleUnsafeConflict = nextConflict;
+ }
+ }
+
+ /*
+ * Check whether it's time to clean up old transactions. This can only be
+ * done when the last serializable transaction with the oldest xmin among
+ * serializable transactions completes. We then find the "new oldest"
+ * xmin and purge any transactions which finished before this transaction
+ * was launched.
+ */
+ needToClear = false;
+ if (TransactionIdEquals(MySerializableXact->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ if (--(PredXact->SxactGlobalXminCount) == 0)
+ {
+ SetNewSxactGlobalXmin();
+ needToClear = true;
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /* Add this to the list of transactions to check for later cleanup. */
+ if (isCommit)
+ SHMQueueInsertBefore(FinishedSerializableTransactions,
+ &MySerializableXact->finishedLink);
+
+ /*
+ * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
+ * partially release it. That's necessary because other backends may have
+ * a reference to it. The leader will release the SERIALIZABLEXACT itself
+ * at the end of the transaction after workers have stopped running.
+ */
+ if (!isCommit)
+ ReleaseOneSerializableXact(MySerializableXact,
+ isReadOnlySafe && IsInParallelMode(),
+ false);
+
+ LWLockRelease(SerializableFinishedListLock);
+
+ if (needToClear)
+ ClearOldPredicateLocks();
+
+ ReleasePredicateLocksLocal();
+}
+
+static void
+ReleasePredicateLocksLocal(void)
+{
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+
+ /* Delete per-transaction lock table */
+ if (LocalPredicateLockHash != NULL)
+ {
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+ }
+}
+
+/*
+ * Clear old predicate locks, belonging to committed transactions that are no
+ * longer interesting to any in-progress transaction.
+ */
+static void
+ClearOldPredicateLocks(void)
+{
+ SERIALIZABLEXACT *finishedSxact;
+ PREDICATELOCK *predlock;
+
+ /*
+ * Loop through finished transactions. They are in commit order, so we can
+ * stop as soon as we find one that's still interesting.
+ */
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+ finishedSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ FinishedSerializableTransactions,
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (finishedSxact)
+ {
+ SERIALIZABLEXACT *nextSxact;
+
+ nextSxact = (SERIALIZABLEXACT *)
+ SHMQueueNext(FinishedSerializableTransactions,
+ &(finishedSxact->finishedLink),
+ offsetof(SERIALIZABLEXACT, finishedLink));
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+ PredXact->SxactGlobalXmin))
+ {
+ /*
+ * This transaction committed before any in-progress transaction
+ * took its snapshot. It's no longer interesting.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ SHMQueueDelete(&(finishedSxact->finishedLink));
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
+ && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
+ {
+ /*
+ * Any active transactions that took their snapshot before this
+ * transaction committed are read-only, so we can clear part of
+ * its state.
+ */
+ LWLockRelease(SerializableXactHashLock);
+
+ if (SxactIsReadOnly(finishedSxact))
+ {
+ /* A read-only transaction can be removed entirely */
+ SHMQueueDelete(&(finishedSxact->finishedLink));
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ }
+ else
+ {
+ /*
+ * A read-write transaction can only be partially cleared. We
+ * need to keep the SERIALIZABLEXACT but can release the
+ * SIREAD locks and conflicts in.
+ */
+ ReleaseOneSerializableXact(finishedSxact, true, false);
+ }
+
+ PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo;
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else
+ {
+ /* Still interesting. */
+ break;
+ }
+ finishedSxact = nextSxact;
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * Loop through predicate locks on dummy transaction for summarized data.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&OldCommittedSxact->predicateLocks,
+ &OldCommittedSxact->predicateLocks,
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ bool canDoPartialCleanup;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&OldCommittedSxact->predicateLocks,
+ &predlock->xactLink,
+ offsetof(PREDICATELOCK, xactLink));
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * If this lock originally belonged to an old enough transaction, we
+ * can release it.
+ */
+ if (canDoPartialCleanup)
+ {
+ PREDICATELOCKTAG tag;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ tag = predlock->tag;
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(&(predlock->targetLink));
+ SHMQueueDelete(&(predlock->xactLink));
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+ }
+
+ predlock = nextpredlock;
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables. Given a transaction which we know can be deleted:
+ * delete all predicate locks held by that transaction and any predicate
+ * lock targets which are now unreferenced by a lock; delete all conflicts
+ * for the transaction; delete all xid values for the transaction; then
+ * delete the transaction.
+ *
+ * When the partial flag is set, we can release all predicate locks and
+ * in-conflict information -- we've established that there are no longer
+ * any overlapping read-write transactions for which this transaction could
+ * matter -- but keep the transaction entry itself and any outConflicts.
+ *
+ * When the summarize flag is set, we've run short of room for sxact data
+ * and must summarize to the SLRU. Predicate locks are transferred to a
+ * dummy "old" transaction, with duplicate locks on a single target
+ * collapsing to a single lock with the "latest" commitSeqNo from among
+ * the conflicting locks.
+ */
+static void
+ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize)
+{
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXIDTAG sxidtag;
+ RWConflict conflict,
+ nextConflict;
+
+ Assert(sxact != NULL);
+ Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
+ Assert(partial || !SxactIsOnFinishedList(sxact));
+ Assert(LWLockHeldByMe(SerializableFinishedListLock));
+
+ /*
+ * First release all the predicate locks held by this xact (or transfer
+ * them to OldCommittedSxact if summarize is true)
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+ PREDICATELOCKTAG tag;
+ SHM_QUEUE *targetLink;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+
+ tag = predlock->tag;
+ targetLink = &(predlock->targetLink);
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ SHMQueueDelete(targetLink);
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ if (summarize)
+ {
+ bool found;
+
+ /* Fold into dummy transaction list. */
+ tag.myXact = OldCommittedSxact;
+ predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!predlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase max_pred_locks_per_transaction.")));
+ if (found)
+ {
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ if (predlock->commitSeqNo < sxact->commitSeqNo)
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ else
+ {
+ SHMQueueInsertBefore(&(target->predicateLocks),
+ &(predlock->targetLink));
+ SHMQueueInsertBefore(&(OldCommittedSxact->predicateLocks),
+ &(predlock->xactLink));
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ }
+ else
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+
+ predlock = nextpredlock;
+ }
+
+ /*
+ * Rather than retail removal, just re-init the head after we've run
+ * through the list.
+ */
+ SHMQueueInit(&sxact->predicateLocks);
+
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ sxidtag.xid = sxact->topXid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Release all outConflicts (unless 'partial' is true) */
+ if (!partial)
+ {
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->outConflicts,
+ &sxact->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ if (summarize)
+ conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ ReleaseRWConflict(conflict);
+ conflict = nextConflict;
+ }
+ }
+
+ /* Release all inConflicts. */
+ conflict = (RWConflict)
+ SHMQueueNext(&sxact->inConflicts,
+ &sxact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ nextConflict = (RWConflict)
+ SHMQueueNext(&sxact->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+ if (summarize)
+ conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ ReleaseRWConflict(conflict);
+ conflict = nextConflict;
+ }
+
+ /* Finally, get rid of the xid and the record of the transaction itself. */
+ if (!partial)
+ {
+ if (sxidtag.xid != InvalidTransactionId)
+ hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
+ ReleasePredXact(sxact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Tests whether the given top level transaction is concurrent with
+ * (overlaps) our current transaction.
+ *
+ * We need to identify the top level transaction for SSI, anyway, so pass
+ * that to this function to save the overhead of checking the snapshot's
+ * subxip array.
+ */
+static bool
+XidIsConcurrent(TransactionId xid)
+{
+ Snapshot snap;
+ uint32 i;
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ snap = GetTransactionSnapshot();
+
+ if (TransactionIdPrecedes(xid, snap->xmin))
+ return false;
+
+ if (TransactionIdFollowsOrEquals(xid, snap->xmax))
+ return true;
+
+ for (i = 0; i < snap->xcnt; i++)
+ {
+ if (xid == snap->xip[i])
+ return true;
+ }
+
+ return false;
+}
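+
+/*
+ * As a worked example of the test above: with a snapshot of xmin = 100,
+ * xmax = 110 and xip = {103, 105}, xid 95 is not concurrent (it had already
+ * completed when the snapshot was taken), xid 112 is concurrent (it had not
+ * yet started), and xid 103 is concurrent because it was still in progress
+ * when the snapshot was taken.
+ */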
+
+bool
+CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
+{
+ if (!SerializationNeededForRead(relation, snapshot))
+ return false;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ return true;
+}
+
+/*
+ * CheckForSerializableConflictOut
+ * A table AM is reading a tuple that has been modified. If it determines
+ * that the tuple version it is reading is not visible to us, it should
+ * pass in the top level xid of the transaction that created it.
+ * Otherwise, if it determines that it is visible to us but it has been
+ * deleted or there is a newer version available due to an update, it
+ * should pass in the top level xid of the modifying transaction.
+ *
+ * This function will check for overlap with our own transaction. If the given
+ * xid is also serializable and the transactions overlap (i.e., they cannot see
+ * each other's writes), then we have a conflict out.
+ */
+void
+CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ Assert(TransactionIdIsValid(xid));
+
+ if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
+ return;
+
+ /*
+ * Find sxact or summarized info for the top level xid.
+ */
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (!sxid)
+ {
+ /*
+ * Transaction not found in "normal" SSI structures. Check whether it
+ * got pushed out to SLRU storage for "old committed" transactions.
+ */
+ SerCommitSeqNo conflictCommitSeqNo;
+
+ conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
+ if (conflictCommitSeqNo != 0)
+ {
+ if (conflictCommitSeqNo != InvalidSerCommitSeqNo
+ && (!SxactIsReadOnly(MySerializableXact)
+ || conflictCommitSeqNo
+ <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ if (SxactHasSummaryConflictIn(MySerializableXact)
+ || !SHMQueueEmpty(&MySerializableXact->inConflicts))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ }
+
+ /* It's not serializable or otherwise not important. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ sxact = sxid->myXact;
+ Assert(TransactionIdEquals(sxact->topXid, xid));
+ if (sxact == MySerializableXact || SxactIsDoomed(sxact))
+ {
+ /* Can't conflict with ourself or a transaction that will roll back. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * We have a conflict out to a transaction which has a conflict out to a
+ * summarized transaction. That summarized transaction must have
+ * committed first, and we can't tell when it committed in relation to our
+ * snapshot acquisition, so something needs to be canceled.
+ */
+ if (SxactHasSummaryConflictOut(sxact))
+ {
+ if (!SxactIsPrepared(sxact))
+ {
+ sxact->flags |= SXACT_FLAG_DOOMED;
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ else
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ }
+
+ /*
+ * If this is a read-only transaction and the writing transaction has
+ * committed, and it doesn't have a rw-conflict to a transaction which
+ * committed before it, no conflict.
+ */
+ if (SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(sxact)
+ && !SxactHasSummaryConflictOut(sxact)
+ && (!SxactHasConflictOut(sxact)
+ || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
+ {
+ /* Read-only transaction will appear to run first. No conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (!XidIsConcurrent(xid))
+ {
+ /* This write was already in our snapshot; no conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (RWConflictExists(MySerializableXact, sxact))
+ {
+ /* We don't want duplicate conflict records in the list. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * Flag the conflict. But first, if this conflict creates a dangerous
+ * structure, ereport an error.
+ */
+ FlagRWConflict(MySerializableXact, sxact);
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Check a particular target for rw-dependency conflict in. A subroutine of
+ * CheckForSerializableConflictIn().
+ */
+static void
+CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCK *predlock;
+ PREDICATELOCK *mypredlock = NULL;
+ PREDICATELOCKTAG mypredlocktag;
+
+ Assert(MySerializableXact != InvalidSerializableXact);
+
+ /*
+ * The same hash and LW lock apply to the lock target and the lock itself.
+ */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ if (!target)
+ {
+ /* Nothing has this target locked; we're done here. */
+ LWLockRelease(partitionLock);
+ return;
+ }
+
+ /*
+ * Each lock for an overlapping transaction represents a conflict: a
+ * rw-dependency in to this transaction.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ while (predlock)
+ {
+ SHM_QUEUE *predlocktargetlink;
+ PREDICATELOCK *nextpredlock;
+ SERIALIZABLEXACT *sxact;
+
+ predlocktargetlink = &(predlock->targetLink);
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ predlocktargetlink,
+ offsetof(PREDICATELOCK, targetLink));
+
+ sxact = predlock->tag.myXact;
+ if (sxact == MySerializableXact)
+ {
+ /*
+ * If we're getting a write lock on a tuple, we don't need a
+ * predicate (SIREAD) lock on the same tuple. We can safely remove
+ * our SIREAD lock, but we'll defer doing so until after the loop
+ * because that requires upgrading to an exclusive partition lock.
+ *
+ * We can't use this optimization within a subtransaction because
+ * the subtransaction could roll back, and we would be left
+ * without any lock at the top level.
+ */
+ if (!IsSubTransaction()
+ && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+ {
+ mypredlock = predlock;
+ mypredlocktag = predlock->tag;
+ }
+ }
+ else if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-check after getting exclusive lock because the other
+ * transaction may have flagged a conflict.
+ */
+ if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ FlagRWConflict(sxact, MySerializableXact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+
+ predlock = nextpredlock;
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+
+ /*
+ * If we found one of our own SIREAD locks to remove, remove it now.
+ *
+ * At this point our transaction already has a RowExclusiveLock on the
+ * relation, so we are OK to drop the predicate lock on the tuple, if
+ * found, without fearing that another write against the tuple will occur
+ * before the MVCC information makes it to the buffer.
+ */
+ if (mypredlock != NULL)
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCK *rmpredlock;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it wasn't removed
+ * while the locks were released. One way that could happen is from
+ * autovacuum cleaning up an index.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+ (&mypredlocktag, targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock != NULL)
+ {
+ Assert(rmpredlock == mypredlock);
+
+ SHMQueueDelete(&(mypredlock->targetLink));
+ SHMQueueDelete(&(mypredlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == mypredlock);
+
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&MySerializableXact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ if (rmpredlock != NULL)
+ {
+ /*
+ * Remove entry in local lock table if it exists. It's OK if it
+ * doesn't exist; that means the lock was transferred to a new
+ * target by a different backend.
+ */
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+
+ DecrementParentLocks(targettag);
+ }
+ }
+}
+
+/*
+ * CheckForSerializableConflictIn
+ * We are writing the given tuple. If that indicates a rw-conflict
+ * in from another serializable transaction, take appropriate action.
+ *
+ * Skip checking for any granularity for which a parameter is missing.
+ *
+ * A tuple update or delete is in conflict if we have a predicate lock
+ * against the relation or page in which the tuple exists, or against the
+ * tuple itself.
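+ *
+ * For example, a caller updating a heap tuple would typically pass the
+ * tuple's TID along with its block number, so the tuple, page, and relation
+ * targets are all checked; a caller passing tid = NULL and
+ * blkno = InvalidBlockNumber gets only the relation-granularity check.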
+ */
+void
+CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
+ errhint("The transaction might succeed if retried.")));
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ /*
+ * It is important that we check for locks from the finest granularity to
+ * the coarsest granularity, so that granularity promotion doesn't cause
+ * us to miss a lock. The new (coarser) lock will be acquired before the
+ * old (finer) locks are released.
+ *
+ * It is not possible to take and hold a lock across the checks for all
+ * granularities because each target could be in a separate partition.
+ */
+ if (tid != NULL)
+ {
+ SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ if (blkno != InvalidBlockNumber)
+ {
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id,
+ blkno);
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ relation->rd_node.dbNode,
+ relation->rd_id);
+ CheckTargetForConflictsIn(&targettag);
+}
+
+/*
+ * CheckTableForSerializableConflictIn
+ * The entire table is going through a DDL-style logical mass delete
+ * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from
+ * another serializable transaction, take appropriate action.
+ *
+ * While these operations do not operate entirely within the bounds of
+ * snapshot isolation, they can occur inside a serializable transaction, and
+ * will logically occur after any reads which saw rows which were destroyed
+ * by these operations, so we do what we can to serialize properly under
+ * SSI.
+ *
+ * The relation passed in must be a heap relation. Any predicate lock of any
+ * granularity on the heap will cause a rw-conflict in to this transaction.
+ * Predicate locks on indexes do not matter because they only exist to guard
+ * against conflicting inserts into the index, and this is a mass *delete*.
+ * When a table is truncated or dropped, the index will also be truncated
+ * or dropped, and we'll deal with locks on the index when that happens.
+ *
+ * Dropping or truncating a table also needs to drop any existing predicate
+ * locks on heap tuples or pages, because they're about to go away. This
+ * should be done before altering the predicate locks because the transaction
+ * could be rolled back because of a conflict, in which case the lock changes
+ * are not needed. (At the moment, we don't actually bother to drop the
+ * existing locks on a dropped or truncated table.  That might
+ * lead to some false positives, but it doesn't seem worth the trouble.)
+ */
+void
+CheckTableForSerializableConflictIn(Relation relation)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *target;
+ Oid dbId;
+ Oid heapId;
+ int i;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ Assert(relation->rd_index == NULL); /* not an index relation */
+
+ dbId = relation->rd_node.dbNode;
+ heapId = relation->rd_id;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Scan through target list */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ PREDICATELOCK *predlock;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
+ continue; /* wrong database id */
+
+ /*
+ * Loop through locks for this target and flag conflicts.
+ */
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(target->predicateLocks),
+ offsetof(PREDICATELOCK, targetLink));
+ while (predlock)
+ {
+ PREDICATELOCK *nextpredlock;
+
+ nextpredlock = (PREDICATELOCK *)
+ SHMQueueNext(&(target->predicateLocks),
+ &(predlock->targetLink),
+ offsetof(PREDICATELOCK, targetLink));
+
+ if (predlock->tag.myXact != MySerializableXact
+ && !RWConflictExists(predlock->tag.myXact, MySerializableXact))
+ {
+ FlagRWConflict(predlock->tag.myXact, MySerializableXact);
+ }
+
+ predlock = nextpredlock;
+ }
+ }
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+
+/*
+ * Flag a rw-dependency between two serializable transactions.
+ *
+ * The caller is responsible for ensuring that we have a LW lock on
+ * the transaction hash table.
+ */
+static void
+FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ Assert(reader != writer);
+
+ /* First, see if this conflict causes failure. */
+ OnConflict_CheckForSerializationFailure(reader, writer);
+
+ /* Actually do the conflict flagging. */
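+	/*
+	 * OldCommittedSxact is the dummy SERIALIZABLEXACT that stands in for old
+	 * committed transactions whose individual conflict lists have been
+	 * summarized away, so for it we can only set a summary flag rather than
+	 * record a specific conflict edge.
+	 */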
+ if (reader == OldCommittedSxact)
+ writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ else if (writer == OldCommittedSxact)
+ reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ else
+ SetRWConflict(reader, writer);
+}
+
+/*----------------------------------------------------------------------------
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ * Tin ------> Tpivot ------> Tout
+ * rw rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
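+ *
+ * As a purely illustrative example: suppose Tin reads a row that Tpivot
+ * later updates, and Tpivot reads a row that Tout later updates.  Each read
+ * followed by a concurrent write forms an rw-dependency, giving
+ * Tin --rw--> Tpivot --rw--> Tout; if Tout also commits first, the cycle
+ * needed for a serialization anomaly may exist, so one of the three
+ * transactions must be aborted to be safe.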
+ *----------------------------------------------------------------------------
+ */
+static void
+OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer)
+{
+ bool failure;
+ RWConflict conflict;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ failure = false;
+
+ /*------------------------------------------------------------------------
+ * Check for already-committed writer with rw-conflict out flagged
+ * (conflict-flag on W means that T2 committed before W):
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * That is a dangerous structure, so we must abort. (Since the writer
+ * has already committed, we must be the reader)
+ *------------------------------------------------------------------------
+ */
+ if (SxactIsCommitted(writer)
+ && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
+ failure = true;
+
+ /*------------------------------------------------------------------------
+ * Check whether the writer has become a pivot with an out-conflict
+ * committed transaction (T2), and T2 committed first:
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * Because T2 must've committed first, there is no anomaly if:
+ * - the reader committed before T2
+ * - the writer committed before T2
+ * - the reader is a READ ONLY transaction and the reader was concurrent
+ * with T2 (= reader acquired its snapshot before T2 committed)
+ *
+ * We also handle the case that T2 is prepared but not yet committed
+ * here. In that case T2 has already checked for conflicts, so if it
+ * commits first, making the above conflict real, it's too late for it
+ * to abort.
+ *------------------------------------------------------------------------
+ */
+ if (!failure)
+ {
+ if (SxactHasSummaryConflictOut(writer))
+ {
+ failure = true;
+ conflict = NULL;
+ }
+ else
+ conflict = (RWConflict)
+ SHMQueueNext(&writer->outConflicts,
+ &writer->outConflicts,
+ offsetof(RWConflictData, outLink));
+ while (conflict)
+ {
+ SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+ if (SxactIsPrepared(t2)
+ && (!SxactIsCommitted(reader)
+ || t2->prepareSeqNo <= reader->commitSeqNo)
+ && (!SxactIsCommitted(writer)
+ || t2->prepareSeqNo <= writer->commitSeqNo)
+ && (!SxactIsReadOnly(reader)
+ || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
+ {
+ failure = true;
+ break;
+ }
+ conflict = (RWConflict)
+ SHMQueueNext(&writer->outConflicts,
+ &conflict->outLink,
+ offsetof(RWConflictData, outLink));
+ }
+ }
+
+ /*------------------------------------------------------------------------
+ * Check whether the reader has become a pivot with a writer
+ * that's committed (or prepared):
+ *
+ * T0 ------> R ------> W
+ * rw rw
+ *
+ * Because W must've committed first for an anomaly to occur, there is no
+ * anomaly if:
+ * - T0 committed before the writer
+ * - T0 is READ ONLY, and overlaps the writer
+ *------------------------------------------------------------------------
+ */
+ if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
+ {
+ if (SxactHasSummaryConflictIn(reader))
+ {
+ failure = true;
+ conflict = NULL;
+ }
+ else
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->inConflicts,
+ &reader->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (conflict)
+ {
+ SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+ if (!SxactIsDoomed(t0)
+ && (!SxactIsCommitted(t0)
+ || t0->commitSeqNo >= writer->prepareSeqNo)
+ && (!SxactIsReadOnly(t0)
+ || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
+ {
+ failure = true;
+ break;
+ }
+ conflict = (RWConflict)
+ SHMQueueNext(&reader->inConflicts,
+ &conflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ if (failure)
+ {
+ /*
+ * We have to kill a transaction to avoid a possible anomaly from
+ * occurring. If the writer is us, we can just ereport() to cause a
+ * transaction abort. Otherwise we flag the writer for termination,
+ * causing it to abort when it tries to commit. However, if the writer
+		 * has already prepared, we can't abort it anymore, so we have to
+		 * kill the reader instead.
+ */
+ if (MySerializableXact == writer)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ else if (SxactIsPrepared(writer))
+ {
+ LWLockRelease(SerializableXactHashLock);
+
+ /* if we're not the writer, we have to be the reader */
+ Assert(MySerializableXact == reader);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
+ errhint("The transaction might succeed if retried.")));
+ }
+ writer->flags |= SXACT_FLAG_DOOMED;
+ }
+}
+
+/*
+ * PreCommit_CheckForSerializationFailure
+ * Check for dangerous structures in a serializable transaction
+ * at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot has yet
+ * committed.
+ *
+ * If a dangerous structure is found, the pivot (the near conflict) is
+ * marked for death, because rolling back another transaction might mean
+ * that we fail without ever making progress. This transaction is
+ * committing writes, so letting it commit ensures progress. If we
+ * canceled the far conflict, it might immediately fail again on retry.
+ */
+void
+PreCommit_CheckForSerializationFailure(void)
+{
+ RWConflict nearConflict;
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(IsolationIsSerializable());
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ Assert(!SxactIsPartiallyReleased(MySerializableXact));
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ nearConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &MySerializableXact->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (nearConflict)
+ {
+ if (!SxactIsCommitted(nearConflict->sxactOut)
+ && !SxactIsDoomed(nearConflict->sxactOut))
+ {
+ RWConflict farConflict;
+
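+			/*
+			 * Walk the rw-conflicts in to the near conflict's transaction:
+			 * each is a potential "far" side of a dangerous structure in
+			 * which that transaction is the pivot and we are the "out" side.
+			 */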
+ farConflict = (RWConflict)
+ SHMQueueNext(&nearConflict->sxactOut->inConflicts,
+ &nearConflict->sxactOut->inConflicts,
+ offsetof(RWConflictData, inLink));
+ while (farConflict)
+ {
+ if (farConflict->sxactOut == MySerializableXact
+ || (!SxactIsCommitted(farConflict->sxactOut)
+ && !SxactIsReadOnly(farConflict->sxactOut)
+ && !SxactIsDoomed(farConflict->sxactOut)))
+ {
+ /*
+ * Normally, we kill the pivot transaction to make sure we
+ * make progress if the failing transaction is retried.
+ * However, we can't kill it if it's already prepared, so
+ * in that case we commit suicide instead.
+ */
+ if (SxactIsPrepared(nearConflict->sxactOut))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
+ break;
+ }
+ farConflict = (RWConflict)
+ SHMQueueNext(&nearConflict->sxactOut->inConflicts,
+ &farConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+ }
+
+ nearConflict = (RWConflict)
+ SHMQueueNext(&MySerializableXact->inConflicts,
+ &nearConflict->inLink,
+ offsetof(RWConflictData, inLink));
+ }
+
+ MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ MySerializableXact->flags |= SXACT_FLAG_PREPARED;
+
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Two-phase commit support
+ */
+
+/*
+ * AtPrepare_PredicateLocks
+ * Do the preparatory work for a PREPARE: make 2PC state file
+ * records for all predicate locks currently held.
+ */
+void
+AtPrepare_PredicateLocks(void)
+{
+ PREDICATELOCK *predlock;
+ SERIALIZABLEXACT *sxact;
+ TwoPhasePredicateRecord record;
+ TwoPhasePredicateXactRecord *xactRecord;
+ TwoPhasePredicateLockRecord *lockRecord;
+
+ sxact = MySerializableXact;
+ xactRecord = &(record.data.xactRecord);
+ lockRecord = &(record.data.lockRecord);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* Generate an xact record for our SERIALIZABLEXACT */
+ record.type = TWOPHASEPREDICATERECORD_XACT;
+ xactRecord->xmin = MySerializableXact->xmin;
+ xactRecord->flags = MySerializableXact->flags;
+
+ /*
+	 * Note that we don't include our lists of rw-conflicts in and out in the
+ * statefile, because new conflicts can be added even after the
+ * transaction prepares. We'll just make a conservative assumption during
+ * recovery instead.
+ */
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+
+ /*
+ * Generate a lock record for each lock.
+ *
+ * To do this, we need to walk the predicate lock list in our sxact rather
+ * than using the local predicate lock table because the latter is not
+ * guaranteed to be accurate.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+
+ /*
+ * No need to take sxact->perXactPredicateListLock in parallel mode
+ * because there cannot be any parallel workers running while we are
+ * preparing a transaction.
+ */
+ Assert(!IsParallelWorker() && !ParallelContextActive());
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(sxact->predicateLocks),
+ offsetof(PREDICATELOCK, xactLink));
+
+ while (predlock != NULL)
+ {
+ record.type = TWOPHASEPREDICATERECORD_LOCK;
+ lockRecord->target = predlock->tag.myTarget->tag;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+
+ predlock = (PREDICATELOCK *)
+ SHMQueueNext(&(sxact->predicateLocks),
+ &(predlock->xactLink),
+ offsetof(PREDICATELOCK, xactLink));
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * PostPrepare_Locks
+ * Clean up after successful PREPARE. Unlike the non-predicate
+ * lock manager, we do not need to transfer locks to a dummy
+ * PGPROC because our SERIALIZABLEXACT will stay around
+ * anyway. We only need to clean up our local state.
+ */
+void
+PostPrepare_PredicateLocks(TransactionId xid)
+{
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(SxactIsPrepared(MySerializableXact));
+
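+	/*
+	 * The SERIALIZABLEXACT lives on for the prepared transaction, but it is
+	 * no longer associated with this backend process.
+	 */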
+ MySerializableXact->pid = 0;
+
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+}
+
+/*
+ * PredicateLockTwoPhaseFinish
+ * Release a prepared transaction's predicate locks once it
+ * commits or aborts.
+ */
+void
+PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
+{
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+
+ sxidtag.xid = xid;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ /* xid will not be found if it wasn't a serializable transaction */
+ if (sxid == NULL)
+ return;
+
+ /* Release its locks */
+ MySerializableXact = sxid->myXact;
+ MyXactDidWrite = true; /* conservatively assume that we wrote
+ * something */
+ ReleasePredicateLocks(isCommit, false);
+}
+
+/*
+ * Re-acquire a predicate lock belonging to a transaction that was prepared.
+ */
+void
+predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePredicateRecord *record;
+
+ Assert(len == sizeof(TwoPhasePredicateRecord));
+
+ record = (TwoPhasePredicateRecord *) recdata;
+
+ Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
+ (record->type == TWOPHASEPREDICATERECORD_LOCK));
+
+ if (record->type == TWOPHASEPREDICATERECORD_XACT)
+ {
+ /* Per-transaction record. Set up a SERIALIZABLEXACT. */
+ TwoPhasePredicateXactRecord *xactRecord;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+ bool found;
+
+ xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxact = CreatePredXact();
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+
+ /* vxid for a prepared xact is InvalidBackendId/xid; no pid */
+ sxact->vxid.backendId = InvalidBackendId;
+ sxact->vxid.localTransactionId = (LocalTransactionId) xid;
+ sxact->pid = 0;
+
+ /* a prepared xact hasn't committed yet */
+ sxact->prepareSeqNo = RecoverySerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ sxact->finishedBefore = InvalidTransactionId;
+
+ sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
+
+ /*
+ * Don't need to track this; no transactions running at the time the
+ * recovered xact started are still active, except possibly other
+		 * prepared xacts, and we don't care whether those are RO_SAFE or not.
+ */
+ SHMQueueInit(&(sxact->possibleUnsafeConflicts));
+
+ SHMQueueInit(&(sxact->predicateLocks));
+ SHMQueueElemInit(&(sxact->finishedLink));
+
+ sxact->topXid = xid;
+ sxact->xmin = xactRecord->xmin;
+ sxact->flags = xactRecord->flags;
+ Assert(SxactIsPrepared(sxact));
+ if (!SxactIsReadOnly(sxact))
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ /*
+ * We don't know whether the transaction had any conflicts or not, so
+ * we'll conservatively assume that it had both a conflict in and a
+ * conflict out, and represent that with the summary conflict flags.
+ */
+ SHMQueueInit(&(sxact->outConflicts));
+ SHMQueueInit(&(sxact->inConflicts));
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+
+ /* Register the transaction's xid */
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(sxid != NULL);
+ Assert(!found);
+ sxid->myXact = (SERIALIZABLEXACT *) sxact;
+
+ /*
+ * Update global xmin. Note that this is a special case compared to
+ * registering a normal transaction, because the global xmin might go
+ * backwards. That's OK, because until recovery is over we're not
+ * going to complete any transactions or create any non-prepared
+		 * transactions, so there's no danger of throwing away information
+		 * that is still needed.
+ */
+ if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
+ (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(sxact->xmin);
+ }
+ else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ }
+ else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
+ {
+ /* Lock record. Recreate the PREDICATELOCK */
+ TwoPhasePredicateLockRecord *lockRecord;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXIDTAG sxidtag;
+ uint32 targettaghash;
+
+ lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
+ targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ Assert(sxid != NULL);
+ sxact = sxid->myXact;
+ Assert(sxact != InvalidSerializableXact);
+
+ CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
+ }
+}
+
+/*
+ * Prepare to share the current SERIALIZABLEXACT with parallel workers.
+ * Return a handle object that can be used by AttachSerializableXact() in a
+ * parallel worker.
+ */
+SerializableXactHandle
+ShareSerializableXact(void)
+{
+ return MySerializableXact;
+}
+
+/*
+ * Allow parallel workers to import the leader's SERIALIZABLEXACT.
+ */
+void
+AttachSerializableXact(SerializableXactHandle handle)
+{
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ MySerializableXact = (SERIALIZABLEXACT *) handle;
+ if (MySerializableXact != InvalidSerializableXact)
+ CreateLocalPredicateLockHash();
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
new file mode 100644
index 0000000..c50a419
--- /dev/null
+++ b/src/backend/storage/lmgr/proc.c
@@ -0,0 +1,2012 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.c
+ * routines to manage per-process shared memory data structure
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/proc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Interface (a):
+ * ProcSleep(), ProcWakeup(),
+ * ProcQueueAlloc() -- create a shm queue for sleeping processes
+ * ProcQueueInit() -- create a queue without allocing memory
+ *
+ * Waiting for a lock causes the backend to be put to sleep. Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
+ * whether it was awoken on an error condition).
+ *
+ * Interface (b):
+ *
+ * ProcReleaseLocks -- frees the locks associated with current transaction
+ *
+ * ProcKill -- destroys the shared memory state (and locks)
+ * associated with the process.
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "replication/slot.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* GUC variables */
+int DeadlockTimeout = 1000;
+int StatementTimeout = 0;
+int LockTimeout = 0;
+int IdleInTransactionSessionTimeout = 0;
+int IdleSessionTimeout = 0;
+bool log_lock_waits = false;
+
+/* Pointer to this process's PGPROC struct, if any */
+PGPROC *MyProc = NULL;
+
+/*
+ * This spinlock protects the freelist of recycled PGPROC structures.
+ * We cannot use an LWLock because the LWLock manager depends on already
+ * having a PGPROC and a wait semaphore! But these structures are touched
+ * relatively infrequently (only at backend startup or shutdown) and not for
+ * very long, so a spinlock is okay.
+ */
+NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
+
+/* Pointers to shared-memory structures */
+PROC_HDR *ProcGlobal = NULL;
+NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL;
+PGPROC *PreparedXactProcs = NULL;
+
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
+
+static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
+
+/* Is a deadlock check pending? */
+static volatile sig_atomic_t got_deadlock_timeout;
+
+static void RemoveProcFromArray(int code, Datum arg);
+static void ProcKill(int code, Datum arg);
+static void AuxiliaryProcKill(int code, Datum arg);
+static void CheckDeadLock(void);
+
+
+/*
+ * Report shared-memory space needed by InitProcGlobal.
+ */
+Size
+ProcGlobalShmemSize(void)
+{
+ Size size = 0;
+ Size TotalProcs =
+ add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts));
+
+ /* ProcGlobal */
+ size = add_size(size, sizeof(PROC_HDR));
+ size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC)));
+ size = add_size(size, sizeof(slock_t));
+
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags)));
+
+ return size;
+}
+
+/*
+ * Report number of semaphores needed by InitProcGlobal.
+ */
+int
+ProcGlobalSemas(void)
+{
+ /*
+ * We need a sema per backend (including autovacuum), plus one for each
+ * auxiliary process.
+ */
+ return MaxBackends + NUM_AUXILIARY_PROCS;
+}
+
+/*
+ * InitProcGlobal -
+ * Initialize the global process table during postmaster or standalone
+ * backend startup.
+ *
+ * We also create all the per-process semaphores we will need to support
+ * the requested number of backends. We used to allocate semaphores
+ * only when backends were actually started up, but that is bad because
+ * it lets Postgres fail under load --- a lot of Unix systems are
+ * (mis)configured with small limits on the number of semaphores, and
+ * running out when trying to start another backend is a common failure.
+ * So, now we grab enough semaphores to support the desired max number
+ * of backends immediately at initialization --- if the sysadmin has set
+ * MaxConnections, max_worker_processes, max_wal_senders, or
+ * autovacuum_max_workers higher than his kernel will support, he'll
+ * find out sooner rather than later.
+ *
+ * Another reason for creating semaphores here is that the semaphore
+ * implementation typically requires us to create semaphores in the
+ * postmaster, not in backends.
+ *
+ * Note: this is NOT called by individual backends under a postmaster,
+ * not even in the EXEC_BACKEND case. The ProcGlobal and AuxiliaryProcs
+ * pointers must be propagated specially for EXEC_BACKEND operation.
+ */
+void
+InitProcGlobal(void)
+{
+ PGPROC *procs;
+ int i,
+ j;
+ bool found;
+ uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts;
+
+ /* Create the ProcGlobal shared structure */
+ ProcGlobal = (PROC_HDR *)
+ ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found);
+ Assert(!found);
+
+ /*
+ * Initialize the data structures.
+ */
+ ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+ ProcGlobal->freeProcs = NULL;
+ ProcGlobal->autovacFreeProcs = NULL;
+ ProcGlobal->bgworkerFreeProcs = NULL;
+ ProcGlobal->walsenderFreeProcs = NULL;
+ ProcGlobal->startupProc = NULL;
+ ProcGlobal->startupProcPid = 0;
+ ProcGlobal->startupBufferPinWaitBufId = -1;
+ ProcGlobal->walwriterLatch = NULL;
+ ProcGlobal->checkpointerLatch = NULL;
+ pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO);
+ pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO);
+
+ /*
+ * Create and initialize all the PGPROC structures we'll need. There are
+ * five separate consumers: (1) normal backends, (2) autovacuum workers
+ * and the autovacuum launcher, (3) background workers, (4) auxiliary
+ * processes, and (5) prepared transactions. Each PGPROC structure is
+ * dedicated to exactly one of these purposes, and they do not move
+ * between groups.
+ */
+ procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC));
+ MemSet(procs, 0, TotalProcs * sizeof(PGPROC));
+ ProcGlobal->allProcs = procs;
+ /* XXX allProcCount isn't really all of them; it excludes prepared xacts */
+ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS;
+
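+	/*
+	 * The PGPROC array is laid out in consumer order: normal backends first,
+	 * then the autovacuum launcher and workers, then background workers,
+	 * then WAL senders, then auxiliary processes, and finally the dummy
+	 * entries for prepared transactions (see the loop below and the
+	 * AuxiliaryProcs/PreparedXactProcs pointers set at the end).
+	 */
+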
+ /*
+ * Allocate arrays mirroring PGPROC fields in a dense manner. See
+ * PROC_HDR.
+ *
+ * XXX: It might make sense to increase padding for these arrays, given
+ * how hotly they are accessed.
+ */
+ ProcGlobal->xids =
+ (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids));
+ MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids));
+ ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags));
+ MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags));
+
+ for (i = 0; i < TotalProcs; i++)
+ {
+ /* Common initialization for all PGPROCs, regardless of type. */
+
+ /*
+ * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact
+ * dummy PGPROCs don't need these though - they're never associated
+ * with a real process
+ */
+ if (i < MaxBackends + NUM_AUXILIARY_PROCS)
+ {
+ procs[i].sem = PGSemaphoreCreate();
+ InitSharedLatch(&(procs[i].procLatch));
+ LWLockInitialize(&(procs[i].fpInfoLock), LWTRANCHE_LOCK_FASTPATH);
+ }
+ procs[i].pgprocno = i;
+
+ /*
+ * Newly created PGPROCs for normal backends, autovacuum and bgworkers
+ * must be queued up on the appropriate free list. Because there can
+ * only ever be a small, fixed number of auxiliary processes, no free
+ * list is used in that case; InitAuxiliaryProcess() instead uses a
+ * linear search. PGPROCs for prepared transactions are added to a
+ * free list by TwoPhaseShmemInit().
+ */
+ if (i < MaxConnections)
+ {
+ /* PGPROC for normal backend, add to freeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs;
+ ProcGlobal->freeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->freeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1)
+ {
+ /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs;
+ ProcGlobal->autovacFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->autovacFreeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes)
+ {
+ /* PGPROC for bgworker, add to bgworkerFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->bgworkerFreeProcs;
+ ProcGlobal->bgworkerFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ }
+ else if (i < MaxBackends)
+ {
+ /* PGPROC for walsender, add to walsenderFreeProcs list */
+ procs[i].links.next = (SHM_QUEUE *) ProcGlobal->walsenderFreeProcs;
+ ProcGlobal->walsenderFreeProcs = &procs[i];
+ procs[i].procgloballist = &ProcGlobal->walsenderFreeProcs;
+ }
+
+ /* Initialize myProcLocks[] shared memory queues. */
+ for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ SHMQueueInit(&(procs[i].myProcLocks[j]));
+
+ /* Initialize lockGroupMembers list. */
+ dlist_init(&procs[i].lockGroupMembers);
+
+ /*
+		 * Initialize the atomic variables; otherwise, it won't be safe to
+ * access them for backends that aren't currently in use.
+ */
+ pg_atomic_init_u32(&(procs[i].procArrayGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u32(&(procs[i].clogGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u64(&(procs[i].waitStart), 0);
+ }
+
+ /*
+ * Save pointers to the blocks of PGPROC structures reserved for auxiliary
+ * processes and prepared transactions.
+ */
+ AuxiliaryProcs = &procs[MaxBackends];
+ PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS];
+
+ /* Create ProcStructLock spinlock, too */
+ ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t));
+ SpinLockInit(ProcStructLock);
+}
+
+/*
+ * InitProcess -- initialize a per-process data structure for this backend
+ */
+void
+InitProcess(void)
+{
+ PGPROC *volatile *procgloballist;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /* Decide which list should supply our PGPROC. */
+ if (IsAnyAutoVacuumProcess())
+ procgloballist = &ProcGlobal->autovacFreeProcs;
+ else if (IsBackgroundWorker)
+ procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ else if (am_walsender)
+ procgloballist = &ProcGlobal->walsenderFreeProcs;
+ else
+ procgloballist = &ProcGlobal->freeProcs;
+
+ /*
+ * Try to get a proc struct from the appropriate free list. If this
+ * fails, we must be out of PGPROC structures (not to mention semaphores).
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ MyProc = *procgloballist;
+
+ if (MyProc != NULL)
+ {
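+		/* Pop the PGPROC we grabbed off the head of its free list. */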
+ *procgloballist = (PGPROC *) MyProc->links.next;
+ SpinLockRelease(ProcStructLock);
+ }
+ else
+ {
+ /*
+ * If we reach here, all the PGPROCs are in use. This is one of the
+ * possible places to detect "too many backends", so give the standard
+ * error message. XXX do we need to give a different failure message
+ * in the autovacuum case?
+ */
+ SpinLockRelease(ProcStructLock);
+ if (am_walsender)
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)",
+ max_wal_senders)));
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Cross-check that the PGPROC is of the type we expect; if this were not
+ * the case, it would get returned to the wrong list.
+ */
+ Assert(MyProc->procgloballist == procgloballist);
+
+ /*
+ * Now that we have a PGPROC, mark ourselves as an active postmaster
+ * child; this is so that the postmaster can detect it if we exit without
+ * cleaning up. (XXX autovac launcher currently doesn't participate in
+ * this; it probably should.)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildActive();
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ SHMQueueElemInit(&(MyProc->links));
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->pid = MyProcPid;
+ /* backendId, databaseId and roleId will be filled in later */
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkpt = 0;
+ MyProc->statusFlags = 0;
+ /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
+ if (IsAutoVacuumWorkerProcess())
+ MyProc->statusFlags |= PROC_IS_AUTOVACUUM;
+ MyProc->lwWaiting = false;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+ MyProc->recoveryConflictPending = false;
+
+ /* Initialize fields for sync rep */
+ MyProc->waitLSN = 0;
+ MyProc->syncRepState = SYNC_REP_NOT_WAITING;
+ SHMQueueElemInit(&(MyProc->syncRepLinks));
+
+ /* Initialize fields for group XID clearing. */
+ MyProc->procArrayGroupMember = false;
+ MyProc->procArrayGroupMemberXid = InvalidTransactionId;
+ Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /* Initialize wait event information. */
+ MyProc->wait_event_info = 0;
+
+ /* Initialize fields for group transaction status update. */
+ MyProc->clogGroupMember = false;
+ MyProc->clogGroupMemberXid = InvalidTransactionId;
+ MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS;
+ MyProc->clogGroupMemberPage = -1;
+ MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
+ Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+	 * points to the process-local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at backend exit.
+ */
+ on_shmem_exit(ProcKill, 0);
+
+ /*
+ * Now that we have a PGPROC, we could try to acquire locks, so initialize
+ * local state needed for LWLocks, and the deadlock checker.
+ */
+ InitLWLockAccess();
+ InitDeadLockChecking();
+}
+
+/*
+ * InitProcessPhase2 -- make MyProc visible in the shared ProcArray.
+ *
+ * This is separate from InitProcess because we can't acquire LWLocks until
+ * we've created a PGPROC, but in the EXEC_BACKEND case ProcArrayAdd won't
+ * work until after we've done CreateSharedMemoryAndSemaphores.
+ */
+void
+InitProcessPhase2(void)
+{
+ Assert(MyProc != NULL);
+
+ /*
+ * Add our PGPROC to the PGPROC array in shared memory.
+ */
+ ProcArrayAdd(MyProc);
+
+ /*
+ * Arrange to clean that up at backend exit.
+ */
+ on_shmem_exit(RemoveProcFromArray, 0);
+}
+
+/*
+ * InitAuxiliaryProcess -- create a per-auxiliary-process data structure
+ *
+ * This is called by bgwriter and similar processes so that they will have a
+ * MyProc value that's real enough to let them wait for LWLocks. The PGPROC
+ * and sema that are assigned are one of the extra ones created during
+ * InitProcGlobal.
+ *
+ * Auxiliary processes are presently not expected to wait for real (lockmgr)
+ * locks, so we need not set up the deadlock checker. They are never added
+ * to the ProcArray or the sinval messaging mechanism, either. They also
+ * don't get a VXID assigned, since this is only useful when we actually
+ * hold lockmgr locks.
+ *
+ * Startup process however uses locks but never waits for them in the
+ * normal backend sense. Startup process also takes part in sinval messaging
+ * as a sendOnly process, so never reads messages from sinval queue. So
+ * Startup process does have a VXID and does show up in pg_locks.
+ */
+void
+InitAuxiliaryProcess(void)
+{
+ PGPROC *auxproc;
+ int proctype;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL || AuxiliaryProcs == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /*
+ * We use the ProcStructLock to protect assignment and releasing of
+ * AuxiliaryProcs entries.
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ /*
+ * Find a free auxproc ... *big* trouble if there isn't one ...
+ */
+ for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++)
+ {
+ auxproc = &AuxiliaryProcs[proctype];
+ if (auxproc->pid == 0)
+ break;
+ }
+ if (proctype >= NUM_AUXILIARY_PROCS)
+ {
+ SpinLockRelease(ProcStructLock);
+ elog(FATAL, "all AuxiliaryProcs are in use");
+ }
+
+ /* Mark auxiliary proc as in use by me */
+ /* use volatile pointer to prevent code rearrangement */
+ ((volatile PGPROC *) auxproc)->pid = MyProcPid;
+
+ MyProc = auxproc;
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ SHMQueueElemInit(&(MyProc->links));
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkpt = 0;
+ MyProc->statusFlags = 0;
+ MyProc->lwWaiting = false;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+	 * points to the process-local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at process exit.
+ */
+ on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype));
+}
+
+/*
+ * Record the PID and PGPROC structures for the Startup process, for use in
+ * ProcSendSignal(). See comments there for further explanation.
+ */
+void
+PublishStartupProcessInformation(void)
+{
+ SpinLockAcquire(ProcStructLock);
+
+ ProcGlobal->startupProc = MyProc;
+ ProcGlobal->startupProcPid = MyProcPid;
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * Used from bufmgr to share the value of the buffer that Startup waits on,
+ * or to reset the value to "not waiting" (-1). This allows processing
+ * of recovery conflicts for buffer pins. Set is made before backends look
+ * at this value, so locking not required, especially since the set is
+ * an atomic integer set operation.
+ */
+void
+SetStartupBufferPinWaitBufId(int bufid)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ procglobal->startupBufferPinWaitBufId = bufid;
+}
+
+/*
+ * Used by backends when they receive a request to check for buffer pin waits.
+ */
+int
+GetStartupBufferPinWaitBufId(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ return procglobal->startupBufferPinWaitBufId;
+}
+
+/*
+ * Check whether there are at least N free PGPROC objects.
+ *
+ * Note: this is designed on the assumption that N will generally be small.
+ */
+bool
+HaveNFreeProcs(int n)
+{
+ PGPROC *proc;
+
+ SpinLockAcquire(ProcStructLock);
+
+ proc = ProcGlobal->freeProcs;
+
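+	/* Walk the free list, counting entries until we've seen n or run out. */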
+ while (n > 0 && proc != NULL)
+ {
+ proc = (PGPROC *) proc->links.next;
+ n--;
+ }
+
+ SpinLockRelease(ProcStructLock);
+
+ return (n <= 0);
+}
+
+/*
+ * Check if the current process is awaiting a lock.
+ */
+bool
+IsWaitingForLock(void)
+{
+ if (lockAwaited == NULL)
+ return false;
+
+ return true;
+}
+
+/*
+ * Cancel any pending wait for lock, when aborting a transaction, and revert
+ * any strong lock count acquisition for a lock being acquired.
+ *
+ * (Normally, this would only happen if we accept a cancel/die
+ * interrupt while waiting; but an ereport(ERROR) before or during the lock
+ * wait is within the realm of possibility, too.)
+ */
+void
+LockErrorCleanup(void)
+{
+ LWLock *partitionLock;
+ DisableTimeoutParams timeouts[2];
+
+ HOLD_INTERRUPTS();
+
+ AbortStrongLockAcquire();
+
+ /* Nothing to do if we weren't waiting for a lock */
+ if (lockAwaited == NULL)
+ {
+ RESUME_INTERRUPTS();
+ return;
+ }
+
+ /*
+ * Turn off the deadlock and lock timeout timers, if they are still
+ * running (see ProcSleep). Note we must preserve the LOCK_TIMEOUT
+ * indicator flag, since this function is executed before
+ * ProcessInterrupts when responding to SIGINT; else we'd lose the
+ * knowledge that the SIGINT came from a lock timeout and not an external
+ * source.
+ */
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+
+ /* Unlink myself from the wait queue, if on it (might not be anymore!) */
+ partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ if (MyProc->links.next != NULL)
+ {
+ /* We could not have been granted the lock yet */
+ RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
+ }
+ else
+ {
+ /*
+ * Somebody kicked us off the lock queue already. Perhaps they
+ * granted us the lock, or perhaps they detected a deadlock. If they
+ * did grant us the lock, we'd better remember it in our local lock
+ * table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+ }
+
+ lockAwaited = NULL;
+
+ LWLockRelease(partitionLock);
+
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * ProcReleaseLocks() -- release locks associated with current transaction
+ * at main transaction commit or abort
+ *
+ * At main transaction commit, we release standard locks except session locks.
+ * At main transaction abort, we release all locks including session locks.
+ *
+ * Advisory locks are released only if they are transaction-level;
+ * session-level holds remain, whether this is a commit or not.
+ *
+ * At subtransaction commit, we don't release any locks (so this func is not
+ * needed at all); we will defer the releasing to the parent transaction.
+ * At subtransaction abort, we release all locks held by the subtransaction;
+ * this is implemented by retail releasing of the locks under control of
+ * the ResourceOwner mechanism.
+ */
+void
+ProcReleaseLocks(bool isCommit)
+{
+ if (!MyProc)
+ return;
+ /* If waiting, get off wait queue (should only be needed after error) */
+ LockErrorCleanup();
+ /* Release standard locks, including session-level if aborting */
+ LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit);
+ /* Release transaction-level advisory locks */
+ LockReleaseAll(USER_LOCKMETHOD, false);
+}
+
+
+/*
+ * RemoveProcFromArray() -- Remove this process from the shared ProcArray.
+ */
+static void
+RemoveProcFromArray(int code, Datum arg)
+{
+ Assert(MyProc != NULL);
+ ProcArrayRemove(MyProc, InvalidTransactionId);
+}
+
+/*
+ * ProcKill() -- Destroy the per-proc data structure for
+ * this process. Release any of its held LW locks.
+ */
+static void
+ProcKill(int code, Datum arg)
+{
+ PGPROC *proc;
+ PGPROC *volatile *procgloballist;
+
+ Assert(MyProc != NULL);
+
+ /* Make sure we're out of the sync rep lists */
+ SyncRepCleanupAtProcExit();
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Release any LW locks I am holding. There really shouldn't be any, but
+ * it's cheap to check again before we cut the knees off the LWLock
+ * facility by releasing our PGPROC ...
+ */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /* Make sure active replication slots are released */
+ if (MyReplicationSlot != NULL)
+ ReplicationSlotRelease();
+
+ /* Also cleanup all the temporary slots. */
+ ReplicationSlotCleanup();
+
+ /*
+ * Detach from any lock group of which we are a member. If the leader
+	 * exits before all other group members, its PGPROC will remain allocated
+ * until the last group process exits; that process must return the
+ * leader's PGPROC to the appropriate list.
+ */
+ if (MyProc->lockGroupLeader != NULL)
+ {
+ PGPROC *leader = MyProc->lockGroupLeader;
+ LWLock *leader_lwlock = LockHashPartitionLockByProc(leader);
+
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ Assert(!dlist_is_empty(&leader->lockGroupMembers));
+ dlist_delete(&MyProc->lockGroupLink);
+ if (dlist_is_empty(&leader->lockGroupMembers))
+ {
+ leader->lockGroupLeader = NULL;
+ if (leader != MyProc)
+ {
+ procgloballist = leader->procgloballist;
+
+ /* Leader exited first; return its PGPROC. */
+ SpinLockAcquire(ProcStructLock);
+ leader->links.next = (SHM_QUEUE *) *procgloballist;
+ *procgloballist = leader;
+ SpinLockRelease(ProcStructLock);
+ }
+ }
+ else if (leader != MyProc)
+ MyProc->lockGroupLeader = NULL;
+ LWLockRelease(leader_lwlock);
+ }
+
+ /*
+ * Reset MyLatch to the process local one. This is so that signal
+ * handlers et al can continue using the latch after the shared latch
+ * isn't ours anymore.
+ *
+ * Similarly, stop reporting wait events to MyProc->wait_event_info.
+ *
+ * After that clear MyProc and disown the shared latch.
+ */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ procgloballist = proc->procgloballist;
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * If we're still a member of a locking group, that means we're a leader
+ * which has somehow exited before its children. The last remaining child
+ * will release our PGPROC. Otherwise, release it now.
+ */
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Since lockGroupLeader is NULL, lockGroupMembers should be empty. */
+ Assert(dlist_is_empty(&proc->lockGroupMembers));
+
+ /* Return PGPROC structure (and semaphore) to appropriate freelist */
+ proc->links.next = (SHM_QUEUE *) *procgloballist;
+ *procgloballist = proc;
+ }
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * This process is no longer present in shared memory in any meaningful
+ * way, so tell the postmaster we've cleaned up acceptably well. (XXX
+ * autovac launcher should be included here someday)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildInactive();
+
+ /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */
+ if (AutovacuumLauncherPid != 0)
+ kill(AutovacuumLauncherPid, SIGUSR2);
+}
+
+/*
+ * AuxiliaryProcKill() -- Cut-down version of ProcKill for auxiliary
+ * processes (bgwriter, etc). The PGPROC and sema are not released, only
+ * marked as not-in-use.
+ */
+static void
+AuxiliaryProcKill(int code, Datum arg)
+{
+ int proctype = DatumGetInt32(arg);
+ PGPROC *auxproc PG_USED_FOR_ASSERTS_ONLY;
+ PGPROC *proc;
+
+ Assert(proctype >= 0 && proctype < NUM_AUXILIARY_PROCS);
+
+ auxproc = &AuxiliaryProcs[proctype];
+
+ Assert(MyProc == auxproc);
+
+ /* Release any LW locks I am holding (see notes above) */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /* look at the equivalent ProcKill() code for comments */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ SpinLockAcquire(ProcStructLock);
+
+ /* Mark auxiliary proc no longer in use */
+ proc->pid = 0;
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * AuxiliaryPidGetProc -- get PGPROC for an auxiliary process
+ * given its PID
+ *
+ * Returns NULL if not found.
+ */
+PGPROC *
+AuxiliaryPidGetProc(int pid)
+{
+ PGPROC *result = NULL;
+ int index;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ for (index = 0; index < NUM_AUXILIARY_PROCS; index++)
+ {
+ PGPROC *proc = &AuxiliaryProcs[index];
+
+ if (proc->pid == pid)
+ {
+ result = proc;
+ break;
+ }
+ }
+ return result;
+}
+
+/*
+ * ProcQueue package: routines for putting processes to sleep
+ * and waking them up
+ */
+
+/*
+ * ProcQueueAlloc -- alloc/attach to a shared memory process queue
+ *
+ * Returns: a pointer to the queue
+ * Side Effects: Initializes the queue if it wasn't there before
+ */
+#ifdef NOT_USED
+PROC_QUEUE *
+ProcQueueAlloc(const char *name)
+{
+ PROC_QUEUE *queue;
+ bool found;
+
+ queue = (PROC_QUEUE *)
+ ShmemInitStruct(name, sizeof(PROC_QUEUE), &found);
+
+ if (!found)
+ ProcQueueInit(queue);
+
+ return queue;
+}
+#endif
+
+/*
+ * ProcQueueInit -- initialize a shared memory process queue
+ */
+void
+ProcQueueInit(PROC_QUEUE *queue)
+{
+ SHMQueueInit(&(queue->links));
+ queue->size = 0;
+}
+
+
+/*
+ * ProcSleep -- put a process to sleep on the specified lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process (under all XIDs).
+ *
+ * The lock table's partition lock must be held at entry, and will be held
+ * at exit.
+ *
+ * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock).
+ *
+ * ASSUME: that no one will fiddle with the queue until after
+ * we release the partition lock.
+ *
+ * NOTES: The process queue is now a priority queue for locking.
+ */
+ProcWaitStatus
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
+{
+ LOCKMODE lockmode = locallock->tag.mode;
+ LOCK *lock = locallock->lock;
+ PROCLOCK *proclock = locallock->proclock;
+ uint32 hashcode = locallock->hashcode;
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ LOCKMASK myHeldLocks = MyProc->heldLocks;
+ TimestampTz standbyWaitStart = 0;
+ bool early_deadlock = false;
+ bool allow_autovacuum_cancel = true;
+ bool logged_recovery_conflict = false;
+ ProcWaitStatus myWaitStatus;
+ PGPROC *proc;
+ PGPROC *leader = MyProc->lockGroupLeader;
+ int i;
+
+ /*
+ * If group locking is in use, locks held by members of my locking group
+ * need to be included in myHeldLocks. This is not required for relation
+ * extension or page locks which conflict among group members. However,
+ * including them in myHeldLocks gives group members priority over other
+ * backends that are also trying to acquire those locks. OTOH, we could
+ * avoid giving priority to group members for those lock types, but there
+ * doesn't appear to be a clear advantage in doing so.
+ */
+ if (leader != NULL)
+ {
+ SHM_QUEUE *procLocks = &(lock->procLocks);
+ PROCLOCK *otherproclock;
+
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink));
+ while (otherproclock != NULL)
+ {
+ if (otherproclock->groupLeader == leader)
+ myHeldLocks |= otherproclock->holdMask;
+ otherproclock = (PROCLOCK *)
+ SHMQueueNext(procLocks, &otherproclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+ }
+
+ /*
+ * Determine where to add myself in the wait queue.
+ *
+ * Normally I should go at the end of the queue. However, if I already
+ * hold locks that conflict with the request of any previous waiter, put
+ * myself in the queue just in front of the first such waiter. This is not
+ * a necessary step, since deadlock detection would move me to before that
+ * waiter anyway; but it's relatively cheap to detect such a conflict
+ * immediately, and avoid delaying till deadlock timeout.
+ *
+ * Special case: if I find I should go in front of some waiter, check to
+ * see if I conflict with already-held locks or the requests before that
+ * waiter. If not, then just grant myself the requested lock immediately.
+ * This is the same as the test for immediate grant in LockAcquire, except
+ * we are only considering the part of the wait queue before my insertion
+ * point.
+ */
+ if (myHeldLocks != 0)
+ {
+ LOCKMASK aheadRequests = 0;
+
+ proc = (PGPROC *) waitQueue->links.next;
+ for (i = 0; i < waitQueue->size; i++)
+ {
+ /*
+ * If we're part of the same locking group as this waiter, its
+ * locks neither conflict with ours nor contribute to
+ * aheadRequests.
+ */
+ if (leader != NULL && leader == proc->lockGroupLeader)
+ {
+ proc = (PGPROC *) proc->links.next;
+ continue;
+ }
+ /* Must he wait for me? */
+ if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
+ {
+ /* Must I wait for him? */
+ if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
+ {
+ /*
+ * Yes, so we have a deadlock. Easiest way to clean up
+ * correctly is to call RemoveFromWaitQueue(), but we
+ * can't do that until we are *on* the wait queue. So, set
+ * a flag to check below, and break out of loop. Also,
+ * record deadlock info for later message.
+ */
+ RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
+ early_deadlock = true;
+ break;
+ }
+ /* I must go before this waiter. Check special case. */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proclock))
+ {
+ /* Skip the wait and just grant myself the lock. */
+ GrantLock(lock, proclock, lockmode);
+ GrantAwaitedLock();
+ return PROC_WAIT_STATUS_OK;
+ }
+ /* Break out of loop to put myself before him */
+ break;
+ }
+ /* Nope, so advance to next waiter */
+ aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
+ proc = (PGPROC *) proc->links.next;
+ }
+
+ /*
+ * If we fall out of loop normally, proc points to waitQueue head, so
+ * we will insert at tail of queue as desired.
+ */
+ }
+ else
+ {
+ /* I hold no locks, so I can't push in front of anyone. */
+ proc = (PGPROC *) &(waitQueue->links);
+ }
+
+ /*
+ * Insert self into queue, ahead of the given proc (or at tail of queue).
+ */
+ SHMQueueInsertBefore(&(proc->links), &(MyProc->links));
+ waitQueue->size++;
+
+ lock->waitMask |= LOCKBIT_ON(lockmode);
+
+ /* Set up wait information in PGPROC object, too */
+ MyProc->waitLock = lock;
+ MyProc->waitProcLock = proclock;
+ MyProc->waitLockMode = lockmode;
+
+ MyProc->waitStatus = PROC_WAIT_STATUS_WAITING;
+
+ /*
+ * If we detected deadlock, give up without waiting. This must agree with
+ * CheckDeadLock's recovery code.
+ */
+ if (early_deadlock)
+ {
+ RemoveFromWaitQueue(MyProc, hashcode);
+ return PROC_WAIT_STATUS_ERROR;
+ }
+
+ /* mark that we are waiting for a lock */
+ lockAwaited = locallock;
+
+ /*
+ * Release the lock table's partition lock.
+ *
+ * NOTE: this may also cause us to exit critical-section state, possibly
+ * allowing a cancel/die interrupt to be accepted. This is OK because we
+ * have recorded the fact that we are waiting for a lock, and so
+ * LockErrorCleanup will clean up if cancel/die happens.
+ */
+ LWLockRelease(partitionLock);
+
+ /*
+ * Also, now that we will successfully clean up after an ereport, it's
+ * safe to check to see if there's a buffer pin deadlock against the
+ * Startup process. Of course, that's only necessary if we're doing Hot
+ * Standby and are not the Startup process ourselves.
+ */
+ if (RecoveryInProgress() && !InRecovery)
+ CheckRecoveryConflictDeadlock();
+
+ /* Reset deadlock_state before enabling the timeout handler */
+ deadlock_state = DS_NOT_YET_CHECKED;
+ got_deadlock_timeout = false;
+
+ /*
+ * Set timer so we can wake up after a while and check for a deadlock. If a
+ * deadlock is detected, the handler sets MyProc->waitStatus =
+ * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure
+ * rather than success.
+ *
+ * By delaying the check until we've waited for a bit, we can avoid
+ * running the rather expensive deadlock-check code in most cases.
+ *
+ * If LockTimeout is set, also enable the timeout for that. We can save a
+ * few cycles by enabling both timeout sources in one call.
+ *
+ * If InHotStandby we set lock waits slightly later for clarity with other
+ * code.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ EnableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].type = TMPARAM_AFTER;
+ timeouts[0].delay_ms = DeadlockTimeout;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].type = TMPARAM_AFTER;
+ timeouts[1].delay_ms = LockTimeout;
+ enable_timeouts(timeouts, 2);
+ }
+ else
+ enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+
+ /*
+ * Use the current time obtained for the deadlock timeout timer as
+ * waitStart (i.e., the time when this process started waiting for the
+ * lock). Since fetching the current time again would add overhead, we
+ * simply reuse the timestamp already obtained for the timer.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead of an additional lock
+ * acquisition. This can cause "waitstart" in pg_locks to appear NULL for
+ * a very short period after the wait has started even though "granted"
+ * is false. This is acceptable in practice because users are only likely
+ * to look at "waitstart" once they have been waiting on the lock for a
+ * while.
+ */
+ pg_atomic_write_u64(&MyProc->waitStart,
+ get_timeout_start_time(DEADLOCK_TIMEOUT));
+ }
+ else if (log_recovery_conflict_waits)
+ {
+ /*
+ * Set the wait start timestamp if logging is enabled and in hot
+ * standby.
+ */
+ standbyWaitStart = GetCurrentTimestamp();
+ }
+
+ /*
+ * If somebody wakes us between LWLockRelease and WaitLatch, the latch is
+ * already set and WaitLatch returns immediately. But a set latch does not
+ * necessarily mean that the lock is free now: there are many other sources
+ * of latch sets besides somebody releasing the lock.
+ *
+ * We process interrupts whenever the latch has been set, so cancel/die
+ * interrupts are processed quickly. This means we must not mind losing
+ * control to a cancel/die interrupt here. We don't, because we have no
+ * shared-state-change work to do after being granted the lock (the
+ * grantor did it all). We do have to worry about canceling the deadlock
+ * timeout and updating the locallock table, but if we lose control to an
+ * error, LockErrorCleanup will fix that up.
+ */
+ do
+ {
+ if (InHotStandby)
+ {
+ bool maybe_log_conflict =
+ (standbyWaitStart != 0 && !logged_recovery_conflict);
+
+ /* Set a timer and wait for that or for the lock to be granted */
+ ResolveRecoveryConflictWithLock(locallock->tag.lock,
+ maybe_log_conflict);
+
+ /*
+ * Emit the log message if the startup process is waiting longer
+ * than deadlock_timeout for recovery conflict on lock.
+ */
+ if (maybe_log_conflict)
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (TimestampDifferenceExceeds(standbyWaitStart, now,
+ DeadlockTimeout))
+ {
+ VirtualTransactionId *vxids;
+ int cnt;
+
+ vxids = GetLockConflicts(&locallock->tag.lock,
+ AccessExclusiveLock, &cnt);
+
+ /*
+ * Log the recovery conflict and the list of PIDs of
+ * backends holding the conflicting lock. Note that we do
+ * logging even if there are no such backends right now
+ * because the startup process here has already waited
+ * longer than deadlock_timeout.
+ */
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, now,
+ cnt > 0 ? vxids : NULL, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+ else
+ {
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+ ResetLatch(MyLatch);
+ /* check for deadlocks first, as that's probably log-worthy */
+ if (got_deadlock_timeout)
+ {
+ CheckDeadLock();
+ got_deadlock_timeout = false;
+ }
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * waitStatus could change from PROC_WAIT_STATUS_WAITING to something
+ * else asynchronously. Read it just once per loop to prevent
+ * surprising behavior (such as missing log messages).
+ */
+ myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus);
+
+ /*
+ * If we are not deadlocked, but are waiting on an autovacuum-induced
+ * task, send a signal to interrupt it.
+ */
+ if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
+ {
+ PGPROC *autovac = GetBlockingAutoVacuumPgproc();
+ uint8 statusFlags;
+ uint8 lockmethod_copy;
+ LOCKTAG locktag_copy;
+
+ /*
+ * Grab info we need, then release lock immediately. Note this
+ * coding means that there is a tiny chance that the process
+ * terminates its current transaction and starts a different one
+ * before we have a chance to send the signal; the worst possible
+ * consequence is that a for-wraparound vacuum is cancelled. But
+ * that could happen in any case unless we were to do kill() with
+ * the lock held, which is much more undesirable.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff];
+ lockmethod_copy = lock->tag.locktag_lockmethodid;
+ locktag_copy = lock->tag;
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Only do it if the worker is not working to protect against Xid
+ * wraparound.
+ */
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ !(statusFlags & PROC_VACUUM_FOR_WRAPAROUND))
+ {
+ int pid = autovac->pid;
+
+ /* report the case, if configured to do so */
+ if (message_level_is_interesting(DEBUG1))
+ {
+ StringInfoData locktagbuf;
+ StringInfoData logbuf; /* errdetail for server log */
+
+ initStringInfo(&locktagbuf);
+ initStringInfo(&logbuf);
+ DescribeLockTag(&locktagbuf, &locktag_copy);
+ appendStringInfo(&logbuf,
+ "Process %d waits for %s on %s.",
+ MyProcPid,
+ GetLockmodeName(lockmethod_copy, lockmode),
+ locktagbuf.data);
+
+ ereport(DEBUG1,
+ (errmsg_internal("sending cancel to blocking autovacuum PID %d",
+ pid),
+ errdetail_log("%s", logbuf.data)));
+
+ pfree(locktagbuf.data);
+ pfree(logbuf.data);
+ }
+
+ /* send the autovacuum worker Back to Old Kent Road */
+ if (kill(pid, SIGINT) < 0)
+ {
+ /*
+ * There's a race condition here: once we release the
+ * ProcArrayLock, it's possible for the autovac worker to
+ * close up shop and exit before we can do the kill().
+ * Therefore, we do not whinge about no-such-process.
+ * Other errors such as EPERM could conceivably happen if
+ * the kernel recycles the PID fast enough, but such cases
+ * seem improbable enough that it's probably best to issue
+ * a warning if we see some other errno.
+ */
+ if (errno != ESRCH)
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m",
+ pid)));
+ }
+ }
+
+ /* prevent signal from being sent again more than once */
+ allow_autovacuum_cancel = false;
+ }
+
+ /*
+ * If awoken after the deadlock check interrupt has run, and
+ * log_lock_waits is on, then report about the wait.
+ */
+ if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
+ {
+ StringInfoData buf,
+ lock_waiters_sbuf,
+ lock_holders_sbuf;
+ const char *modename;
+ long secs;
+ int usecs;
+ long msecs;
+ SHM_QUEUE *procLocks;
+ PROCLOCK *proclock;
+ bool first_holder = true,
+ first_waiter = true;
+ int lockHoldersNum = 0;
+
+ initStringInfo(&buf);
+ initStringInfo(&lock_waiters_sbuf);
+ initStringInfo(&lock_holders_sbuf);
+
+ DescribeLockTag(&buf, &locallock->tag.lock);
+ modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
+ lockmode);
+ TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
+ GetCurrentTimestamp(),
+ &secs, &usecs);
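+ /*
+ * Convert the elapsed time to whole milliseconds plus a sub-millisecond
+ * remainder in microseconds, matching the "%ld.%03d ms" format used in
+ * the log messages below.
+ */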
+ msecs = secs * 1000 + usecs / 1000;
+ usecs = usecs % 1000;
+
+ /*
+ * We loop over the lock's procLocks to gather a list of all holders and
+ * waiters, so that we can provide more detailed information for lock
+ * debugging purposes.
+ *
+ * lock->procLocks contains all processes which hold or wait for
+ * this lock.
+ */
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ procLocks = &(lock->procLocks);
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks,
+ offsetof(PROCLOCK, lockLink));
+
+ while (proclock)
+ {
+ /*
+ * we are a waiter if myProc->waitProcLock == proclock; we are
+ * a holder if it is NULL or something different
+ */
+ if (proclock->tag.myProc->waitProcLock == proclock)
+ {
+ if (first_waiter)
+ {
+ appendStringInfo(&lock_waiters_sbuf, "%d",
+ proclock->tag.myProc->pid);
+ first_waiter = false;
+ }
+ else
+ appendStringInfo(&lock_waiters_sbuf, ", %d",
+ proclock->tag.myProc->pid);
+ }
+ else
+ {
+ if (first_holder)
+ {
+ appendStringInfo(&lock_holders_sbuf, "%d",
+ proclock->tag.myProc->pid);
+ first_holder = false;
+ }
+ else
+ appendStringInfo(&lock_holders_sbuf, ", %d",
+ proclock->tag.myProc->pid);
+
+ lockHoldersNum++;
+ }
+
+ proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink,
+ offsetof(PROCLOCK, lockLink));
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (deadlock_state == DS_SOFT_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * This message is a bit redundant with the error that will be
+ * reported subsequently, but in some cases the error report
+ * might not make it to the log (eg, if it's caught by an
+ * exception handler), and we want to ensure all long-wait
+ * events get logged.
+ */
+ ereport(LOG,
+ (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ if (myWaitStatus == PROC_WAIT_STATUS_WAITING)
+ ereport(LOG,
+ (errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (myWaitStatus == PROC_WAIT_STATUS_OK)
+ ereport(LOG,
+ (errmsg("process %d acquired %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs)));
+ else
+ {
+ Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR);
+
+ /*
+ * Currently, the deadlock checker always kicks its own
+ * process, which means that we'll only see
+ * PROC_WAIT_STATUS_ERROR when deadlock_state ==
+ * DS_HARD_DEADLOCK, and there's no need to print redundant
+ * messages. But for completeness and future-proofing, print
+ * a message if it looks like someone else kicked us off the
+ * lock.
+ */
+ if (deadlock_state != DS_HARD_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ /*
+ * At this point we might still need to wait for the lock. Reset
+ * state so we don't print the above messages again.
+ */
+ deadlock_state = DS_NO_DEADLOCK;
+
+ pfree(buf.data);
+ pfree(lock_holders_sbuf.data);
+ pfree(lock_waiters_sbuf.data);
+ }
+ } while (myWaitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /*
+ * Disable the timers, if they are still running. As in LockErrorCleanup,
+ * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
+ * already caused QueryCancelPending to become set, we want the cancel to
+ * be reported as a lock timeout, not a user cancel.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ DisableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+ }
+ else
+ disable_timeout(DEADLOCK_TIMEOUT, false);
+ }
+
+ /*
+ * Emit the log message if recovery conflict on lock was resolved but the
+ * startup process waited longer than deadlock_timeout for it.
+ */
+ if (InHotStandby && logged_recovery_conflict)
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /*
+ * Re-acquire the lock table's partition lock. We have to do this to hold
+ * off cancel/die interrupts before we can mess with lockAwaited (else we
+ * might have a missed or duplicated locallock update).
+ */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * We no longer want LockErrorCleanup to do anything.
+ */
+ lockAwaited = NULL;
+
+ /*
+ * If we got the lock, be sure to remember it in the locallock table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+
+ /*
+ * We don't have to do anything else, because the awaker did all the
+ * necessary update of the lock table and MyProc.
+ */
+ return MyProc->waitStatus;
+}
+
+
+/*
+ * ProcWakeup -- wake up a process by setting its latch.
+ *
+ * Also remove the process from the wait queue and set its links invalid.
+ * RETURN: the next process in the wait queue.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ *
+ * XXX: presently, this code is only used for the "success" case, and only
+ * works correctly for that case. To clean up in failure case, would need
+ * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
+ * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK.
+ */
+PGPROC *
+ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus)
+{
+ PGPROC *retProc;
+
+ /* Proc should be sleeping ... */
+ if (proc->links.prev == NULL ||
+ proc->links.next == NULL)
+ return NULL;
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /* Save next process before we zap the list link */
+ retProc = (PGPROC *) proc->links.next;
+
+ /* Remove process from wait queue */
+ SHMQueueDelete(&(proc->links));
+ (proc->waitLock->waitProcs.size)--;
+
+ /* Clean up process' state and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = waitStatus;
+ pg_atomic_write_u64(&proc->waitStart, 0);
+
+ /* And awaken it */
+ SetLatch(&proc->procLatch);
+
+ return retProc;
+}
+
+/*
+ * ProcLockWakeup -- routine for waking up processes when a lock is
+ * released (or a prior waiter is aborted). Scan all waiters
+ * for lock, waken any that are no longer blocked.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ */
+void
+ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
+{
+ PROC_QUEUE *waitQueue = &(lock->waitProcs);
+ int queue_size = waitQueue->size;
+ PGPROC *proc;
+ LOCKMASK aheadRequests = 0;
+
+ Assert(queue_size >= 0);
+
+ if (queue_size == 0)
+ return;
+
+ proc = (PGPROC *) waitQueue->links.next;
+
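+ /*
+ * aheadRequests accumulates the modes requested by waiters we could not
+ * wake; a later waiter is woken only if its request conflicts neither
+ * with those earlier requests nor with already-held locks, preserving
+ * the queue order among conflicting requests.
+ */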
+ while (queue_size-- > 0)
+ {
+ LOCKMODE lockmode = proc->waitLockMode;
+
+ /*
+ * Waken if (a) doesn't conflict with requests of earlier waiters, and
+ * (b) doesn't conflict with already-held locks.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proc->waitProcLock))
+ {
+ /* OK to waken */
+ GrantLock(lock, proc->waitProcLock, lockmode);
+ proc = ProcWakeup(proc, PROC_WAIT_STATUS_OK);
+
+ /*
+ * ProcWakeup removes proc from the lock's waiting process queue
+ * and returns the next proc in chain; don't use proc's next-link,
+ * because it's been cleared.
+ */
+ }
+ else
+ {
+ /*
+ * Cannot wake this guy. Remember his request for later checks.
+ */
+ aheadRequests |= LOCKBIT_ON(lockmode);
+ proc = (PGPROC *) proc->links.next;
+ }
+ }
+
+ Assert(waitQueue->size >= 0);
+}
+
+/*
+ * CheckDeadLock
+ *
+ * We only get to this routine if DEADLOCK_TIMEOUT fired while waiting for a
+ * lock to be released by some other process. Check if there's a deadlock; if
+ * not, just return. (But signal ProcSleep to log a message, if
+ * log_lock_waits is true.) If we have a real deadlock, remove ourselves from
+ * the lock's wait queue and signal an error to ProcSleep.
+ */
+static void
+CheckDeadLock(void)
+{
+ int i;
+
+ /*
+ * Acquire exclusive lock on the entire shared lock data structures. Must
+ * grab LWLocks in partition-number order to avoid LWLock deadlock.
+ *
+ * Note that the deadlock check interrupt had better not be enabled
+ * anywhere that this process itself holds lock partition locks, else this
+ * will wait forever. Also note that LWLockAcquire creates a critical
+ * section, so that this routine cannot be interrupted by cancel/die
+ * interrupts.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+
+ /*
+ * Check to see if we've been awoken by anyone in the interim.
+ *
+ * If we have, we can return and resume our transaction -- happy day.
+ * Before we are awoken the process releasing the lock grants it to us so
+ * we know that we don't have to wait anymore.
+ *
+ * We check by looking to see if we've been unlinked from the wait queue.
+ * This is safe because we hold the lock partition lock.
+ */
+ if (MyProc->links.prev == NULL ||
+ MyProc->links.next == NULL)
+ goto check_done;
+
+#ifdef LOCK_DEBUG
+ if (Debug_deadlocks)
+ DumpAllLocks();
+#endif
+
+ /* Run the deadlock check, and set deadlock_state for use by ProcSleep */
+ deadlock_state = DeadLockCheck(MyProc);
+
+ if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * Oops. We have a deadlock.
+ *
+ * Get this process out of wait state. (Note: we could do this more
+ * efficiently by relying on lockAwaited, but use this coding to
+ * preserve the flexibility to kill some other transaction than the
+ * one detecting the deadlock.)
+ *
+ * RemoveFromWaitQueue sets MyProc->waitStatus to
+ * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we
+ * return from the signal handler.
+ */
+ Assert(MyProc->waitLock != NULL);
+ RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));
+
+ /*
+ * We're done here. Transaction abort caused by the error that
+ * ProcSleep will raise will cause any other locks we hold to be
+ * released, thus allowing other processes to wake up; we don't need
+ * to do that here. NOTE: an exception is that releasing locks we
+ * hold doesn't consider the possibility of waiters that were blocked
+ * behind us on the lock we just failed to get, and might now be
+ * wakable because we're not in front of them anymore. However,
+ * RemoveFromWaitQueue took care of waking up any such processes.
+ */
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+check_done:
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+}
+
+/*
+ * CheckDeadLockAlert - Handle the expiry of deadlock_timeout.
+ *
+ * NB: Runs inside a signal handler, be careful.
+ */
+void
+CheckDeadLockAlert(void)
+{
+ int save_errno = errno;
+
+ got_deadlock_timeout = true;
+
+ /*
+ * Have to set the latch again, even if handle_sig_alarm already did. Back
+ * then got_deadlock_timeout wasn't yet set... It's unlikely that this
+ * ever would be a problem, but setting a set latch again is cheap.
+ *
+ * Note that, when this function runs inside procsignal_sigusr1_handler(),
+ * the handler function sets the latch again after the latch is set here.
+ */
+ SetLatch(MyLatch);
+ errno = save_errno;
+}
+
+/*
+ * ProcWaitForSignal - wait for a signal from another backend.
+ *
+ * As this uses the generic process latch the caller has to be robust against
+ * unrelated wakeups: Always check that the desired state has occurred, and
+ * wait again if not.
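+ *
+ * A typical caller therefore loops, roughly like this (an illustrative
+ * sketch only; the condition and wait event are the caller's own, not
+ * anything defined in this file):
+ *
+ *     while (!desired_state_reached())
+ *         ProcWaitForSignal(WAIT_EVENT_SOMETHING);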
+ */
+void
+ProcWaitForSignal(uint32 wait_event_info)
+{
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ wait_event_info);
+ ResetLatch(MyLatch);
+ CHECK_FOR_INTERRUPTS();
+}
+
+/*
+ * ProcSendSignal - send a signal to a backend identified by PID
+ */
+void
+ProcSendSignal(int pid)
+{
+ PGPROC *proc = NULL;
+
+ if (RecoveryInProgress())
+ {
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * Check to see whether it is the Startup process we wish to signal.
+ * This call is made by the buffer manager when it wishes to wake up a
+ * process that has been waiting for a pin so that it can obtain a
+ * cleanup lock using LockBufferForCleanup(). Startup is not a normal
+ * backend, so BackendPidGetProc() will not return any pid at all. So
+ * we remember the information for this special case.
+ */
+ if (pid == ProcGlobal->startupProcPid)
+ proc = ProcGlobal->startupProc;
+
+ SpinLockRelease(ProcStructLock);
+ }
+
+ if (proc == NULL)
+ proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ SetLatch(&proc->procLatch);
+ }
+}
+
+/*
+ * BecomeLockGroupLeader - designate process as lock group leader
+ *
+ * Once this function has returned, other processes can join the lock group
+ * by calling BecomeLockGroupMember.
+ */
+void
+BecomeLockGroupLeader(void)
+{
+ LWLock *leader_lwlock;
+
+ /* If we already did it, we don't need to do it again. */
+ if (MyProc->lockGroupLeader == MyProc)
+ return;
+
+ /* We had better not be a follower. */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* Create single-member group, containing only ourselves. */
+ leader_lwlock = LockHashPartitionLockByProc(MyProc);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ MyProc->lockGroupLeader = MyProc;
+ dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink);
+ LWLockRelease(leader_lwlock);
+}
+
+/*
+ * BecomeLockGroupMember - designate process as lock group member
+ *
+ * This is pretty straightforward except for the possibility that the leader
+ * whose group we're trying to join might exit before we manage to do so;
+ * and the PGPROC might get recycled for an unrelated process. To avoid
+ * that, we require the caller to pass the PID of the intended PGPROC as
+ * an interlock. Returns true if we successfully join the intended lock
+ * group, and false if not.
+ */
+bool
+BecomeLockGroupMember(PGPROC *leader, int pid)
+{
+ LWLock *leader_lwlock;
+ bool ok = false;
+
+ /* Group leader can't become member of group */
+ Assert(MyProc != leader);
+
+ /* Can't already be a member of a group */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* PID must be valid. */
+ Assert(pid != 0);
+
+ /*
+ * Get lock protecting the group fields. Note LockHashPartitionLockByProc
+ * accesses leader->pgprocno in a PGPROC that might be free. This is safe
+ * because all PGPROCs' pgprocno fields are set during shared memory
+ * initialization and never change thereafter; so we will acquire the
+ * correct lock even if the leader PGPROC is in the process of being recycled.
+ */
+ leader_lwlock = LockHashPartitionLockByProc(leader);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+
+ /* Is this the leader we're looking for? */
+ if (leader->pid == pid && leader->lockGroupLeader == leader)
+ {
+ /* OK, join the group */
+ ok = true;
+ MyProc->lockGroupLeader = leader;
+ dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink);
+ }
+ LWLockRelease(leader_lwlock);
+
+ return ok;
+}
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index 0000000..2dc2d67
--- /dev/null
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -0,0 +1,377 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.c
+ * Hardware-dependent implementation of spinlocks.
+ *
+ * When waiting for a contended spinlock we loop tightly for a while, then
+ * delay using pg_usleep() and try again. Preferably, "a while" should be a
+ * small multiple of the maximum time we expect a spinlock to be held. 100
+ * iterations seems about right as an initial guess. However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure. This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again. The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock. (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.) But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries). With the given settings, this will usually take 2 or
+ * so minutes. It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/s_lock.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <time.h>
+#include <unistd.h>
+
+#include "port/atomics.h"
+#include "storage/s_lock.h"
+
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS 1000
+#define MIN_DELAY_USEC 1000L
+#define MAX_DELAY_USEC 1000000L
+
+
+slock_t dummy_spinlock;
+
+static int spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+
+
+/*
+ * s_lock_stuck() - complain about a stuck spinlock
+ */
+static void
+s_lock_stuck(const char *file, int line, const char *func)
+{
+ if (!func)
+ func = "(unknown)";
+#if defined(S_LOCK_TEST)
+ fprintf(stderr,
+ "\nStuck spinlock detected at %s, %s:%d.\n",
+ func, file, line);
+ exit(1);
+#else
+ elog(PANIC, "stuck spinlock detected at %s, %s:%d",
+ func, file, line);
+#endif
+}
+
+/*
+ * s_lock(lock) - platform-independent portion of waiting for a spinlock.
+ */
+int
+s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
+{
+ SpinDelayStatus delayStatus;
+
+ init_spin_delay(&delayStatus, file, line, func);
+
+ while (TAS_SPIN(lock))
+ {
+ perform_spin_delay(&delayStatus);
+ }
+
+ finish_spin_delay(&delayStatus);
+
+ return delayStatus.delays;
+}
+
+#ifdef USE_DEFAULT_S_UNLOCK
+void
+s_unlock(volatile slock_t *lock)
+{
+#ifdef TAS_ACTIVE_WORD
+ /* HP's PA-RISC */
+ *TAS_ACTIVE_WORD(lock) = -1;
+#else
+ *lock = 0;
+#endif
+}
+#endif
+
+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+ /* CPU-specific delay each time through the loop */
+ SPIN_DELAY();
+
+ /* Block the process every spins_per_delay tries */
+ if (++(status->spins) >= spins_per_delay)
+ {
+ if (++(status->delays) > NUM_DELAYS)
+ s_lock_stuck(status->file, status->line, status->func);
+
+ if (status->cur_delay == 0) /* first time to delay? */
+ status->cur_delay = MIN_DELAY_USEC;
+
+ pg_usleep(status->cur_delay);
+
+#if defined(S_LOCK_TEST)
+ fprintf(stdout, "*");
+ fflush(stdout);
+#endif
+
+ /* increase delay by a random fraction between 1X and 2X */
+ status->cur_delay += (int) (status->cur_delay *
+ ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5);
+ /* wrap back to minimum delay when max is exceeded */
+ if (status->cur_delay > MAX_DELAY_USEC)
+ status->cur_delay = MIN_DELAY_USEC;
+
+ status->spins = 0;
+ }
+}
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor. If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't. It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value. That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+ if (status->cur_delay == 0)
+ {
+ /* we never had to delay */
+ if (spins_per_delay < MAX_SPINS_PER_DELAY)
+ spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+ }
+ else
+ {
+ if (spins_per_delay > MIN_SPINS_PER_DELAY)
+ spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+ }
+}
+
+/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+ spins_per_delay = shared_spins_per_delay;
+}
+
+/*
+ * Update shared estimate of spins_per_delay during backend exit.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+ /*
+ * We use an exponential moving average with a relatively slow adaptation
+ * rate, so that noise in any one backend's result won't affect the shared
+ * value too much. As long as both inputs are within the allowed range,
+ * the result must be too, so we need not worry about clamping the result.
+ *
+ * We deliberately truncate rather than rounding; this is so that single
+ * adjustments inside a backend can affect the shared estimate (see the
+ * asymmetric adjustment rules above).
+ */
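+ /*
+ * Illustrative example (hypothetical numbers, not taken from anywhere in
+ * the source): with shared_spins_per_delay = 500 and a local
+ * spins_per_delay of 910, the new shared estimate is
+ * (500 * 15 + 910) / 16, which truncates to 525.
+ */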
+ return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
+}
+
+
+/*
+ * Various TAS implementations that cannot live in s_lock.h as no inline
+ * definition exists (yet).
+ * In the future, get rid of tas.[cso] and fold it into this file.
+ *
+ * If you change something here, you will likely need to modify s_lock.h too,
+ * because the definitions for these are split between this file and s_lock.h.
+ */
+
+
+#ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */
+
+
+#if defined(__GNUC__)
+
+/*
+ * All the gcc flavors that are not inlined
+ */
+
+
+/*
+ * Note: all the if-tests here probably ought to be testing gcc version
+ * rather than platform, but I don't have adequate info to know what to
+ * write. Ideally we'd flush all this in favor of the inline version.
+ */
+#if defined(__m68k__) && !defined(__linux__)
+/* really means: extern int tas(slock_t *lock); */
+static void
+tas_dummy()
+{
+ __asm__ __volatile__(
+#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(__ELF__)
+/* no underscore for label and % for registers */
+ "\
+.global tas \n\
+tas: \n\
+ movel %sp@(0x4),%a0 \n\
+ tas %a0@ \n\
+ beq _success \n\
+ moveq #-128,%d0 \n\
+ rts \n\
+_success: \n\
+ moveq #0,%d0 \n\
+ rts \n"
+#else
+ "\
+.global _tas \n\
+_tas: \n\
+ movel sp@(0x4),a0 \n\
+ tas a0@ \n\
+ beq _success \n\
+ moveq #-128,d0 \n\
+ rts \n\
+_success: \n\
+ moveq #0,d0 \n\
+ rts \n"
+#endif /* (__NetBSD__ || __OpenBSD__) && __ELF__ */
+ );
+}
+#endif /* __m68k__ && !__linux__ */
+#endif /* defined(__GNUC__) */
+#endif /* HAVE_SPINLOCKS */
+
+
+
+/*****************************************************************************/
+#if defined(S_LOCK_TEST)
+
+/*
+ * test program for verifying a port's spinlock support.
+ */
+
+struct test_lock_struct
+{
+ char pad1;
+ slock_t lock;
+ char pad2;
+};
+
+volatile struct test_lock_struct test_lock;
+
+int
+main()
+{
+ srandom((unsigned int) time(NULL));
+
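+ /*
+ * Surround the lock with pad bytes set to a known value; if any of the
+ * operations below clobber them, the declared slock_t type is the wrong
+ * size for this platform's TAS implementation.
+ */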
+ test_lock.pad1 = test_lock.pad2 = 0x44;
+
+ S_INIT_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not initialized\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+ }
+
+ S_UNLOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not unlocked\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not re-locked\n");
+ return 1;
+ }
+
+ printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS);
+ printf(" exit with a 'stuck spinlock' message\n");
+ printf(" if S_LOCK() and TAS() are working.\n");
+ fflush(stdout);
+
+ s_lock(&test_lock.lock, __FILE__, __LINE__);
+
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+}
+
+#endif /* S_LOCK_TEST */
diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c
new file mode 100644
index 0000000..557672c
--- /dev/null
+++ b/src/backend/storage/lmgr/spin.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.c
+ * Hardware-independent implementation of spinlocks.
+ *
+ *
+ * For machines that have test-and-set (TAS) instructions, s_lock.h/.c
+ * define the spinlock implementation. This file contains only a stub
+ * implementation for spinlocks using PGSemaphores. Unless semaphores
+ * are implemented in a way that doesn't involve a kernel call, this
+ * is too slow to be very useful :-(
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/spin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * No TAS, so spinlocks are implemented as PGSemaphores.
+ */
+
+#ifndef HAVE_ATOMICS
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES)
+#else
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES)
+#endif /* HAVE_ATOMICS */
+
+PGSemaphore *SpinlockSemaArray;
+
+#else /* !HAVE_SPINLOCKS */
+
+#define NUM_EMULATION_SEMAPHORES 0
+
+#endif /* HAVE_SPINLOCKS */
+
+/*
+ * Report the amount of shared memory needed to store semaphores for spinlock
+ * support.
+ */
+Size
+SpinlockSemaSize(void)
+{
+ return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore);
+}
+
+/*
+ * Report number of semaphores needed to support spinlocks.
+ */
+int
+SpinlockSemas(void)
+{
+ return NUM_EMULATION_SEMAPHORES;
+}
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * Initialize spinlock emulation.
+ *
+ * This must be called after PGReserveSemaphores().
+ */
+void
+SpinlockSemaInit(void)
+{
+ PGSemaphore *spinsemas;
+ int nsemas = SpinlockSemas();
+ int i;
+
+ /*
+ * We must use ShmemAllocUnlocked(), since the spinlock protecting
+ * ShmemAlloc() obviously can't be ready yet.
+ */
+ spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize());
+ for (i = 0; i < nsemas; ++i)
+ spinsemas[i] = PGSemaphoreCreate();
+ SpinlockSemaArray = spinsemas;
+}
+
+/*
+ * s_lock.h hardware-spinlock emulation using semaphores
+ *
+ * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to
+ * map multiple spinlocks onto one semaphore because no process should ever
+ * hold more than one at a time. We just need enough semaphores so that we
+ * aren't adding too much extra contention from that.
+ *
+ * There is one exception to the restriction of only holding one spinlock at a
+ * time, which is that it's ok if emulated atomic operations are nested inside
+ * spinlocks. To avoid the danger of spinlocks and atomics using the same sema,
+ * we make sure "normal" spinlocks and atomics backed by spinlocks use
+ * distinct semaphores (see the nested argument to s_init_lock_sema).
+ *
+ * slock_t is just an int for this implementation; it holds the spinlock
+ * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0
+ * is not a valid value, so that testing with this code can help find
+ * failures to initialize spinlocks.
+ */
+
+static inline void
+s_check_valid(int lockndx)
+{
+ if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES))
+ elog(ERROR, "invalid spinlock number: %d", lockndx);
+}
+
+void
+s_init_lock_sema(volatile slock_t *lock, bool nested)
+{
+ static uint32 counter = 0;
+ uint32 offset;
+ uint32 sema_total;
+ uint32 idx;
+
+ if (nested)
+ {
+ /*
+ * To allow nesting atomics inside spinlocked sections, use a
+ * different spinlock. See comment above.
+ */
+ offset = 1 + NUM_SPINLOCK_SEMAPHORES;
+ sema_total = NUM_ATOMICS_SEMAPHORES;
+ }
+ else
+ {
+ offset = 1;
+ sema_total = NUM_SPINLOCK_SEMAPHORES;
+ }
+
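+ /*
+ * Assign semaphores round-robin within the applicable range; the offset
+ * keeps normal spinlocks and nested (atomics-emulation) locks in
+ * disjoint 1-based ranges, as described above.
+ */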
+ idx = (counter++ % sema_total) + offset;
+
+ /* double check we did things correctly */
+ s_check_valid(idx);
+
+ *lock = idx;
+}
+
+void
+s_unlock_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]);
+}
+
+bool
+s_lock_free_sema(volatile slock_t *lock)
+{
+ /* We don't currently use S_LOCK_FREE anyway */
+ elog(ERROR, "spin.c does not support S_LOCK_FREE()");
+ return false;
+}
+
+int
+tas_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ /* Note that TAS macros return 0 if *success* */
+ return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]);
+}
+
+#endif /* !HAVE_SPINLOCKS */