diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 12:15:05 +0000 |
commit | 46651ce6fe013220ed397add242004d764fc0153 (patch) | |
tree | 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/storage/lmgr | |
parent | Initial commit. (diff) | |
download | postgresql-14-upstream.tar.xz postgresql-14-upstream.zip |
Adding upstream version 14.5.upstream/14.5upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/backend/storage/lmgr/.gitignore | 2 | ||||
-rw-r--r-- | src/backend/storage/lmgr/Makefile | 51 | ||||
-rw-r--r-- | src/backend/storage/lmgr/README | 739 | ||||
-rw-r--r-- | src/backend/storage/lmgr/README-SSI | 646 | ||||
-rw-r--r-- | src/backend/storage/lmgr/README.barrier | 197 | ||||
-rw-r--r-- | src/backend/storage/lmgr/condition_variable.c | 364 | ||||
-rw-r--r-- | src/backend/storage/lmgr/deadlock.c | 1177 | ||||
-rw-r--r-- | src/backend/storage/lmgr/generate-lwlocknames.pl | 71 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lmgr.c | 1196 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lock.c | 4738 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlock.c | 1977 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlocknames.c | 52 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlocknames.h | 50 | ||||
-rw-r--r-- | src/backend/storage/lmgr/lwlocknames.txt | 55 | ||||
-rw-r--r-- | src/backend/storage/lmgr/predicate.c | 5203 | ||||
-rw-r--r-- | src/backend/storage/lmgr/proc.c | 2012 | ||||
-rw-r--r-- | src/backend/storage/lmgr/s_lock.c | 377 | ||||
-rw-r--r-- | src/backend/storage/lmgr/spin.c | 180 |
18 files changed, 19087 insertions, 0 deletions
diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore new file mode 100644 index 0000000..9355cae --- /dev/null +++ b/src/backend/storage/lmgr/.gitignore @@ -0,0 +1,2 @@ +/lwlocknames.c +/lwlocknames.h diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile new file mode 100644 index 0000000..829b792 --- /dev/null +++ b/src/backend/storage/lmgr/Makefile @@ -0,0 +1,51 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/lmgr +# +# IDENTIFICATION +# src/backend/storage/lmgr/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/lmgr +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + condition_variable.o \ + deadlock.o \ + lmgr.o \ + lock.o \ + lwlock.o \ + lwlocknames.o \ + predicate.o \ + proc.o \ + s_lock.o \ + spin.o + +include $(top_srcdir)/src/backend/common.mk + +ifdef TAS +TASPATH = $(top_builddir)/src/backend/port/tas.o +endif + +s_lock_test: s_lock.c $(top_builddir)/src/port/libpgport.a + $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \ + $(TASPATH) -L $(top_builddir)/src/port -lpgport -o s_lock_test + +# see notes in src/backend/parser/Makefile +lwlocknames.c: lwlocknames.h + touch $@ + +lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl + $(PERL) $(srcdir)/generate-lwlocknames.pl $< + +check: s_lock_test + ./s_lock_test + +clean distclean: + rm -f s_lock_test + +maintainer-clean: clean + rm -f lwlocknames.h lwlocknames.c diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README new file mode 100644 index 0000000..c96cc7b --- /dev/null +++ b/src/backend/storage/lmgr/README @@ -0,0 +1,739 @@ +src/backend/storage/lmgr/README + +Locking Overview +================ + +Postgres uses four types of interprocess locks: + +* Spinlocks. These are intended for *very* short-term locks. If a lock +is to be held more than a few dozen instructions, or across any sort of +kernel call (or even a call to a nontrivial subroutine), don't use a +spinlock. Spinlocks are primarily used as infrastructure for lightweight +locks. They are implemented using a hardware atomic-test-and-set +instruction, if available. Waiting processes busy-loop until they can +get the lock. There is no provision for deadlock detection, automatic +release on error, or any other nicety. There is a timeout if the lock +cannot be gotten after a minute or so (which is approximately forever in +comparison to the intended lock hold time, so this is certainly an error +condition). + +* Lightweight locks (LWLocks). These locks are typically used to +interlock access to datastructures in shared memory. LWLocks support +both exclusive and shared lock modes (for read/write and read-only +access to a shared object). There is no provision for deadlock +detection, but the LWLock manager will automatically release held +LWLocks during elog() recovery, so it is safe to raise an error while +holding LWLocks. Obtaining or releasing an LWLock is quite fast (a few +dozen instructions) when there is no contention for the lock. When a +process has to wait for an LWLock, it blocks on a SysV semaphore so as +to not consume CPU time. Waiting processes will be granted the lock in +arrival order. There is no timeout. + +* Regular locks (a/k/a heavyweight locks). The regular lock manager +supports a variety of lock modes with table-driven semantics, and it has +full deadlock detection and automatic release at transaction end. +Regular locks should be used for all user-driven lock requests. + +* SIReadLock predicate locks. See separate README-SSI file for details. + +Acquisition of either a spinlock or a lightweight lock causes query +cancel and die() interrupts to be held off until all such locks are +released. No such restriction exists for regular locks, however. Also +note that we can accept query cancel and die() interrupts while waiting +for a regular lock, but we will not accept them while waiting for +spinlocks or LW locks. It is therefore not a good idea to use LW locks +when the wait time might exceed a few seconds. + +The rest of this README file discusses the regular lock manager in detail. + + +Lock Data Structures +-------------------- + +Lock methods describe the overall locking behavior. Currently there are +two lock methods: DEFAULT and USER. + +Lock modes describe the type of the lock (read/write or shared/exclusive). +In principle, each lock method can have its own set of lock modes with +different conflict rules, but currently DEFAULT and USER methods use +identical lock mode sets. See src/include/storage/lock.h for more details. +(Lock modes are also called lock types in some places in the code and +documentation.) + +There are two main methods for recording locks in shared memory. The primary +mechanism uses two main structures: the per-lockable-object LOCK struct, and +the per-lock-and-requestor PROCLOCK struct. A LOCK object exists for each +lockable object that currently has locks held or requested on it. A PROCLOCK +struct exists for each backend that is holding or requesting lock(s) on each +LOCK object. + +There is also a special "fast path" mechanism which backends may use to +record a limited number of locks with very specific characteristics: they must +use the DEFAULT lockmethod; they must represent a lock on a database relation +(not a shared relation), they must be a "weak" lock which is unlikely to +conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system +must be able to quickly verify that no conflicting locks could possibly be +present. See "Fast Path Locking", below, for more details. + +Each backend also maintains an unshared LOCALLOCK structure for each lockable +object and lock mode that it is currently holding or requesting. The shared +lock structures only allow a single lock grant to be made per lockable +object/lock mode/backend. Internally to a backend, however, the same lock may +be requested and perhaps released multiple times in a transaction, and it can +also be held both transactionally and session-wide. The internal request +counts are held in LOCALLOCK so that the shared data structures need not be +accessed to alter them. + +--------------------------------------------------------------------------- + +The lock manager's LOCK objects contain: + +tag - + The key fields that are used for hashing locks in the shared memory + lock hash table. The contents of the tag essentially define an + individual lockable object. See include/storage/lock.h for details + about the supported types of lockable objects. This is declared as + a separate struct to ensure that we always zero out the correct number + of bytes. It is critical that any alignment-padding bytes the compiler + might insert in the struct be zeroed out, else the hash computation + will be random. (Currently, we are careful to define struct LOCKTAG + so that there are no padding bytes.) + +grantMask - + This bitmask indicates what types of locks are currently held on the + given lockable object. It is used (against the lock table's conflict + table) to determine if a new lock request will conflict with existing + lock types held. Conflicts are determined by bitwise AND operations + between the grantMask and the conflict table entry for the requested + lock type. Bit i of grantMask is 1 if and only if granted[i] > 0. + +waitMask - + This bitmask shows the types of locks being waited for. Bit i of waitMask + is 1 if and only if requested[i] > granted[i]. + +procLocks - + This is a shared memory queue of all the PROCLOCK structs associated with + the lock object. Note that both granted and waiting PROCLOCKs are in this + list (indeed, the same PROCLOCK might have some already-granted locks and + be waiting for more!). + +waitProcs - + This is a shared memory queue of all PGPROC structures corresponding to + backends that are waiting (sleeping) until another backend releases this + lock. The process structure holds the information needed to determine + if it should be woken up when the lock is released. + +nRequested - + Keeps a count of how many times this lock has been attempted to be + acquired. The count includes attempts by processes which were put + to sleep due to conflicts. It also counts the same backend twice + if, for example, a backend process first acquires a read and then + acquires a write. (But multiple acquisitions of the same lock/lock mode + within a backend are not multiply counted here; they are recorded + only in the backend's LOCALLOCK structure.) + +requested - + Keeps a count of how many locks of each type have been attempted. Only + elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock + type defined constants. Summing the values of requested[] should come out + equal to nRequested. + +nGranted - + Keeps count of how many times this lock has been successfully acquired. + This count does not include attempts that are waiting due to conflicts. + Otherwise the counting rules are the same as for nRequested. + +granted - + Keeps count of how many locks of each type are currently held. Once again + only elements 1 through MAX_LOCKMODES-1 are used (0 is not). Also, like + requested[], summing the values of granted[] should total to the value + of nGranted. + +We should always have 0 <= nGranted <= nRequested, and +0 <= granted[i] <= requested[i] for each i. When all the request counts +go to zero, the LOCK object is no longer needed and can be freed. + +--------------------------------------------------------------------------- + +The lock manager's PROCLOCK objects contain: + +tag - + The key fields that are used for hashing entries in the shared memory + PROCLOCK hash table. This is declared as a separate struct to ensure that + we always zero out the correct number of bytes. It is critical that any + alignment-padding bytes the compiler might insert in the struct be zeroed + out, else the hash computation will be random. (Currently, we are careful + to define struct PROCLOCKTAG so that there are no padding bytes.) + + tag.myLock + Pointer to the shared LOCK object this PROCLOCK is for. + + tag.myProc + Pointer to the PGPROC of backend process that owns this PROCLOCK. + + Note: it's OK to use pointers here because a PROCLOCK never outlives + either its lock or its proc. The tag is therefore unique for as long + as it needs to be, even though the same tag values might mean something + else at other times. + +holdMask - + A bitmask for the lock modes successfully acquired by this PROCLOCK. + This should be a subset of the LOCK object's grantMask, and also a + subset of the PGPROC object's heldLocks mask (if the PGPROC is + currently waiting for another lock mode on this lock). + +releaseMask - + A bitmask for the lock modes due to be released during LockReleaseAll. + This must be a subset of the holdMask. Note that it is modified without + taking the partition LWLock, and therefore it is unsafe for any + backend except the one owning the PROCLOCK to examine/change it. + +lockLink - + List link for shared memory queue of all the PROCLOCK objects for the + same LOCK. + +procLink - + List link for shared memory queue of all the PROCLOCK objects for the + same backend. + +--------------------------------------------------------------------------- + + +Lock Manager Internal Locking +----------------------------- + +Before PostgreSQL 8.2, all of the shared-memory data structures used by +the lock manager were protected by a single LWLock, the LockMgrLock; +any operation involving these data structures had to exclusively lock +LockMgrLock. Not too surprisingly, this became a contention bottleneck. +To reduce contention, the lock manager's data structures have been split +into multiple "partitions", each protected by an independent LWLock. +Most operations only need to lock the single partition they are working in. +Here are the details: + +* Each possible lock is assigned to one partition according to a hash of +its LOCKTAG value. The partition's LWLock is considered to protect all the +LOCK objects of that partition as well as their subsidiary PROCLOCKs. + +* The shared-memory hash tables for LOCKs and PROCLOCKs are organized +so that different partitions use different hash chains, and thus there +is no conflict in working with objects in different partitions. This +is supported directly by dynahash.c's "partitioned table" mechanism +for the LOCK table: we need only ensure that the partition number is +taken from the low-order bits of the dynahash hash value for the LOCKTAG. +To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash +value has the same low-order bits as its associated LOCK. This requires +a specialized hash function (see proclock_hash). + +* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it. +This has now been split into per-partition lists, so that access to a +particular PROCLOCK list can be protected by the associated partition's +LWLock. (This rule allows one backend to manipulate another backend's +PROCLOCK lists, which was not originally necessary but is now required in +connection with fast-path locking; see below.) + +* The other lock-related fields of a PGPROC are only interesting when +the PGPROC is waiting for a lock, so we consider that they are protected +by the partition LWLock of the awaited lock. + +For normal lock acquisition and release, it is sufficient to lock the +partition containing the desired lock. Deadlock checking needs to touch +multiple partitions in general; for simplicity, we just make it lock all +the partitions in partition-number order. (To prevent LWLock deadlock, +we establish the rule that any backend needing to lock more than one +partition at once must lock them in partition-number order.) It's +possible that deadlock checking could be done without touching every +partition in typical cases, but since in a properly functioning system +deadlock checking should not occur often enough to be performance-critical, +trying to make this work does not seem a productive use of effort. + +A backend's internal LOCALLOCK hash table is not partitioned. We do store +a copy of the locktag hash code in LOCALLOCK table entries, from which the +partition number can be computed, but this is a straight speed-for-space +tradeoff: we could instead recalculate the partition number from the LOCKTAG +when needed. + + +Fast Path Locking +----------------- + +Fast path locking is a special purpose mechanism designed to reduce the +overhead of taking and releasing certain types of locks which are taken +and released very frequently but rarely conflict. Currently, this includes +two categories of locks: + +(1) Weak relation locks. SELECT, INSERT, UPDATE, and DELETE must acquire a +lock on every relation they operate on, as well as various system catalogs +that can be used internally. Many DML operations can proceed in parallel +against the same table at the same time; only DDL operations such as +CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE +-- will create lock conflicts with the "weak" locks (AccessShareLock, +RowShareLock, RowExclusiveLock) acquired by DML operations. + +(2) VXID locks. Every transaction takes a lock on its own virtual +transaction ID. Currently, the only operations that wait for these locks +are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict), +so most VXID locks are taken and released by the owner without anyone else +needing to care. + +The primary locking mechanism does not cope well with this workload. Even +though the lock manager locks are partitioned, the locktag for any given +relation still falls in one, and only one, partition. Thus, if many short +queries are accessing the same relation, the lock manager partition lock for +that partition becomes a contention bottleneck. This effect is measurable +even on 2-core servers, and becomes very pronounced as core count increases. + +To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is +permitted to record a limited number of locks on unshared relations in an +array within its PGPROC structure, rather than using the primary lock table. +This mechanism can only be used when the locker can verify that no conflicting +locks exist at the time of taking the lock. + +A key point of this algorithm is that it must be possible to verify the +absence of possibly conflicting locks without fighting over a shared LWLock or +spinlock. Otherwise, this effort would simply move the contention bottleneck +from one place to another. We accomplish this using an array of 1024 integer +counters, which are in effect a 1024-way partitioning of the lock space. +Each counter records the number of "strong" locks (that is, ShareLock, +ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared +relations that fall into that partition. When this counter is non-zero, the +fast path mechanism may not be used to take new relation locks within that +partition. A strong locker bumps the counter and then scans each per-backend +array for matching fast-path locks; any which are found must be transferred to +the primary lock table before attempting to acquire the lock, to ensure proper +lock conflict and deadlock detection. + +On an SMP system, we must guarantee proper memory synchronization. Here we +rely on the fact that LWLock acquisition acts as a memory sequence point: if +A performs a store, A and B both acquire an LWLock in either order, and B +then performs a load on the same memory location, it is guaranteed to see +A's store. In this case, each backend's fast-path lock queue is protected +by an LWLock. A backend wishing to acquire a fast-path lock grabs this +LWLock before examining FastPathStrongRelationLocks to check for the presence +of a conflicting strong lock. And the backend attempting to acquire a strong +lock, because it must transfer any matching weak locks taken via the fast-path +mechanism to the shared lock table, will acquire every LWLock protecting a +backend fast-path queue in turn. So, if we examine +FastPathStrongRelationLocks and see a zero, then either the value is truly +zero, or if it is a stale value, the strong locker has yet to acquire the +per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock) +and will notice any weak lock we take when it does. + +Fast-path VXID locks do not use the FastPathStrongRelationLocks table. The +first lock taken on a VXID is always the ExclusiveLock taken by its owner. +Any subsequent lockers are share lockers waiting for the VXID to terminate. +Indeed, the only reason VXID locks use the lock manager at all (rather than +waiting for the VXID to terminate via some other method) is for deadlock +detection. Thus, the initial VXID lock can *always* be taken via the fast +path without checking for conflicts. Any subsequent locker must check +whether the lock has been transferred to the main lock table, and if not, +do so. The backend owning the VXID must be careful to clean up any entry +made in the main lock table at end of transaction. + +Deadlock detection does not need to examine the fast-path data structures, +because any lock that could possibly be involved in a deadlock must have +been transferred to the main tables beforehand. + + +The Deadlock Detection Algorithm +-------------------------------- + +Since we allow user transactions to request locks in any order, deadlock +is possible. We use a deadlock detection/breaking algorithm that is +fairly standard in essence, but there are many special considerations +needed to deal with Postgres' generalized locking model. + +A key design consideration is that we want to make routine operations +(lock grant and release) run quickly when there is no deadlock, and +avoid the overhead of deadlock handling as much as possible. We do this +using an "optimistic waiting" approach: if a process cannot acquire the +lock it wants immediately, it goes to sleep without any deadlock check. +But it also sets a delay timer, with a delay of DeadlockTimeout +milliseconds (typically set to one second). If the delay expires before +the process is granted the lock it wants, it runs the deadlock +detection/breaking code. Normally this code will determine that there is +no deadlock condition, and then the process will go back to sleep and +wait quietly until it is granted the lock. But if a deadlock condition +does exist, it will be resolved, usually by aborting the detecting +process' transaction. In this way, we avoid deadlock handling overhead +whenever the wait time for a lock is less than DeadlockTimeout, while +not imposing an unreasonable delay of detection when there is an error. + +Lock acquisition (routines LockAcquire and ProcSleep) follows these rules: + +1. A lock request is granted immediately if it does not conflict with +any existing or waiting lock request, or if the process already holds an +instance of the same lock type (eg, there's no penalty to acquire a read +lock twice). Note that a process never conflicts with itself, eg one +can obtain read lock when one already holds exclusive lock. + +2. Otherwise the process joins the lock's wait queue. Normally it will +be added to the end of the queue, but there is an exception: if the +process already holds locks on this same lockable object that conflict +with the request of any pending waiter, then the process will be +inserted in the wait queue just ahead of the first such waiter. (If we +did not make this check, the deadlock detection code would adjust the +queue order to resolve the conflict, but it's relatively cheap to make +the check in ProcSleep and avoid a deadlock timeout delay in this case.) +Note special case when inserting before the end of the queue: if the +process's request does not conflict with any existing lock nor any +waiting request before its insertion point, then go ahead and grant the +lock without waiting. + +When a lock is released, the lock release routine (ProcLockWakeup) scans +the lock object's wait queue. Each waiter is awoken if (a) its request +does not conflict with already-granted locks, and (b) its request does +not conflict with the requests of prior un-wakable waiters. Rule (b) +ensures that conflicting requests are granted in order of arrival. There +are cases where a later waiter must be allowed to go in front of +conflicting earlier waiters to avoid deadlock, but it is not +ProcLockWakeup's responsibility to recognize these cases; instead, the +deadlock detection code will re-order the wait queue when necessary. + +To perform deadlock checking, we use the standard method of viewing the +various processes as nodes in a directed graph (the waits-for graph or +WFG). There is a graph edge leading from process A to process B if A +waits for B, ie, A is waiting for some lock and B holds a conflicting +lock. There is a deadlock condition if and only if the WFG contains a +cycle. We detect cycles by searching outward along waits-for edges to +see if we return to our starting point. There are three possible +outcomes: + +1. All outgoing paths terminate at a running process (which has no +outgoing edge). + +2. A deadlock is detected by looping back to the start point. We +resolve such a deadlock by canceling the start point's lock request and +reporting an error in that transaction, which normally leads to +transaction abort and release of that transaction's held locks. Note +that it's sufficient to cancel one request to remove the cycle; we don't +need to kill all the transactions involved. + +3. Some path(s) loop back to a node other than the start point. This +indicates a deadlock, but one that does not involve our starting +process. We ignore this condition on the grounds that resolving such a +deadlock is the responsibility of the processes involved --- killing our +start-point process would not resolve the deadlock. So, cases 1 and 3 +both report "no deadlock". + +Postgres' situation is a little more complex than the standard discussion +of deadlock detection, for two reasons: + +1. A process can be waiting for more than one other process, since there +might be multiple PROCLOCKs of (non-conflicting) lock types that all +conflict with the waiter's request. This creates no real difficulty +however; we simply need to be prepared to trace more than one outgoing +edge. + +2. If a process A is behind a process B in some lock's wait queue, and +their requested locks conflict, then we must say that A waits for B, since +ProcLockWakeup will never awaken A before B. This creates additional +edges in the WFG. We call these "soft" edges, as opposed to the "hard" +edges induced by locks already held. Note that if B already holds any +locks conflicting with A's request, then their relationship is a hard edge +not a soft edge. + +A "soft" block, or wait-priority block, has the same potential for +inducing deadlock as a hard block. However, we may be able to resolve +a soft block without aborting the transactions involved: we can instead +rearrange the order of the wait queue. This rearrangement reverses the +direction of the soft edge between two processes with conflicting requests +whose queue order is reversed. If we can find a rearrangement that +eliminates a cycle without creating new ones, then we can avoid an abort. +Checking for such possible rearrangements is the trickiest part of the +algorithm. + +The workhorse of the deadlock detector is a routine FindLockCycle() which +is given a starting point process (which must be a waiting process). +It recursively scans outward across waits-for edges as discussed above. +If it finds no cycle involving the start point, it returns "false". +(As discussed above, we can ignore cycles not involving the start point.) +When such a cycle is found, FindLockCycle() returns "true", and as it +unwinds it also builds a list of any "soft" edges involved in the cycle. +If the resulting list is empty then there is a hard deadlock and the +configuration cannot succeed. However, if the list is not empty, then +reversing any one of the listed edges through wait-queue rearrangement +will eliminate that cycle. Since such a reversal might create cycles +elsewhere, we may need to try every possibility. Therefore, we need to +be able to invoke FindLockCycle() on hypothetical configurations (wait +orders) as well as the current real order. + +The easiest way to handle this seems to be to have a lookaside table that +shows the proposed new queue order for each wait queue that we are +considering rearranging. This table is checked by FindLockCycle, and it +believes the proposed queue order rather than the real order for each lock +that has an entry in the lookaside table. + +We build a proposed new queue order by doing a "topological sort" of the +existing entries. Each soft edge that we are currently considering +reversing creates a property of the partial order that the topological sort +has to enforce. We must use a sort method that preserves the input +ordering as much as possible, so as not to gratuitously break arrival +order for processes not involved in a deadlock. (This is not true of the +tsort method shown in Knuth, for example, but it's easily done by a simple +doubly-nested-loop method that emits the first legal candidate at each +step. Fortunately, we don't need a highly efficient sort algorithm, since +the number of partial order constraints is not likely to be large.) Note +that failure of the topological sort tells us we have conflicting ordering +constraints, and therefore that the last-added soft edge reversal +conflicts with a prior edge reversal. We need to detect this case to +avoid an infinite loop in the case where no possible rearrangement will +work: otherwise, we might try a reversal, find that it still leads to +a cycle, then try to un-reverse the reversal while trying to get rid of +that cycle, etc etc. Topological sort failure tells us the un-reversal +is not a legitimate move in this context. + +So, the basic step in our rearrangement method is to take a list of +soft edges in a cycle (as returned by FindLockCycle()) and successively +try the reversal of each one as a topological-sort constraint added to +whatever constraints we are already considering. We recursively search +through all such sets of constraints to see if any one eliminates all +the deadlock cycles at once. Although this might seem impossibly +inefficient, it shouldn't be a big problem in practice, because there +will normally be very few, and not very large, deadlock cycles --- if +any at all. So the combinatorial inefficiency isn't going to hurt us. +Besides, it's better to spend some time to guarantee that we've checked +all possible escape routes than to abort a transaction when we didn't +really have to. + +Each edge reversal constraint can be viewed as requesting that the waiting +process A be moved to before the blocking process B in the wait queue they +are both in. This action will reverse the desired soft edge, as well as +any other soft edges between A and other processes it is advanced over. +No other edges will be affected (note this is actually a constraint on our +topological sort method to not re-order the queue more than necessary.) +Therefore, we can be sure we have not created any new deadlock cycles if +neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle. Given +the above-defined behavior of FindLockCycle, each of these searches is +necessary as well as sufficient, since FindLockCycle starting at the +original start point will not complain about cycles that include A or B +but not the original start point. + +In short then, a proposed rearrangement of the wait queue(s) is determined +by one or more broken soft edges A->B, fully specified by the output of +topological sorts of each wait queue involved, and then tested by invoking +FindLockCycle() starting at the original start point as well as each of +the mentioned processes (A's and B's). If none of the tests detect a +cycle, then we have a valid configuration and can implement it by +reordering the wait queues per the sort outputs (and then applying +ProcLockWakeup on each reordered queue, in case a waiter has become wakable). +If any test detects a soft cycle, we can try to resolve it by adding each +soft link in that cycle, in turn, to the proposed rearrangement list. +This is repeated recursively until we either find a workable rearrangement +or determine that none exists. In the latter case, the outer level +resolves the deadlock by aborting the original start-point transaction. + +The particular order in which rearrangements are tried depends on the +order FindLockCycle() happens to scan in, so if there are multiple +workable rearrangements of the wait queues, then it is unspecified which +one will be chosen. What's more important is that we guarantee to try +every queue rearrangement that could lead to success. (For example, +if we have A before B before C and the needed order constraints are +C before A and B before C, we would first discover that A before C +doesn't work and try the rearrangement C before A before B. This would +eventually lead to the discovery of the additional constraint B before C.) + +Got that? + +Miscellaneous Notes +------------------- + +1. It is easily proven that no deadlock will be missed due to our +asynchronous invocation of deadlock checking. A deadlock cycle in the WFG +is formed when the last edge in the cycle is added; therefore the last +process in the cycle to wait (the one from which that edge is outgoing) is +certain to detect and resolve the cycle when it later runs CheckDeadLock. +This holds even if that edge addition created multiple cycles; the process +may indeed abort without ever noticing those additional cycles, but we +don't particularly care. The only other possible creation of deadlocks is +during deadlock resolution's rearrangement of wait queues, and we already +saw that that algorithm will prove that it creates no new deadlocks before +it attempts to actually execute any rearrangement. + +2. It is not certain that a deadlock will be resolved by aborting the +last-to-wait process. If earlier waiters in the cycle have not yet run +CheckDeadLock, then the first one to do so will be the victim. + +3. No live (wakable) process can be missed by ProcLockWakeup, since it +examines every member of the wait queue (this was not true in the 7.0 +implementation, BTW). Therefore, if ProcLockWakeup is always invoked +after a lock is released or a wait queue is rearranged, there can be no +failure to wake a wakable process. One should also note that +LockErrorCleanup (abort a waiter due to outside factors) must run +ProcLockWakeup, in case the canceled waiter was soft-blocking other +waiters. + +4. We can minimize excess rearrangement-trial work by being careful to +scan the wait queue from the front when looking for soft edges. For +example, if we have queue order A,B,C and C has deadlock conflicts with +both A and B, we want to generate the "C before A" constraint first, +rather than wasting time with "C before B", which won't move C far +enough up. So we look for soft edges outgoing from C starting at the +front of the wait queue. + +5. The working data structures needed by the deadlock detection code can +be limited to numbers of entries computed from MaxBackends. Therefore, +we can allocate the worst-case space needed during backend startup. This +seems a safer approach than trying to allocate workspace on the fly; we +don't want to risk having the deadlock detector run out of memory, else +we really have no guarantees at all that deadlock will be detected. + +6. We abuse the deadlock detector to implement autovacuum cancellation. +When we run the detector and we find that there's an autovacuum worker +involved in the waits-for graph, we store a pointer to its PGPROC, and +return a special return code (unless a hard deadlock has been detected). +The caller can then send a cancellation signal. This implements the +principle that autovacuum has a low locking priority (eg it must not block +DDL on the table). + +Group Locking +------------- + +As if all of that weren't already complicated enough, PostgreSQL now supports +parallelism (see src/backend/access/transam/README.parallel), which means that +we might need to resolve deadlocks that occur between gangs of related +processes rather than individual processes. This doesn't change the basic +deadlock detection algorithm very much, but it makes the bookkeeping more +complicated. + +We choose to regard locks held by processes in the same parallel group as +non-conflicting with the exception of relation extension and page locks. This +means that two processes in a parallel group can hold a self-exclusive lock on +the same relation at the same time, or one process can acquire an AccessShareLock +while the other already holds AccessExclusiveLock. This might seem dangerous and +could be in some cases (more on that below), but if we didn't do this then +parallel query would be extremely prone to self-deadlock. For example, a +parallel query against a relation on which the leader already had +AccessExclusiveLock would hang, because the workers would try to lock the same +relation and be blocked by the leader; yet the leader can't finish until it +receives completion indications from all workers. An undetected deadlock +results. This is far from the only scenario where such a problem happens. The +same thing will occur if the leader holds only AccessShareLock, the worker +seeks AccessShareLock, but between the time the leader attempts to acquire the +lock and the time the worker attempts to acquire it, some other process queues +up waiting for an AccessExclusiveLock. In this case, too, an indefinite hang +results. + +It might seem that we could predict which locks the workers will attempt to +acquire and ensure before going parallel that those locks would be acquired +successfully. But this is very difficult to make work in a general way. For +example, a parallel worker's portion of the query plan could involve an +SQL-callable function which generates a query dynamically, and that query +might happen to hit a table on which the leader happens to hold +AccessExclusiveLock. By imposing enough restrictions on what workers can do, +we could eventually create a situation where their behavior can be adequately +restricted, but these restrictions would be fairly onerous, and even then, the +system required to decide whether the workers will succeed at acquiring the +necessary locks would be complex and possibly buggy. + +So, instead, we take the approach of deciding that locks within a lock group +do not conflict. This eliminates the possibility of an undetected deadlock, +but also opens up some problem cases: if the leader and worker try to do some +operation at the same time which would ordinarily be prevented by the +heavyweight lock mechanism, undefined behavior might result. In practice, the +dangers are modest. The leader and worker share the same transaction, +snapshot, and combo CID hash, and neither can perform any DDL or, indeed, +write any data at all. Thus, for either to read a table locked exclusively by +the other is safe enough. Problems would occur if the leader initiated +parallelism from a point in the code at which it had some backend-private +state that made table access from another process unsafe, for example after +calling SetReindexProcessing and before calling ResetReindexProcessing, +catastrophe could ensue, because the worker won't have that state. + +To allow parallel inserts and parallel copy, we have ensured that relation +extension and page locks don't participate in group locking which means such +locks can conflict among the same group members. This is required as it is no +safer for two related processes to extend the same relation or perform clean up +in gin indexes at a time than for unrelated processes to do the same. We don't +acquire a heavyweight lock on any other object after relation extension lock +which means such a lock can never participate in the deadlock cycle. After +acquiring page locks, we can acquire relation extension lock but reverse never +happens, so those will also not participate in deadlock. To allow for other +parallel writes like parallel update or parallel delete, we'll either need to +(1) further enhance the deadlock detector to handle those tuple locks in a +different way than other types; or (2) have parallel workers use some other +mutual exclusion method for such cases. Currently, the parallel mode is +strictly read-only, but now we have the infrastructure to allow parallel +inserts and parallel copy. + +Group locking adds three new members to each PGPROC: lockGroupLeader, +lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for +processes not involved in parallel query. When a process wants to cooperate +with parallel workers, it becomes a lock group leader, which means setting +this field to point to its own PGPROC. When a parallel worker starts up, it +points this field at the leader. The lockGroupMembers field is only used in +the leader; it is a list of the member PGPROCs of the lock group (the leader +and all workers). The lockGroupLink field is the list link for this list. + +All three of these fields are considered to be protected by a lock manager +partition lock. The partition lock that protects these fields within a given +lock group is chosen by taking the leader's pgprocno modulo the number of lock +manager partitions. This unusual arrangement has a major advantage: the +deadlock detector can count on the fact that no lockGroupLeader field can +change while the deadlock detector is running, because it knows that it holds +all the lock manager locks. Also, holding this single lock allows safe +manipulation of the lockGroupMembers list for the lock group. + +We need an additional interlock when setting these fields, because a newly +started parallel worker has to try to join the leader's lock group, but it +has no guarantee that the group leader is still alive by the time it gets +started. We try to ensure that the parallel leader dies after all workers +in normal cases, but also that the system could survive relatively intact +if that somehow fails to happen. This is one of the precautions against +such a scenario: the leader relays its PGPROC and also its PID to the +worker, and the worker fails to join the lock group unless the given PGPROC +still has the same PID and is still a lock group leader. We assume that +PIDs are not recycled quickly enough for this interlock to fail. + + +User Locks (Advisory Locks) +--------------------------- + +User locks are handled totally on the application side as long term +cooperative locks which may extend beyond the normal transaction boundaries. +Their purpose is to indicate to an application that someone is `working' +on an item. So it is possible to put an user lock on a tuple's oid, +retrieve the tuple, work on it for an hour and then update it and remove +the lock. While the lock is active other clients can still read and write +the tuple but they can be aware that it has been locked at the application +level by someone. + +User locks and normal locks are completely orthogonal and they don't +interfere with each other. + +User locks can be acquired either at session level or transaction level. +A session-level lock request is not automatically released at transaction +end, but must be explicitly released by the application. (However, any +remaining locks are always released at session end.) Transaction-level +user lock requests behave the same as normal lock requests, in that they +are released at transaction end and do not need explicit unlocking. + +Locking during Hot Standby +-------------------------- + +The Startup process is the only backend that can make changes during +recovery, all other backends are read only. As a result the Startup +process does not acquire locks on relations or objects except when the lock +level is AccessExclusiveLock. + +Regular backends are only allowed to take locks on relations or objects +at RowExclusiveLock or lower. This ensures that they do not conflict with +each other or with the Startup process, unless AccessExclusiveLocks are +requested by the Startup process. + +Deadlocks involving AccessExclusiveLocks are not possible, so we need +not be concerned that a user initiated deadlock can prevent recovery from +progressing. + +AccessExclusiveLocks on the primary node generate WAL records +that are then applied by the Startup process. Locks are released at end +of transaction just as they are in normal processing. These locks are +held by the Startup process, acting as a proxy for the backends that +originally acquired these locks. Again, these locks cannot conflict with +one another, so the Startup process cannot deadlock itself either. + +Although deadlock is not possible, a regular backend's weak lock can +prevent the Startup process from making progress in applying WAL, which is +usually not something that should be tolerated for very long. Mechanisms +exist to forcibly cancel a regular backend's query if it blocks the +Startup process for too long. diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI new file mode 100644 index 0000000..50d2ecc --- /dev/null +++ b/src/backend/storage/lmgr/README-SSI @@ -0,0 +1,646 @@ +src/backend/storage/lmgr/README-SSI + +Serializable Snapshot Isolation (SSI) and Predicate Locking +=========================================================== + +This code is in the lmgr directory because about 90% of it is an +implementation of predicate locking, which is required for SSI, +rather than being directly related to SSI itself. When another use +for predicate locking justifies the effort to tease these two things +apart, this README file should probably be split. + + +Credits +------- + +This feature was developed by Kevin Grittner and Dan R. K. Ports, +with review and suggestions from Joe Conway, Heikki Linnakangas, and +Jeff Davis. It is based on work published in these papers: + + Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008. + Serializable isolation for snapshot databases. + In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD + international conference on Management of data, + pages 729-738, New York, NY, USA. ACM. + http://doi.acm.org/10.1145/1376616.1376690 + + Michael James Cahill. 2009. + Serializable Isolation for Snapshot Databases. + Sydney Digital Theses. + University of Sydney, School of Information Technologies. + http://hdl.handle.net/2123/5353 + + +Overview +-------- + +With true serializable transactions, if you can show that your +transaction will do the right thing if there are no concurrent +transactions, it will do the right thing in any mix of serializable +transactions or be rolled back with a serialization failure. This +feature has been implemented in PostgreSQL using SSI. + + +Serializable and Snapshot Transaction Isolation Levels +------------------------------------------------------ + +Serializable transaction isolation is attractive for shops with +active development by many programmers against a complex schema +because it guarantees data integrity with very little staff time -- +if a transaction can be shown to always do the right thing when it is +run alone (before or after any other transaction), it will always do +the right thing in any mix of concurrent serializable transactions. +Where conflicts with other transactions would result in an +inconsistent state within the database or an inconsistent view of +the data, a serializable transaction will block or roll back to +prevent the anomaly. The SQL standard provides a specific SQLSTATE +for errors generated when a transaction rolls back for this reason, +so that transactions can be retried automatically. + +Before version 9.1, PostgreSQL did not support a full serializable +isolation level. A request for serializable transaction isolation +actually provided snapshot isolation. This has well known anomalies +which can allow data corruption or inconsistent views of the data +during concurrent transactions; although these anomalies only occur +when certain patterns of read-write dependencies exist within a set +of concurrent transactions. Where these patterns exist, the anomalies +can be prevented by introducing conflicts through explicitly +programmed locks or otherwise unnecessary writes to the database. +Snapshot isolation is popular because performance is better than +serializable isolation and the integrity guarantees which it does +provide allow anomalies to be avoided or managed with reasonable +effort in many environments. + + +Serializable Isolation Implementation Strategies +------------------------------------------------ + +Techniques for implementing full serializable isolation have been +published and in use in many database products for decades. The +primary technique which has been used is Strict Two-Phase Locking +(S2PL), which operates by blocking writes against data which has been +read by concurrent transactions and blocking any access (read or +write) against data which has been written by concurrent +transactions. A cycle in a graph of blocking indicates a deadlock, +requiring a rollback. Blocking and deadlocks under S2PL in high +contention workloads can be debilitating, crippling throughput and +response time. + +A new technique for implementing full serializable isolation in an +MVCC database appears in the literature beginning in 2008. This +technique, known as Serializable Snapshot Isolation (SSI) has many of +the advantages of snapshot isolation. In particular, reads don't +block anything and writes don't block reads. Essentially, it runs +snapshot isolation but monitors the read-write conflicts between +transactions to identify dangerous structures in the transaction +graph which indicate that a set of concurrent transactions might +produce an anomaly, and rolls back transactions to ensure that no +anomalies occur. It will produce some false positives (where a +transaction is rolled back even though there would not have been an +anomaly), but will never let an anomaly occur. In the two known +prototype implementations, performance for many workloads (even with +the need to restart transactions which are rolled back) is very close +to snapshot isolation and generally far better than an S2PL +implementation. + + +Apparent Serial Order of Execution +---------------------------------- + +One way to understand when snapshot anomalies can occur, and to +visualize the difference between the serializable implementations +described above, is to consider that among transactions executing at +the serializable transaction isolation level, the results are +required to be consistent with some serial (one-at-a-time) execution +of the transactions [1]. How is that order determined in each? + +In S2PL, each transaction locks any data it accesses. It holds the +locks until committing, preventing other transactions from making +conflicting accesses to the same data in the interim. Some +transactions may have to be rolled back to prevent deadlock. But +successful transactions can always be viewed as having occurred +sequentially, in the order they committed. + +With snapshot isolation, reads never block writes, nor vice versa, so +more concurrency is possible. The order in which transactions appear +to have executed is determined by something more subtle than in S2PL: +read/write dependencies. If a transaction reads data, it appears to +execute after the transaction that wrote the data it is reading. +Similarly, if it updates data, it appears to execute after the +transaction that wrote the previous version. These dependencies, which +we call "wr-dependencies" and "ww-dependencies", are consistent with +the commit order, because the first transaction must have committed +before the second starts. However, there can also be dependencies +between two *concurrent* transactions, i.e. where one was running when +the other acquired its snapshot. These "rw-conflicts" occur when one +transaction attempts to read data which is not visible to it because +the transaction which wrote it (or will later write it) is +concurrent. The reading transaction appears to have executed first, +regardless of the actual sequence of transaction starts or commits, +because it sees a database state prior to that in which the other +transaction leaves it. + +Anomalies occur when a cycle is created in the graph of dependencies: +when a dependency or series of dependencies causes transaction A to +appear to have executed before transaction B, but another series of +dependencies causes B to appear before A. If that's the case, then +the results can't be consistent with any serial execution of the +transactions. + + +SSI Algorithm +------------- + +As of 9.1, serializable transactions in PostgreSQL are implemented using +Serializable Snapshot Isolation (SSI), based on the work of Cahill +et al. Fundamentally, this allows snapshot isolation to run as it +previously did, while monitoring for conditions which could create a +serialization anomaly. + +SSI is based on the observation [2] that each snapshot isolation +anomaly corresponds to a cycle that contains a "dangerous structure" +of two adjacent rw-conflict edges: + + Tin ------> Tpivot ------> Tout + rw rw + +SSI works by watching for this dangerous structure, and rolling +back a transaction when needed to prevent any anomaly. This means it +only needs to track rw-conflicts between concurrent transactions, not +wr- and ww-dependencies. It also means there is a risk of false +positives, because not every dangerous structure is embedded in an +actual cycle. The number of false positives is low in practice, so +this represents an acceptable tradeoff for keeping the detection +overhead low. + +The PostgreSQL implementation uses two additional optimizations: + +* Tout must commit before any other transaction in the cycle + (see proof of Theorem 2.1 of [2]). We only roll back a transaction + if Tout commits before Tpivot and Tin. + +* if Tin is read-only, there can only be an anomaly if Tout committed + before Tin takes its snapshot. This optimization is an original + one. Proof: + + - Because there is a cycle, there must be some transaction T0 that + precedes Tin in the cycle. (T0 might be the same as Tout.) + + - The edge between T0 and Tin can't be a rw-conflict or ww-dependency, + because Tin was read-only, so it must be a wr-dependency. + Those can only occur if T0 committed before Tin took its snapshot, + else Tin would have ignored T0's output. + + - Because Tout must commit before any other transaction in the + cycle, it must commit before T0 commits -- and thus before Tin + starts. + + +PostgreSQL Implementation +------------------------- + + * Since this technique is based on Snapshot Isolation (SI), those +areas in PostgreSQL which don't use SI can't be brought under SSI. +This includes system tables, temporary tables, sequences, hint bit +rewrites, etc. SSI can not eliminate existing anomalies in these +areas. + + * Any transaction which is run at a transaction isolation level +other than SERIALIZABLE will not be affected by SSI. If you want to +enforce business rules through SSI, all transactions should be run at +the SERIALIZABLE transaction isolation level, and that should +probably be set as the default. + + * If all transactions are run at the SERIALIZABLE transaction +isolation level, business rules can be enforced in triggers or +application code without ever having a need to acquire an explicit +lock or to use SELECT FOR SHARE or SELECT FOR UPDATE. + + * Those who want to continue to use snapshot isolation without +the additional protections of SSI (and the associated costs of +enforcing those protections), can use the REPEATABLE READ transaction +isolation level. This level retains its legacy behavior, which +is identical to the old SERIALIZABLE implementation and fully +consistent with the standard's requirements for the REPEATABLE READ +transaction isolation level. + + * Performance under this SSI implementation will be significantly +improved if transactions which don't modify permanent tables are +declared to be READ ONLY before they begin reading data. + + * Performance under SSI will tend to degrade more rapidly with a +large number of active database transactions than under less strict +isolation levels. Limiting the number of active transactions through +use of a connection pool or similar techniques may be necessary to +maintain good performance. + + * Any transaction which must be rolled back to prevent +serialization anomalies will fail with SQLSTATE 40001, which has a +standard meaning of "serialization failure". + + * This SSI implementation makes an effort to choose the +transaction to be canceled such that an immediate retry of the +transaction will not fail due to conflicts with exactly the same +transactions. Pursuant to this goal, no transaction is canceled +until one of the other transactions in the set of conflicts which +could generate an anomaly has successfully committed. This is +conceptually similar to how write conflicts are handled. To fully +implement this guarantee there needs to be a way to roll back the +active transaction for another process with a serialization failure +SQLSTATE, even if it is "idle in transaction". + + +Predicate Locking +----------------- + +Both S2PL and SSI require some form of predicate locking to handle +situations where reads conflict with later inserts or with later +updates which move data into the selected range. PostgreSQL didn't +already have predicate locking, so it needed to be added to support +full serializable transactions under either strategy. Practical +implementations of predicate locking generally involve acquiring +locks against data as it is accessed, using multiple granularities +(tuple, page, table, etc.) with escalation as needed to keep the lock +count to a number which can be tracked within RAM structures. This +approach was used in PostgreSQL. Coarse granularities can cause some +false positive indications of conflict. The number of false positives +can be influenced by plan choice. + + +Implementation overview +----------------------- + +New RAM structures, inspired by those used to track traditional locks +in PostgreSQL, but tailored to the needs of SIREAD predicate locking, +are used. These refer to physical objects actually accessed in the +course of executing the query, to model the predicates through +inference. Anyone interested in this subject should review the +Hellerstein, Stonebraker and Hamilton paper [3], along with the +locking papers referenced from that and the Cahill papers. + +Because the SIREAD locks don't block, traditional locking techniques +have to be modified. Intent locking (locking higher level objects +before locking lower level objects) doesn't work with non-blocking +"locks" (which are, in some respects, more like flags than locks). + +A configurable amount of shared memory is reserved at postmaster +start-up to track predicate locks. This size cannot be changed +without a restart. + +To prevent resource exhaustion, multiple fine-grained locks may +be promoted to a single coarser-grained lock as needed. + +An attempt to acquire an SIREAD lock on a tuple when the same +transaction already holds an SIREAD lock on the page or the relation +will be ignored. Likewise, an attempt to lock a page when the +relation is locked will be ignored, and the acquisition of a coarser +lock will result in the automatic release of all finer-grained locks +it covers. + + +Heap locking +------------ + +Predicate locks will be acquired for the heap based on the following: + + * For a table scan, the entire relation will be locked. + + * Each tuple read which is visible to the reading transaction +will be locked, whether or not it meets selection criteria; except +that there is no need to acquire an SIREAD lock on a tuple when the +transaction already holds a write lock on any tuple representing the +row, since a rw-conflict would also create a ww-dependency which +has more aggressive enforcement and thus will prevent any anomaly. + + * Modifying a heap tuple creates a rw-conflict with any transaction +that holds a SIREAD lock on that tuple, or on the page or relation +that contains it. + + * Inserting a new tuple creates a rw-conflict with any transaction +holding a SIREAD lock on the entire relation. It doesn't conflict with +page-level locks, because page-level locks are only used to aggregate +tuple locks. Unlike index page locks, they don't lock "gaps" on the page. + + +Index AM implementations +------------------------ + +Since predicate locks only exist to detect writes which conflict with +earlier reads, and heap tuple locks are acquired to cover all heap +tuples actually read, including those read through indexes, the index +tuples which were actually scanned are not of interest in themselves; +we only care about their "new neighbors" -- later inserts into the +index which would have been included in the scan had they existed at +the time. Conceptually, we want to lock the gaps between and +surrounding index entries within the scanned range. + +Correctness requires that any insert into an index generates a +rw-conflict with a concurrent serializable transaction if, after that +insert, re-execution of any index scan of the other transaction would +access the heap for a row not accessed during the previous execution. +Note that a non-HOT update which expires an old index entry covered +by the scan and adds a new entry for the modified row's new tuple +need not generate a conflict, although an update which "moves" a row +into the scan must generate a conflict. While correctness allows +false positives, they should be minimized for performance reasons. + +Several optimizations are possible, though not all are implemented yet: + + * An index scan which is just finding the right position for an +index insertion or deletion need not acquire a predicate lock. + + * An index scan which is comparing for equality on the entire key +for a unique index need not acquire a predicate lock as long as a key +is found corresponding to a visible tuple which has not been modified +by another transaction -- there are no "between or around" gaps to +cover. + + * As long as built-in foreign key enforcement continues to use +its current "special tricks" to deal with MVCC issues, predicate +locks should not be needed for scans done by enforcement code. + + * If a search determines that no rows can be found regardless of +index contents because the search conditions are contradictory (e.g., +x = 1 AND x = 2), then no predicate lock is needed. + +Other index AM implementation considerations: + + * For an index AM that doesn't have support for predicate locking, +we just acquire a predicate lock on the whole index for any search. + + * B-tree index searches acquire predicate locks only on the +index *leaf* pages needed to lock the appropriate index range. If, +however, a search discovers that no root page has yet been created, a +predicate lock on the index relation is required. + + * Like a B-tree, GIN searches acquire predicate locks only on the +leaf pages of entry tree. When performing an equality scan, and an +entry has a posting tree, the posting tree root is locked instead, to +lock only that key value. However, fastupdate=on postpones the +insertion of tuples into index structure by temporarily storing them +into pending list. That makes us unable to detect r-w conflicts using +page-level locks. To cope with that, insertions to the pending list +conflict with all scans. + + * GiST searches can determine that there are no matches at any +level of the index, so we acquire predicate lock at each index +level during a GiST search. An index insert at the leaf level can +then be trusted to ripple up to all levels and locations where +conflicting predicate locks may exist. In case there is a page split, +we need to copy predicate lock from the original page to all the new +pages. + + * Hash index searches acquire predicate locks on the primary +page of a bucket. It acquires a lock on both the old and new buckets +for scans that happen concurrently with page splits. During a bucket +split, a predicate lock is copied from the primary page of an old +bucket to the primary page of a new bucket. + + * The effects of page splits, overflows, consolidations, and +removals must be carefully reviewed to ensure that predicate locks +aren't "lost" during those operations, or kept with pages which could +get re-used for different parts of the index. + + +Innovations +----------- + +The PostgreSQL implementation of Serializable Snapshot Isolation +differs from what is described in the cited papers for several +reasons: + + 1. PostgreSQL didn't have any existing predicate locking. It had +to be added from scratch. + + 2. The existing in-memory lock structures were not suitable for +tracking SIREAD locks. + * In PostgreSQL, tuple level locks are not held in RAM for +any length of time; lock information is written to the tuples +involved in the transactions. + * In PostgreSQL, existing lock structures have pointers to +memory which is related to a session. SIREAD locks need to persist +past the end of the originating transaction and even the session +which ran it. + * PostgreSQL needs to be able to tolerate a large number of +transactions executing while one long-running transaction stays open +-- the in-RAM techniques discussed in the papers wouldn't support +that. + + 3. Unlike the database products used for the prototypes described +in the papers, PostgreSQL didn't already have a true serializable +isolation level distinct from snapshot isolation. + + 4. PostgreSQL supports subtransactions -- an issue not mentioned +in the papers. + + 5. PostgreSQL doesn't assign a transaction number to a database +transaction until and unless necessary (normally, when the transaction +attempts to modify data). + + 6. PostgreSQL has pluggable data types with user-definable +operators, as well as pluggable index types, not all of which are +based around data types which support ordering. + + 7. Some possible optimizations became apparent during development +and testing. + +Differences from the implementation described in the papers are +listed below. + + * New structures needed to be created in shared memory to track +the proper information for serializable transactions and their SIREAD +locks. + + * Because PostgreSQL does not have the same concept of an "oldest +transaction ID" for all serializable transactions as assumed in the +Cahill thesis, we track the oldest snapshot xmin among serializable +transactions, and a count of how many active transactions use that +xmin. When the count hits zero we find the new oldest xmin and run a +clean-up based on that. + + * Because reads in a subtransaction may cause that subtransaction +to roll back, thereby affecting what is written by the top level +transaction, predicate locks must survive a subtransaction rollback. +As a consequence, all xid usage in SSI, including predicate locking, +is based on the top level xid. When looking at an xid that comes +from a tuple's xmin or xmax, for example, we always call +SubTransGetTopmostTransaction() before doing much else with it. + + * PostgreSQL does not use "update in place" with a rollback log +for its MVCC implementation. Where possible it uses "HOT" updates on +the same page (if there is room and no indexed value is changed). +For non-HOT updates the old tuple is expired in place and a new tuple +is inserted at a new location. Because of this difference, a tuple +lock in PostgreSQL doesn't automatically lock any other versions of a +row. We don't try to copy or expand a tuple lock to any other +versions of the row, based on the following proof that any additional +serialization failures we would get from that would be false +positives: + + o If transaction T1 reads a row version (thus acquiring a +predicate lock on it) and a second transaction T2 updates that row +version (thus creating a rw-conflict graph edge from T1 to T2), must a +third transaction T3 which re-updates the new version of the row also +have a rw-conflict in from T1 to prevent anomalies? In other words, +does it matter whether we recognize the edge T1 -> T3? + + o If T1 has a conflict in, it certainly doesn't. Adding the +edge T1 -> T3 would create a dangerous structure, but we already had +one from the edge T1 -> T2, so we would have aborted something anyway. +(T2 has already committed, else T3 could not have updated its output; +but we would have aborted either T1 or T1's predecessor(s). Hence +no cycle involving T1 and T3 can survive.) + + o Now let's consider the case where T1 doesn't have a +rw-conflict in. If that's the case, for this edge T1 -> T3 to make a +difference, T3 must have a rw-conflict out that induces a cycle in the +dependency graph, i.e. a conflict out to some transaction preceding T1 +in the graph. (A conflict out to T1 itself would be problematic too, +but that would mean T1 has a conflict in, the case we already +eliminated.) + + o So now we're trying to figure out if there can be an +rw-conflict edge T3 -> T0, where T0 is some transaction that precedes +T1. For T0 to precede T1, there has to be some edge, or sequence of +edges, from T0 to T1. At least the last edge has to be a wr-dependency +or ww-dependency rather than a rw-conflict, because T1 doesn't have a +rw-conflict in. And that gives us enough information about the order +of transactions to see that T3 can't have a rw-conflict to T0: + - T0 committed before T1 started (the wr/ww-dependency implies this) + - T1 started before T2 committed (the T1->T2 rw-conflict implies this) + - T2 committed before T3 started (otherwise, T3 would get aborted + because of an update conflict) + + o That means T0 committed before T3 started, and therefore +there can't be a rw-conflict from T3 to T0. + + o So in all cases, we don't need the T1 -> T3 edge to +recognize cycles. Therefore it's not necessary for T1's SIREAD lock +on the original tuple version to cover later versions as well. + + * Predicate locking in PostgreSQL starts at the tuple level +when possible. Multiple fine-grained locks are promoted to a single +coarser-granularity lock as needed to avoid resource exhaustion. The +amount of memory used for these structures is configurable, to balance +RAM usage against SIREAD lock granularity. + + * Each backend keeps a process-local table of the locks it holds. +To support granularity promotion decisions with low CPU and locking +overhead, this table also includes the coarser covering locks and the +number of finer-granularity locks they cover. + + * Conflicts are identified by looking for predicate locks +when tuples are written, and by looking at the MVCC information when +tuples are read. There is no matching between two RAM-based locks. + + * Because write locks are stored in the heap tuples rather than a +RAM-based lock table, the optimization described in the Cahill thesis +which eliminates an SIREAD lock where there is a write lock is +implemented by the following: + 1. When checking a heap write for conflicts against existing +predicate locks, a tuple lock on the tuple being written is removed. + 2. When acquiring a predicate lock on a heap tuple, we +return quickly without doing anything if it is a tuple written by the +reading transaction. + + * Rather than using conflictIn and conflictOut pointers which use +NULL to indicate no conflict and a self-reference to indicate +multiple conflicts or conflicts with committed transactions, we use a +list of rw-conflicts. With the more complete information, false +positives are reduced and we have sufficient data for more aggressive +clean-up and other optimizations: + + o We can avoid ever rolling back a transaction until and +unless there is a pivot where a transaction on the conflict *out* +side of the pivot committed before either of the other transactions. + + o We can avoid ever rolling back a transaction when the +transaction on the conflict *in* side of the pivot is explicitly or +implicitly READ ONLY unless the transaction on the conflict *out* +side of the pivot committed before the READ ONLY transaction acquired +its snapshot. (An implicit READ ONLY transaction is one which +committed without writing, even though it was not explicitly declared +to be READ ONLY.) + + o We can more aggressively clean up conflicts, predicate +locks, and SSI transaction information. + + * We allow a READ ONLY transaction to "opt out" of SSI if there are +no READ WRITE transactions which could cause the READ ONLY +transaction to ever become part of a "dangerous structure" of +overlapping transaction dependencies. + + * We allow the user to request that a READ ONLY transaction wait +until the conditions are right for it to start in the "opt out" state +described above. We add a DEFERRABLE state to transactions, which is +specified and maintained in a way similar to READ ONLY. It is +ignored for transactions that are not SERIALIZABLE and READ ONLY. + + * When a transaction must be rolled back, we pick among the +active transactions such that an immediate retry will not fail again +on conflicts with the same transactions. + + * We use the PostgreSQL SLRU system to hold summarized +information about older committed transactions to put an upper bound +on RAM used. Beyond that limit, information spills to disk. +Performance can degrade in a pessimal situation, but it should be +tolerable, and transactions won't need to be canceled or blocked +from starting. + + +R&D Issues +---------- + +This is intended to be the place to record specific issues which need +more detailed review or analysis. + + * WAL file replay. While serializable implementations using S2PL +can guarantee that the write-ahead log contains commits in a sequence +consistent with some serial execution of serializable transactions, +SSI cannot make that guarantee. While the WAL replay is no less +consistent than under snapshot isolation, it is possible that under +PITR recovery or hot standby a database could reach a readable state +where some transactions appear before other transactions which would +have had to precede them to maintain serializable consistency. In +essence, if we do nothing, WAL replay will be at snapshot isolation +even for serializable transactions. Is this OK? If not, how do we +address it? + + * External replication. Look at how this impacts external +replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc. +This is related to the "WAL file replay" issue. + + * UNIQUE btree search for equality on all columns. Since a search +of a UNIQUE index using equality tests on all columns will lock the +heap tuple if an entry is found, it appears that there is no need to +get a predicate lock on the index in that case. A predicate lock is +still needed for such a search if a matching index entry which points +to a visible tuple is not found. + + * Minimize touching of shared memory. Should lists in shared +memory push entries which have just been returned to the front of the +available list, so they will be popped back off soon and some memory +might never be touched, or should we keep adding returned items to +the end of the available list? + + +References +---------- + +[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt +Search for serial execution to find the relevant section. + +[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM +Transactions on Database Systems 30:2, Jun. 2005. +http://dx.doi.org/10.1145/1071610.1071615 + +[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007. +Architecture of a Database System. Foundations and Trends(R) in +Databases Vol. 1, No. 2 (2007) 141-259. +http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf + Of particular interest: + * 6.1 A Note on ACID + * 6.2 A Brief Review of Serializability + * 6.3 Locking and Latching + * 6.3.1 Transaction Isolation Levels + * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier new file mode 100644 index 0000000..f78e5ac --- /dev/null +++ b/src/backend/storage/lmgr/README.barrier @@ -0,0 +1,197 @@ +Memory Barriers +=============== + +Modern CPUs make extensive use of pipe-lining and out-of-order execution, +meaning that the CPU is often executing more than one instruction at a +time, and not necessarily in the order that the source code would suggest. +Furthermore, even before the CPU gets a chance to reorder operations, the +compiler may (and often does) reorganize the code for greater efficiency, +particularly at higher optimization levels. Optimizing compilers and +out-of-order execution are both critical for good performance, but they +can lead to surprising results when multiple processes access the same +memory space. + +Example +======= + +Suppose x is a pointer to a structure stored in shared memory, and that the +entire structure has been initialized to zero bytes. One backend executes +the following code fragment: + + x->foo = 1; + x->bar = 1; + +Meanwhile, at approximately the same time, another backend executes this +code fragment: + + bar = x->bar; + foo = x->foo; + +The second backend might end up with foo = 1 and bar = 1 (if it executes +both statements after the first backend), or with foo = 0 and bar = 0 (if +it executes both statements before the first backend), or with foo = 1 and +bar = 0 (if the first backend executes the first statement, the second +backend executes both statements, and then the first backend executes the +second statement). + +Surprisingly, however, the second backend could also end up with foo = 0 +and bar = 1. The compiler might swap the order of the two stores performed +by the first backend, or the two loads performed by the second backend. +Even if it doesn't, on a machine with weak memory ordering (such as PowerPC +or ARM) the CPU might choose to execute either the loads or the stores +out of order. This surprising result can lead to bugs. + +A common pattern where this actually does result in a bug is when adding items +onto a queue. The writer does this: + + q->items[q->num_items] = new_item; + ++q->num_items; + +The reader does this: + + num_items = q->num_items; + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +This code turns out to be unsafe, because the writer might increment +q->num_items before it finishes storing the new item into the appropriate slot. +More subtly, the reader might prefetch the contents of the q->items array +before reading q->num_items. Thus, there's still a bug here *even if the +writer does everything in the order we expect*. We need the writer to update +the array before bumping the item counter, and the reader to examine the item +counter before examining the array. + +Note that these types of highly counterintuitive bugs can *only* occur when +multiple processes are interacting with the same memory segment. A given +process always perceives its *own* writes to memory in program order. + +Avoiding Memory Ordering Bugs +============================= + +The simplest (and often best) way to avoid memory ordering bugs is to +protect the data structures involved with an lwlock. For more details, see +src/backend/storage/lmgr/README. For instance, in the above example, the +writer could acquire an lwlock in exclusive mode before appending to the +queue, and each reader could acquire the same lock in shared mode before +reading it. If the data structure is not heavily trafficked, this solution is +generally entirely adequate. + +However, in some cases, it is desirable to avoid the overhead of acquiring +and releasing locks. In this case, memory barriers may be used to ensure +that the apparent order of execution is as the programmer desires. In +PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve +this result. In the example above, we can prevent the reader from seeing a +garbage value by having the writer do this: + + q->items[q->num_items] = new_item; + pg_memory_barrier(); + ++q->num_items; + +And by having the reader do this: + + num_items = q->num_items; + pg_memory_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +The pg_memory_barrier() macro will (1) prevent the compiler from rearranging +the code in such a way as to allow the memory accesses to occur out of order +and (2) generate any code (often, inline assembly) that is needed to prevent +the CPU from executing the memory accesses out of order. Specifically, the +barrier prevents loads and stores written after the barrier from being +performed before the barrier, and vice-versa. + +Although this code will work, it is needlessly inefficient. On systems with +strong memory ordering (such as x86), the CPU never reorders loads with other +loads, nor stores with other stores. It can, however, allow a load to be +performed before a subsequent store. To avoid emitting unnecessary memory +instructions, we provide two additional primitives: pg_read_barrier(), and +pg_write_barrier(). When a memory barrier is being used to separate two +loads, use pg_read_barrier(); when it is separating two stores, use +pg_write_barrier(); when it is a separating a load and a store (in either +order), use pg_memory_barrier(). pg_memory_barrier() can always substitute +for either a read or a write barrier, but is typically more expensive, and +therefore should be used only when needed. + +With these guidelines in mind, the writer can do this: + + q->items[q->num_items] = new_item; + pg_write_barrier(); + ++q->num_items; + +And the reader can do this: + + num_items = q->num_items; + pg_read_barrier(); + for (i = 0; i < num_items; ++i) + /* do something with q->items[i] */ + +On machines with strong memory ordering, these weaker barriers will simply +prevent compiler rearrangement, without emitting any actual machine code. +On machines with weak memory ordering, they will prevent compiler +reordering and also emit whatever hardware barrier may be required. Even +on machines with weak memory ordering, a read or write barrier may be able +to use a less expensive instruction than a full barrier. + +Weaknesses of Memory Barriers +============================= + +While memory barriers are a powerful tool, and much cheaper than locks, they +are also much less capable than locks. Here are some of the problems. + +1. Concurrent writers are unsafe. In the above example of a queue, using +memory barriers doesn't make it safe for two processes to add items to the +same queue at the same time. If more than one process can write to the queue, +a spinlock or lwlock must be used to synchronize access. The readers can +perhaps proceed without any lock, but the writers may not. + +Even very simple write operations often require additional synchronization. +For example, it's not safe for multiple writers to simultaneously execute +this code (supposing x is a pointer into shared memory): + + x->foo++; + +Although this may compile down to a single machine-language instruction, +the CPU will execute that instruction by reading the current value of foo, +adding one to it, and then storing the result back to the original address. +If two CPUs try to do this simultaneously, both may do their reads before +either one does their writes. Such a case could be made safe by using an +atomic variable and an atomic add. See port/atomics.h. + +2. Eight-byte loads and stores aren't necessarily atomic. We assume in +various places in the source code that an aligned four-byte load or store is +atomic, and that other processes therefore won't see a half-set value. +Sadly, the same can't be said for eight-byte value: on some platforms, an +aligned eight-byte load or store will generate two four-byte operations. If +you need an atomic eight-byte read or write, you must either serialize access +with a lock or use an atomic variable. + +3. No ordering guarantees. While memory barriers ensure that any given +process performs loads and stores to shared memory in order, they don't +guarantee synchronization. In the queue example above, we can use memory +barriers to be sure that readers won't see garbage, but there's nothing to +say whether a given reader will run before or after a given writer. If this +matters in a given situation, some other mechanism must be used instead of +or in addition to memory barriers. + +4. Barrier proliferation. Many algorithms that at first seem appealing +require multiple barriers. If the number of barriers required is more than +one or two, you may be better off just using a lock. Keep in mind that, on +some platforms, a barrier may be implemented by acquiring and releasing a +backend-private spinlock. This may be better than a centralized lock under +contention, but it may also be slower in the uncontended case. + +Further Reading +=============== + +Much of the documentation about memory barriers appears to be quite +Linux-specific. The following papers may be helpful: + +Memory Ordering in Modern Microprocessors, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf + +Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf + +The Linux kernel also has some useful documentation on this topic. Start +with Documentation/memory-barriers.txt diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c new file mode 100644 index 0000000..80d70c1 --- /dev/null +++ b/src/backend/storage/lmgr/condition_variable.c @@ -0,0 +1,364 @@ +/*------------------------------------------------------------------------- + * + * condition_variable.c + * Implementation of condition variables. Condition variables provide + * a way for one process to wait until a specific condition occurs, + * without needing to know the specific identity of the process for + * which they are waiting. Waits for condition variables can be + * interrupted, unlike LWLock waits. Condition variables are safe + * to use within dynamic shared memory segments. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/lmgr/condition_variable.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "portability/instr_time.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +/* Initially, we are not prepared to sleep on any condition variable. */ +static ConditionVariable *cv_sleep_target = NULL; + +/* + * Initialize a condition variable. + */ +void +ConditionVariableInit(ConditionVariable *cv) +{ + SpinLockInit(&cv->mutex); + proclist_init(&cv->wakeup); +} + +/* + * Prepare to wait on a given condition variable. + * + * This can optionally be called before entering a test/sleep loop. + * Doing so is more efficient if we'll need to sleep at least once. + * However, if the first test of the exit condition is likely to succeed, + * it's more efficient to omit the ConditionVariablePrepareToSleep call. + * See comments in ConditionVariableSleep for more detail. + * + * Caution: "before entering the loop" means you *must* test the exit + * condition between calling ConditionVariablePrepareToSleep and calling + * ConditionVariableSleep. If that is inconvenient, omit calling + * ConditionVariablePrepareToSleep. + */ +void +ConditionVariablePrepareToSleep(ConditionVariable *cv) +{ + int pgprocno = MyProc->pgprocno; + + /* + * If some other sleep is already prepared, cancel it; this is necessary + * because we have just one static variable tracking the prepared sleep, + * and also only one cvWaitLink in our PGPROC. It's okay to do this + * because whenever control does return to the other test-and-sleep loop, + * its ConditionVariableSleep call will just re-establish that sleep as + * the prepared one. + */ + if (cv_sleep_target != NULL) + ConditionVariableCancelSleep(); + + /* Record the condition variable on which we will sleep. */ + cv_sleep_target = cv; + + /* Add myself to the wait queue. */ + SpinLockAcquire(&cv->mutex); + proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink); + SpinLockRelease(&cv->mutex); +} + +/* + * Wait for the given condition variable to be signaled. + * + * This should be called in a predicate loop that tests for a specific exit + * condition and otherwise sleeps, like so: + * + * ConditionVariablePrepareToSleep(cv); // optional + * while (condition for which we are waiting is not true) + * ConditionVariableSleep(cv, wait_event_info); + * ConditionVariableCancelSleep(); + * + * wait_event_info should be a value from one of the WaitEventXXX enums + * defined in pgstat.h. This controls the contents of pg_stat_activity's + * wait_event_type and wait_event columns while waiting. + */ +void +ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info) +{ + (void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ , + wait_event_info); +} + +/* + * Wait for a condition variable to be signaled or a timeout to be reached. + * + * Returns true when timeout expires, otherwise returns false. + * + * See ConditionVariableSleep() for general usage. + */ +bool +ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, + uint32 wait_event_info) +{ + long cur_timeout = -1; + instr_time start_time; + instr_time cur_time; + int wait_events; + + /* + * If the caller didn't prepare to sleep explicitly, then do so now and + * return immediately. The caller's predicate loop should immediately + * call again if its exit condition is not yet met. This will result in + * the exit condition being tested twice before we first sleep. The extra + * test can be prevented by calling ConditionVariablePrepareToSleep(cv) + * first. Whether it's worth doing that depends on whether you expect the + * exit condition to be met initially, in which case skipping the prepare + * is recommended because it avoids manipulations of the wait list, or not + * met initially, in which case preparing first is better because it + * avoids one extra test of the exit condition. + * + * If we are currently prepared to sleep on some other CV, we just cancel + * that and prepare this one; see ConditionVariablePrepareToSleep. + */ + if (cv_sleep_target != cv) + { + ConditionVariablePrepareToSleep(cv); + return false; + } + + /* + * Record the current time so that we can calculate the remaining timeout + * if we are woken up spuriously. + */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH; + } + else + wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH; + + while (true) + { + bool done = false; + + /* + * Wait for latch to be set. (If we're awakened for some other + * reason, the code below will cope anyway.) + */ + (void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info); + + /* Reset latch before examining the state of the wait list. */ + ResetLatch(MyLatch); + + /* + * If this process has been taken out of the wait list, then we know + * that it has been signaled by ConditionVariableSignal (or + * ConditionVariableBroadcast), so we should return to the caller. But + * that doesn't guarantee that the exit condition is met, only that we + * ought to check it. So we must put the process back into the wait + * list, to ensure we don't miss any additional wakeup occurring while + * the caller checks its exit condition. We can take ourselves out of + * the wait list only when the caller calls + * ConditionVariableCancelSleep. + * + * If we're still in the wait list, then the latch must have been set + * by something other than ConditionVariableSignal; though we don't + * guarantee not to return spuriously, we'll avoid this obvious case. + */ + SpinLockAcquire(&cv->mutex); + if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink)) + { + done = true; + proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink); + } + SpinLockRelease(&cv->mutex); + + /* + * Check for interrupts, and return spuriously if that caused the + * current sleep target to change (meaning that interrupt handler code + * waited for a different condition variable). + */ + CHECK_FOR_INTERRUPTS(); + if (cv != cv_sleep_target) + done = true; + + /* We were signaled, so return */ + if (done) + return false; + + /* If we're not done, update cur_timeout for next iteration */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + + /* Have we crossed the timeout threshold? */ + if (cur_timeout <= 0) + return true; + } + } +} + +/* + * Cancel any pending sleep operation. + * + * We just need to remove ourselves from the wait queue of any condition + * variable for which we have previously prepared a sleep. + * + * Do nothing if nothing is pending; this allows this function to be called + * during transaction abort to clean up any unfinished CV sleep. + */ +void +ConditionVariableCancelSleep(void) +{ + ConditionVariable *cv = cv_sleep_target; + bool signaled = false; + + if (cv == NULL) + return; + + SpinLockAcquire(&cv->mutex); + if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink)) + proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink); + else + signaled = true; + SpinLockRelease(&cv->mutex); + + /* + * If we've received a signal, pass it on to another waiting process, if + * there is one. Otherwise a call to ConditionVariableSignal() might get + * lost, despite there being another process ready to handle it. + */ + if (signaled) + ConditionVariableSignal(cv); + + cv_sleep_target = NULL; +} + +/* + * Wake up the oldest process sleeping on the CV, if there is any. + * + * Note: it's difficult to tell whether this has any real effect: we know + * whether we took an entry off the list, but the entry might only be a + * sentinel. Hence, think twice before proposing that this should return + * a flag telling whether it woke somebody. + */ +void +ConditionVariableSignal(ConditionVariable *cv) +{ + PGPROC *proc = NULL; + + /* Remove the first process from the wakeup queue (if any). */ + SpinLockAcquire(&cv->mutex); + if (!proclist_is_empty(&cv->wakeup)) + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + SpinLockRelease(&cv->mutex); + + /* If we found someone sleeping, set their latch to wake them up. */ + if (proc != NULL) + SetLatch(&proc->procLatch); +} + +/* + * Wake up all processes sleeping on the given CV. + * + * This guarantees to wake all processes that were sleeping on the CV + * at time of call, but processes that add themselves to the list mid-call + * will typically not get awakened. + */ +void +ConditionVariableBroadcast(ConditionVariable *cv) +{ + int pgprocno = MyProc->pgprocno; + PGPROC *proc = NULL; + bool have_sentinel = false; + + /* + * In some use-cases, it is common for awakened processes to immediately + * re-queue themselves. If we just naively try to reduce the wakeup list + * to empty, we'll get into a potentially-indefinite loop against such a + * process. The semantics we really want are just to be sure that we have + * wakened all processes that were in the list at entry. We can use our + * own cvWaitLink as a sentinel to detect when we've finished. + * + * A seeming flaw in this approach is that someone else might signal the + * CV and in doing so remove our sentinel entry. But that's fine: since + * CV waiters are always added and removed in order, that must mean that + * every previous waiter has been wakened, so we're done. We'll get an + * extra "set" on our latch from the someone else's signal, which is + * slightly inefficient but harmless. + * + * We can't insert our cvWaitLink as a sentinel if it's already in use in + * some other proclist. While that's not expected to be true for typical + * uses of this function, we can deal with it by simply canceling any + * prepared CV sleep. The next call to ConditionVariableSleep will take + * care of re-establishing the lost state. + */ + if (cv_sleep_target != NULL) + ConditionVariableCancelSleep(); + + /* + * Inspect the state of the queue. If it's empty, we have nothing to do. + * If there's exactly one entry, we need only remove and signal that + * entry. Otherwise, remove the first entry and insert our sentinel. + */ + SpinLockAcquire(&cv->mutex); + /* While we're here, let's assert we're not in the list. */ + Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink)); + + if (!proclist_is_empty(&cv->wakeup)) + { + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + if (!proclist_is_empty(&cv->wakeup)) + { + proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink); + have_sentinel = true; + } + } + SpinLockRelease(&cv->mutex); + + /* Awaken first waiter, if there was one. */ + if (proc != NULL) + SetLatch(&proc->procLatch); + + while (have_sentinel) + { + /* + * Each time through the loop, remove the first wakeup list entry, and + * signal it unless it's our sentinel. Repeat as long as the sentinel + * remains in the list. + * + * Notice that if someone else removes our sentinel, we will waken one + * additional process before exiting. That's intentional, because if + * someone else signals the CV, they may be intending to waken some + * third process that added itself to the list after we added the + * sentinel. Better to give a spurious wakeup (which should be + * harmless beyond wasting some cycles) than to lose a wakeup. + */ + proc = NULL; + SpinLockAcquire(&cv->mutex); + if (!proclist_is_empty(&cv->wakeup)) + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink); + SpinLockRelease(&cv->mutex); + + if (proc != NULL && proc != MyProc) + SetLatch(&proc->procLatch); + } +} diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c new file mode 100644 index 0000000..67733c0 --- /dev/null +++ b/src/backend/storage/lmgr/deadlock.c @@ -0,0 +1,1177 @@ +/*------------------------------------------------------------------------- + * + * deadlock.c + * POSTGRES deadlock detection code + * + * See src/backend/storage/lmgr/README for a description of the deadlock + * detection and resolution algorithms. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/deadlock.c + * + * Interface: + * + * DeadLockCheck() + * DeadLockReport() + * RememberSimpleDeadLock() + * InitDeadLockChecking() + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" + + +/* + * One edge in the waits-for graph. + * + * waiter and blocker may or may not be members of a lock group, but if either + * is, it will be the leader rather than any other member of the lock group. + * The group leaders act as representatives of the whole group even though + * those particular processes need not be waiting at all. There will be at + * least one member of the waiter's lock group on the wait queue for the given + * lock, maybe more. + */ +typedef struct +{ + PGPROC *waiter; /* the leader of the waiting lock group */ + PGPROC *blocker; /* the leader of the group it is waiting for */ + LOCK *lock; /* the lock being waited for */ + int pred; /* workspace for TopoSort */ + int link; /* workspace for TopoSort */ +} EDGE; + +/* One potential reordering of a lock's wait queue */ +typedef struct +{ + LOCK *lock; /* the lock whose wait queue is described */ + PGPROC **procs; /* array of PGPROC *'s in new wait order */ + int nProcs; +} WAIT_ORDER; + +/* + * Information saved about each edge in a detected deadlock cycle. This + * is used to print a diagnostic message upon failure. + * + * Note: because we want to examine this info after releasing the lock + * manager's partition locks, we can't just store LOCK and PGPROC pointers; + * we must extract out all the info we want to be able to print. + */ +typedef struct +{ + LOCKTAG locktag; /* ID of awaited lock object */ + LOCKMODE lockmode; /* type of lock we're waiting for */ + int pid; /* PID of blocked backend */ +} DEADLOCK_INFO; + + +static bool DeadLockCheckRecurse(PGPROC *proc); +static int TestConfiguration(PGPROC *startProc); +static bool FindLockCycle(PGPROC *checkProc, + EDGE *softEdges, int *nSoftEdges); +static bool FindLockCycleRecurse(PGPROC *checkProc, int depth, + EDGE *softEdges, int *nSoftEdges); +static bool FindLockCycleRecurseMember(PGPROC *checkProc, + PGPROC *checkProcLeader, + int depth, EDGE *softEdges, int *nSoftEdges); +static bool ExpandConstraints(EDGE *constraints, int nConstraints); +static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints, + PGPROC **ordering); + +#ifdef DEBUG_DEADLOCK +static void PrintLockQueue(LOCK *lock, const char *info); +#endif + + +/* + * Working space for the deadlock detector + */ + +/* Workspace for FindLockCycle */ +static PGPROC **visitedProcs; /* Array of visited procs */ +static int nVisitedProcs; + +/* Workspace for TopoSort */ +static PGPROC **topoProcs; /* Array of not-yet-output procs */ +static int *beforeConstraints; /* Counts of remaining before-constraints */ +static int *afterConstraints; /* List head for after-constraints */ + +/* Output area for ExpandConstraints */ +static WAIT_ORDER *waitOrders; /* Array of proposed queue rearrangements */ +static int nWaitOrders; +static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */ + +/* Current list of constraints being considered */ +static EDGE *curConstraints; +static int nCurConstraints; +static int maxCurConstraints; + +/* Storage space for results from FindLockCycle */ +static EDGE *possibleConstraints; +static int nPossibleConstraints; +static int maxPossibleConstraints; +static DEADLOCK_INFO *deadlockDetails; +static int nDeadlockDetails; + +/* PGPROC pointer of any blocking autovacuum worker found */ +static PGPROC *blocking_autovacuum_proc = NULL; + + +/* + * InitDeadLockChecking -- initialize deadlock checker during backend startup + * + * This does per-backend initialization of the deadlock checker; primarily, + * allocation of working memory for DeadLockCheck. We do this per-backend + * since there's no percentage in making the kernel do copy-on-write + * inheritance of workspace from the postmaster. We want to allocate the + * space at startup because (a) the deadlock checker might be invoked when + * there's no free memory left, and (b) the checker is normally run inside a + * signal handler, which is a very dangerous place to invoke palloc from. + */ +void +InitDeadLockChecking(void) +{ + MemoryContext oldcxt; + + /* Make sure allocations are permanent */ + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + + /* + * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and + * deadlockDetails[]. + */ + visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *)); + deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO)); + + /* + * TopoSort needs to consider at most MaxBackends wait-queue entries, and + * it needn't run concurrently with FindLockCycle. + */ + topoProcs = visitedProcs; /* re-use this space */ + beforeConstraints = (int *) palloc(MaxBackends * sizeof(int)); + afterConstraints = (int *) palloc(MaxBackends * sizeof(int)); + + /* + * We need to consider rearranging at most MaxBackends/2 wait queues + * (since it takes at least two waiters in a queue to create a soft edge), + * and the expanded form of the wait queues can't involve more than + * MaxBackends total waiters. + */ + waitOrders = (WAIT_ORDER *) + palloc((MaxBackends / 2) * sizeof(WAIT_ORDER)); + waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *)); + + /* + * Allow at most MaxBackends distinct constraints in a configuration. (Is + * this enough? In practice it seems it should be, but I don't quite see + * how to prove it. If we run out, we might fail to find a workable wait + * queue rearrangement even though one exists.) NOTE that this number + * limits the maximum recursion depth of DeadLockCheckRecurse. Making it + * really big might potentially allow a stack-overflow problem. + */ + maxCurConstraints = MaxBackends; + curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE)); + + /* + * Allow up to 3*MaxBackends constraints to be saved without having to + * re-run TestConfiguration. (This is probably more than enough, but we + * can survive if we run low on space by doing excess runs of + * TestConfiguration to re-compute constraint lists each time needed.) The + * last MaxBackends entries in possibleConstraints[] are reserved as + * output workspace for FindLockCycle. + */ + maxPossibleConstraints = MaxBackends * 4; + possibleConstraints = + (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE)); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * DeadLockCheck -- Checks for deadlocks for a given process + * + * This code looks for deadlocks involving the given process. If any + * are found, it tries to rearrange lock wait queues to resolve the + * deadlock. If resolution is impossible, return DS_HARD_DEADLOCK --- + * the caller is then expected to abort the given proc's transaction. + * + * Caller must already have locked all partitions of the lock tables. + * + * On failure, deadlock details are recorded in deadlockDetails[] for + * subsequent printing by DeadLockReport(). That activity is separate + * because (a) we don't want to do it while holding all those LWLocks, + * and (b) we are typically invoked inside a signal handler. + */ +DeadLockState +DeadLockCheck(PGPROC *proc) +{ + int i, + j; + + /* Initialize to "no constraints" */ + nCurConstraints = 0; + nPossibleConstraints = 0; + nWaitOrders = 0; + + /* Initialize to not blocked by an autovacuum worker */ + blocking_autovacuum_proc = NULL; + + /* Search for deadlocks and possible fixes */ + if (DeadLockCheckRecurse(proc)) + { + /* + * Call FindLockCycle one more time, to record the correct + * deadlockDetails[] for the basic state with no rearrangements. + */ + int nSoftEdges; + + TRACE_POSTGRESQL_DEADLOCK_FOUND(); + + nWaitOrders = 0; + if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges)) + elog(FATAL, "deadlock seems to have disappeared"); + + return DS_HARD_DEADLOCK; /* cannot find a non-deadlocked state */ + } + + /* Apply any needed rearrangements of wait queues */ + for (i = 0; i < nWaitOrders; i++) + { + LOCK *lock = waitOrders[i].lock; + PGPROC **procs = waitOrders[i].procs; + int nProcs = waitOrders[i].nProcs; + PROC_QUEUE *waitQueue = &(lock->waitProcs); + + Assert(nProcs == waitQueue->size); + +#ifdef DEBUG_DEADLOCK + PrintLockQueue(lock, "DeadLockCheck:"); +#endif + + /* Reset the queue and re-add procs in the desired order */ + ProcQueueInit(waitQueue); + for (j = 0; j < nProcs; j++) + { + SHMQueueInsertBefore(&(waitQueue->links), &(procs[j]->links)); + waitQueue->size++; + } + +#ifdef DEBUG_DEADLOCK + PrintLockQueue(lock, "rearranged to:"); +#endif + + /* See if any waiters for the lock can be woken up now */ + ProcLockWakeup(GetLocksMethodTable(lock), lock); + } + + /* Return code tells caller if we had to escape a deadlock or not */ + if (nWaitOrders > 0) + return DS_SOFT_DEADLOCK; + else if (blocking_autovacuum_proc != NULL) + return DS_BLOCKED_BY_AUTOVACUUM; + else + return DS_NO_DEADLOCK; +} + +/* + * Return the PGPROC of the autovacuum that's blocking a process. + * + * We reset the saved pointer as soon as we pass it back. + */ +PGPROC * +GetBlockingAutoVacuumPgproc(void) +{ + PGPROC *ptr; + + ptr = blocking_autovacuum_proc; + blocking_autovacuum_proc = NULL; + + return ptr; +} + +/* + * DeadLockCheckRecurse -- recursively search for valid orderings + * + * curConstraints[] holds the current set of constraints being considered + * by an outer level of recursion. Add to this each possible solution + * constraint for any cycle detected at this level. + * + * Returns true if no solution exists. Returns false if a deadlock-free + * state is attainable, in which case waitOrders[] shows the required + * rearrangements of lock wait queues (if any). + */ +static bool +DeadLockCheckRecurse(PGPROC *proc) +{ + int nEdges; + int oldPossibleConstraints; + bool savedList; + int i; + + nEdges = TestConfiguration(proc); + if (nEdges < 0) + return true; /* hard deadlock --- no solution */ + if (nEdges == 0) + return false; /* good configuration found */ + if (nCurConstraints >= maxCurConstraints) + return true; /* out of room for active constraints? */ + oldPossibleConstraints = nPossibleConstraints; + if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints) + { + /* We can save the edge list in possibleConstraints[] */ + nPossibleConstraints += nEdges; + savedList = true; + } + else + { + /* Not room; will need to regenerate the edges on-the-fly */ + savedList = false; + } + + /* + * Try each available soft edge as an addition to the configuration. + */ + for (i = 0; i < nEdges; i++) + { + if (!savedList && i > 0) + { + /* Regenerate the list of possible added constraints */ + if (nEdges != TestConfiguration(proc)) + elog(FATAL, "inconsistent results during deadlock check"); + } + curConstraints[nCurConstraints] = + possibleConstraints[oldPossibleConstraints + i]; + nCurConstraints++; + if (!DeadLockCheckRecurse(proc)) + return false; /* found a valid solution! */ + /* give up on that added constraint, try again */ + nCurConstraints--; + } + nPossibleConstraints = oldPossibleConstraints; + return true; /* no solution found */ +} + + +/*-------------------- + * Test a configuration (current set of constraints) for validity. + * + * Returns: + * 0: the configuration is good (no deadlocks) + * -1: the configuration has a hard deadlock or is not self-consistent + * >0: the configuration has one or more soft deadlocks + * + * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily + * and a list of its soft edges is returned beginning at + * possibleConstraints+nPossibleConstraints. The return value is the + * number of soft edges. + *-------------------- + */ +static int +TestConfiguration(PGPROC *startProc) +{ + int softFound = 0; + EDGE *softEdges = possibleConstraints + nPossibleConstraints; + int nSoftEdges; + int i; + + /* + * Make sure we have room for FindLockCycle's output. + */ + if (nPossibleConstraints + MaxBackends > maxPossibleConstraints) + return -1; + + /* + * Expand current constraint set into wait orderings. Fail if the + * constraint set is not self-consistent. + */ + if (!ExpandConstraints(curConstraints, nCurConstraints)) + return -1; + + /* + * Check for cycles involving startProc or any of the procs mentioned in + * constraints. We check startProc last because if it has a soft cycle + * still to be dealt with, we want to deal with that first. + */ + for (i = 0; i < nCurConstraints; i++) + { + if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + } + if (FindLockCycle(startProc, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + return softFound; +} + + +/* + * FindLockCycle -- basic check for deadlock cycles + * + * Scan outward from the given proc to see if there is a cycle in the + * waits-for graph that includes this proc. Return true if a cycle + * is found, else false. If a cycle is found, we return a list of + * the "soft edges", if any, included in the cycle. These edges could + * potentially be eliminated by rearranging wait queues. We also fill + * deadlockDetails[] with information about the detected cycle; this info + * is not used by the deadlock algorithm itself, only to print a useful + * message after failing. + * + * Since we need to be able to check hypothetical configurations that would + * exist after wait queue rearrangement, the routine pays attention to the + * table of hypothetical queue orders in waitOrders[]. These orders will + * be believed in preference to the actual ordering seen in the locktable. + */ +static bool +FindLockCycle(PGPROC *checkProc, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + nVisitedProcs = 0; + nDeadlockDetails = 0; + *nSoftEdges = 0; + return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges); +} + +static bool +FindLockCycleRecurse(PGPROC *checkProc, + int depth, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + int i; + dlist_iter iter; + + /* + * If this process is a lock group member, check the leader instead. (Note + * that we might be the leader, in which case this is a no-op.) + */ + if (checkProc->lockGroupLeader != NULL) + checkProc = checkProc->lockGroupLeader; + + /* + * Have we already seen this proc? + */ + for (i = 0; i < nVisitedProcs; i++) + { + if (visitedProcs[i] == checkProc) + { + /* If we return to starting point, we have a deadlock cycle */ + if (i == 0) + { + /* + * record total length of cycle --- outer levels will now fill + * deadlockDetails[] + */ + Assert(depth <= MaxBackends); + nDeadlockDetails = depth; + + return true; + } + + /* + * Otherwise, we have a cycle but it does not include the start + * point, so say "no deadlock". + */ + return false; + } + } + /* Mark proc as seen */ + Assert(nVisitedProcs < MaxBackends); + visitedProcs[nVisitedProcs++] = checkProc; + + /* + * If the process is waiting, there is an outgoing waits-for edge to each + * process that blocks it. + */ + if (checkProc->links.next != NULL && checkProc->waitLock != NULL && + FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges, + nSoftEdges)) + return true; + + /* + * If the process is not waiting, there could still be outgoing waits-for + * edges if it is part of a lock group, because other members of the lock + * group might be waiting even though this process is not. (Given lock + * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2, + * that is a deadlock even neither of B1 and A2 are waiting for anything.) + */ + dlist_foreach(iter, &checkProc->lockGroupMembers) + { + PGPROC *memberProc; + + memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur); + + if (memberProc->links.next != NULL && memberProc->waitLock != NULL && + memberProc != checkProc && + FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges, + nSoftEdges)) + return true; + } + + return false; +} + +static bool +FindLockCycleRecurseMember(PGPROC *checkProc, + PGPROC *checkProcLeader, + int depth, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + PGPROC *proc; + LOCK *lock = checkProc->waitLock; + PROCLOCK *proclock; + SHM_QUEUE *procLocks; + LockMethod lockMethodTable; + PROC_QUEUE *waitQueue; + int queue_size; + int conflictMask; + int i; + int numLockModes, + lm; + + /* + * The relation extension or page lock can never participate in actual + * deadlock cycle. See Asserts in LockAcquireExtended. So, there is no + * advantage in checking wait edges from them. + */ + if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND || + (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE)) + return false; + + lockMethodTable = GetLocksMethodTable(lock); + numLockModes = lockMethodTable->numLockModes; + conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode]; + + /* + * Scan for procs that already hold conflicting locks. These are "hard" + * edges in the waits-for graph. + */ + procLocks = &(lock->procLocks); + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); + + while (proclock) + { + PGPROC *leader; + + proc = proclock->tag.myProc; + leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader; + + /* A proc never blocks itself or any other lock group member */ + if (leader != checkProcLeader) + { + for (lm = 1; lm <= numLockModes; lm++) + { + if ((proclock->holdMask & LOCKBIT_ON(lm)) && + (conflictMask & LOCKBIT_ON(lm))) + { + /* This proc hard-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + return true; + } + + /* + * No deadlock here, but see if this proc is an autovacuum + * that is directly hard-blocking our own proc. If so, + * report it so that the caller can send a cancel signal + * to it, if appropriate. If there's more than one such + * proc, it's indeterminate which one will be reported. + * + * We don't touch autovacuums that are indirectly blocking + * us; it's up to the direct blockee to take action. This + * rule simplifies understanding the behavior and ensures + * that an autovacuum won't be canceled with less than + * deadlock_timeout grace period. + * + * Note we read statusFlags without any locking. This is + * OK only for checking the PROC_IS_AUTOVACUUM flag, + * because that flag is set at process start and never + * reset. There is logic elsewhere to avoid canceling an + * autovacuum that is working to prevent XID wraparound + * problems (which needs to read a different statusFlags + * bit), but we don't do that here to avoid grabbing + * ProcArrayLock. + */ + if (checkProc == MyProc && + proc->statusFlags & PROC_IS_AUTOVACUUM) + blocking_autovacuum_proc = proc; + + /* We're done looking at this proclock */ + break; + } + } + } + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + /* + * Scan for procs that are ahead of this one in the lock's wait queue. + * Those that have conflicting requests soft-block this one. This must be + * done after the hard-block search, since if another proc both hard- and + * soft-blocks this one, we want to call it a hard edge. + * + * If there is a proposed re-ordering of the lock's wait order, use that + * rather than the current wait order. + */ + for (i = 0; i < nWaitOrders; i++) + { + if (waitOrders[i].lock == lock) + break; + } + + if (i < nWaitOrders) + { + /* Use the given hypothetical wait queue order */ + PGPROC **procs = waitOrders[i].procs; + + queue_size = waitOrders[i].nProcs; + + for (i = 0; i < queue_size; i++) + { + PGPROC *leader; + + proc = procs[i]; + leader = proc->lockGroupLeader == NULL ? proc : + proc->lockGroupLeader; + + /* + * TopoSort will always return an ordering with group members + * adjacent to each other in the wait queue (see comments + * therein). So, as soon as we reach a process in the same lock + * group as checkProc, we know we've found all the conflicts that + * precede any member of the lock group lead by checkProcLeader. + */ + if (leader == checkProcLeader) + break; + + /* Is there a conflict with this guy's request? */ + if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0) + { + /* This proc soft-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + /* + * Add this edge to the list of soft edges in the cycle + */ + Assert(*nSoftEdges < MaxBackends); + softEdges[*nSoftEdges].waiter = checkProcLeader; + softEdges[*nSoftEdges].blocker = leader; + softEdges[*nSoftEdges].lock = lock; + (*nSoftEdges)++; + return true; + } + } + } + } + else + { + PGPROC *lastGroupMember = NULL; + + /* Use the true lock wait queue order */ + waitQueue = &(lock->waitProcs); + + /* + * Find the last member of the lock group that is present in the wait + * queue. Anything after this is not a soft lock conflict. If group + * locking is not in use, then we know immediately which process we're + * looking for, but otherwise we've got to search the wait queue to + * find the last process actually present. + */ + if (checkProc->lockGroupLeader == NULL) + lastGroupMember = checkProc; + else + { + proc = (PGPROC *) waitQueue->links.next; + queue_size = waitQueue->size; + while (queue_size-- > 0) + { + if (proc->lockGroupLeader == checkProcLeader) + lastGroupMember = proc; + proc = (PGPROC *) proc->links.next; + } + Assert(lastGroupMember != NULL); + } + + /* + * OK, now rescan (or scan) the queue to identify the soft conflicts. + */ + queue_size = waitQueue->size; + proc = (PGPROC *) waitQueue->links.next; + while (queue_size-- > 0) + { + PGPROC *leader; + + leader = proc->lockGroupLeader == NULL ? proc : + proc->lockGroupLeader; + + /* Done when we reach the target proc */ + if (proc == lastGroupMember) + break; + + /* Is there a conflict with this guy's request? */ + if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 && + leader != checkProcLeader) + { + /* This proc soft-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + /* + * Add this edge to the list of soft edges in the cycle + */ + Assert(*nSoftEdges < MaxBackends); + softEdges[*nSoftEdges].waiter = checkProcLeader; + softEdges[*nSoftEdges].blocker = leader; + softEdges[*nSoftEdges].lock = lock; + (*nSoftEdges)++; + return true; + } + } + + proc = (PGPROC *) proc->links.next; + } + } + + /* + * No conflict detected here. + */ + return false; +} + + +/* + * ExpandConstraints -- expand a list of constraints into a set of + * specific new orderings for affected wait queues + * + * Input is a list of soft edges to be reversed. The output is a list + * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array + * workspace in waitOrderProcs[]. + * + * Returns true if able to build an ordering that satisfies all the + * constraints, false if not (there are contradictory constraints). + */ +static bool +ExpandConstraints(EDGE *constraints, + int nConstraints) +{ + int nWaitOrderProcs = 0; + int i, + j; + + nWaitOrders = 0; + + /* + * Scan constraint list backwards. This is because the last-added + * constraint is the only one that could fail, and so we want to test it + * for inconsistency first. + */ + for (i = nConstraints; --i >= 0;) + { + LOCK *lock = constraints[i].lock; + + /* Did we already make a list for this lock? */ + for (j = nWaitOrders; --j >= 0;) + { + if (waitOrders[j].lock == lock) + break; + } + if (j >= 0) + continue; + /* No, so allocate a new list */ + waitOrders[nWaitOrders].lock = lock; + waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs; + waitOrders[nWaitOrders].nProcs = lock->waitProcs.size; + nWaitOrderProcs += lock->waitProcs.size; + Assert(nWaitOrderProcs <= MaxBackends); + + /* + * Do the topo sort. TopoSort need not examine constraints after this + * one, since they must be for different locks. + */ + if (!TopoSort(lock, constraints, i + 1, + waitOrders[nWaitOrders].procs)) + return false; + nWaitOrders++; + } + return true; +} + + +/* + * TopoSort -- topological sort of a wait queue + * + * Generate a re-ordering of a lock's wait queue that satisfies given + * constraints about certain procs preceding others. (Each such constraint + * is a fact of a partial ordering.) Minimize rearrangement of the queue + * not needed to achieve the partial ordering. + * + * This is a lot simpler and slower than, for example, the topological sort + * algorithm shown in Knuth's Volume 1. However, Knuth's method doesn't + * try to minimize the damage to the existing order. In practice we are + * not likely to be working with more than a few constraints, so the apparent + * slowness of the algorithm won't really matter. + * + * The initial queue ordering is taken directly from the lock's wait queue. + * The output is an array of PGPROC pointers, of length equal to the lock's + * wait queue length (the caller is responsible for providing this space). + * The partial order is specified by an array of EDGE structs. Each EDGE + * is one that we need to reverse, therefore the "waiter" must appear before + * the "blocker" in the output array. The EDGE array may well contain + * edges associated with other locks; these should be ignored. + * + * Returns true if able to build an ordering that satisfies all the + * constraints, false if not (there are contradictory constraints). + */ +static bool +TopoSort(LOCK *lock, + EDGE *constraints, + int nConstraints, + PGPROC **ordering) /* output argument */ +{ + PROC_QUEUE *waitQueue = &(lock->waitProcs); + int queue_size = waitQueue->size; + PGPROC *proc; + int i, + j, + jj, + k, + kk, + last; + + /* First, fill topoProcs[] array with the procs in their current order */ + proc = (PGPROC *) waitQueue->links.next; + for (i = 0; i < queue_size; i++) + { + topoProcs[i] = proc; + proc = (PGPROC *) proc->links.next; + } + + /* + * Scan the constraints, and for each proc in the array, generate a count + * of the number of constraints that say it must be before something else, + * plus a list of the constraints that say it must be after something + * else. The count for the j'th proc is stored in beforeConstraints[j], + * and the head of its list in afterConstraints[j]. Each constraint + * stores its list link in constraints[i].link (note any constraint will + * be in just one list). The array index for the before-proc of the i'th + * constraint is remembered in constraints[i].pred. + * + * Note that it's not necessarily the case that every constraint affects + * this particular wait queue. Prior to group locking, a process could be + * waiting for at most one lock. But a lock group can be waiting for + * zero, one, or multiple locks. Since topoProcs[] is an array of the + * processes actually waiting, while constraints[] is an array of group + * leaders, we've got to scan through topoProcs[] for each constraint, + * checking whether both a waiter and a blocker for that group are + * present. If so, the constraint is relevant to this wait queue; if not, + * it isn't. + */ + MemSet(beforeConstraints, 0, queue_size * sizeof(int)); + MemSet(afterConstraints, 0, queue_size * sizeof(int)); + for (i = 0; i < nConstraints; i++) + { + /* + * Find a representative process that is on the lock queue and part of + * the waiting lock group. This may or may not be the leader, which + * may or may not be waiting at all. If there are any other processes + * in the same lock group on the queue, set their number of + * beforeConstraints to -1 to indicate that they should be emitted + * with their groupmates rather than considered separately. + * + * In this loop and the similar one just below, it's critical that we + * consistently select the same representative member of any one lock + * group, so that all the constraints are associated with the same + * proc, and the -1's are only associated with not-representative + * members. We select the last one in the topoProcs array. + */ + proc = constraints[i].waiter; + Assert(proc != NULL); + jj = -1; + for (j = queue_size; --j >= 0;) + { + PGPROC *waiter = topoProcs[j]; + + if (waiter == proc || waiter->lockGroupLeader == proc) + { + Assert(waiter->waitLock == lock); + if (jj == -1) + jj = j; + else + { + Assert(beforeConstraints[j] <= 0); + beforeConstraints[j] = -1; + } + } + } + + /* If no matching waiter, constraint is not relevant to this lock. */ + if (jj < 0) + continue; + + /* + * Similarly, find a representative process that is on the lock queue + * and waiting for the blocking lock group. Again, this could be the + * leader but does not need to be. + */ + proc = constraints[i].blocker; + Assert(proc != NULL); + kk = -1; + for (k = queue_size; --k >= 0;) + { + PGPROC *blocker = topoProcs[k]; + + if (blocker == proc || blocker->lockGroupLeader == proc) + { + Assert(blocker->waitLock == lock); + if (kk == -1) + kk = k; + else + { + Assert(beforeConstraints[k] <= 0); + beforeConstraints[k] = -1; + } + } + } + + /* If no matching blocker, constraint is not relevant to this lock. */ + if (kk < 0) + continue; + + Assert(beforeConstraints[jj] >= 0); + beforeConstraints[jj]++; /* waiter must come before */ + /* add this constraint to list of after-constraints for blocker */ + constraints[i].pred = jj; + constraints[i].link = afterConstraints[kk]; + afterConstraints[kk] = i + 1; + } + + /*-------------------- + * Now scan the topoProcs array backwards. At each step, output the + * last proc that has no remaining before-constraints plus any other + * members of the same lock group; then decrease the beforeConstraints + * count of each of the procs it was constrained against. + * i = index of ordering[] entry we want to output this time + * j = search index for topoProcs[] + * k = temp for scanning constraint list for proc j + * last = last non-null index in topoProcs (avoid redundant searches) + *-------------------- + */ + last = queue_size - 1; + for (i = queue_size - 1; i >= 0;) + { + int c; + int nmatches = 0; + + /* Find next candidate to output */ + while (topoProcs[last] == NULL) + last--; + for (j = last; j >= 0; j--) + { + if (topoProcs[j] != NULL && beforeConstraints[j] == 0) + break; + } + + /* If no available candidate, topological sort fails */ + if (j < 0) + return false; + + /* + * Output everything in the lock group. There's no point in + * outputting an ordering where members of the same lock group are not + * consecutive on the wait queue: if some other waiter is between two + * requests that belong to the same group, then either it conflicts + * with both of them and is certainly not a solution; or it conflicts + * with at most one of them and is thus isomorphic to an ordering + * where the group members are consecutive. + */ + proc = topoProcs[j]; + if (proc->lockGroupLeader != NULL) + proc = proc->lockGroupLeader; + Assert(proc != NULL); + for (c = 0; c <= last; ++c) + { + if (topoProcs[c] == proc || (topoProcs[c] != NULL && + topoProcs[c]->lockGroupLeader == proc)) + { + ordering[i - nmatches] = topoProcs[c]; + topoProcs[c] = NULL; + ++nmatches; + } + } + Assert(nmatches > 0); + i -= nmatches; + + /* Update beforeConstraints counts of its predecessors */ + for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link) + beforeConstraints[constraints[k - 1].pred]--; + } + + /* Done */ + return true; +} + +#ifdef DEBUG_DEADLOCK +static void +PrintLockQueue(LOCK *lock, const char *info) +{ + PROC_QUEUE *waitQueue = &(lock->waitProcs); + int queue_size = waitQueue->size; + PGPROC *proc; + int i; + + printf("%s lock %p queue ", info, lock); + proc = (PGPROC *) waitQueue->links.next; + for (i = 0; i < queue_size; i++) + { + printf(" %d", proc->pid); + proc = (PGPROC *) proc->links.next; + } + printf("\n"); + fflush(stdout); +} +#endif + +/* + * Report a detected deadlock, with available details. + */ +void +DeadLockReport(void) +{ + StringInfoData clientbuf; /* errdetail for client */ + StringInfoData logbuf; /* errdetail for server log */ + StringInfoData locktagbuf; + int i; + + initStringInfo(&clientbuf); + initStringInfo(&logbuf); + initStringInfo(&locktagbuf); + + /* Generate the "waits for" lines sent to the client */ + for (i = 0; i < nDeadlockDetails; i++) + { + DEADLOCK_INFO *info = &deadlockDetails[i]; + int nextpid; + + /* The last proc waits for the first one... */ + if (i < nDeadlockDetails - 1) + nextpid = info[1].pid; + else + nextpid = deadlockDetails[0].pid; + + /* reset locktagbuf to hold next object description */ + resetStringInfo(&locktagbuf); + + DescribeLockTag(&locktagbuf, &info->locktag); + + if (i > 0) + appendStringInfoChar(&clientbuf, '\n'); + + appendStringInfo(&clientbuf, + _("Process %d waits for %s on %s; blocked by process %d."), + info->pid, + GetLockmodeName(info->locktag.locktag_lockmethodid, + info->lockmode), + locktagbuf.data, + nextpid); + } + + /* Duplicate all the above for the server ... */ + appendBinaryStringInfo(&logbuf, clientbuf.data, clientbuf.len); + + /* ... and add info about query strings */ + for (i = 0; i < nDeadlockDetails; i++) + { + DEADLOCK_INFO *info = &deadlockDetails[i]; + + appendStringInfoChar(&logbuf, '\n'); + + appendStringInfo(&logbuf, + _("Process %d: %s"), + info->pid, + pgstat_get_backend_current_activity(info->pid, false)); + } + + pgstat_report_deadlock(); + + ereport(ERROR, + (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), + errmsg("deadlock detected"), + errdetail_internal("%s", clientbuf.data), + errdetail_log("%s", logbuf.data), + errhint("See server log for query details."))); +} + +/* + * RememberSimpleDeadLock: set up info for DeadLockReport when ProcSleep + * detects a trivial (two-way) deadlock. proc1 wants to block for lockmode + * on lock, but proc2 is already waiting and would be blocked by proc1. + */ +void +RememberSimpleDeadLock(PGPROC *proc1, + LOCKMODE lockmode, + LOCK *lock, + PGPROC *proc2) +{ + DEADLOCK_INFO *info = &deadlockDetails[0]; + + info->locktag = lock->tag; + info->lockmode = lockmode; + info->pid = proc1->pid; + info++; + info->locktag = proc2->waitLock->tag; + info->lockmode = proc2->waitLockMode; + info->pid = proc2->pid; + nDeadlockDetails = 2; +} diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl new file mode 100644 index 0000000..8a44946 --- /dev/null +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl +# +# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt +# Copyright (c) 2000-2021, PostgreSQL Global Development Group + +use strict; +use warnings; + +my $lastlockidx = -1; +my $continue = "\n"; + +open my $lwlocknames, '<', $ARGV[0] or die; + +# Include PID in suffix in case parallel make runs this multiple times. +my $htmp = "lwlocknames.h.tmp$$"; +my $ctmp = "lwlocknames.c.tmp$$"; +open my $h, '>', $htmp or die "Could not open $htmp: $!"; +open my $c, '>', $ctmp or die "Could not open $ctmp: $!"; + +my $autogen = + "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n"; +print $h $autogen; +print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; +print $c $autogen, "\n"; + +print $c "const char *const IndividualLWLockNames[] = {"; + +while (<$lwlocknames>) +{ + chomp; + + # Skip comments + next if /^#/; + next if /^\s*$/; + + die "unable to parse lwlocknames.txt" + unless /^(\w+)\s+(\d+)$/; + + (my $lockname, my $lockidx) = ($1, $2); + + my $trimmedlockname = $lockname; + $trimmedlockname =~ s/Lock$//; + die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname; + + die "lwlocknames.txt not in order" if $lockidx < $lastlockidx; + die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx; + + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + printf $c "%s \"<unassigned:%d>\"", $continue, $lastlockidx; + $continue = ",\n"; + } + printf $c "%s \"%s\"", $continue, $trimmedlockname; + $lastlockidx = $lockidx; + $continue = ",\n"; + + print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n"; +} + +printf $c "\n};\n"; +print $h "\n"; +printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; + +close $h; +close $c; + +rename($htmp, 'lwlocknames.h') || die "rename: $htmp: $!"; +rename($ctmp, 'lwlocknames.c') || die "rename: $ctmp: $!"; + +close $lwlocknames; diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c new file mode 100644 index 0000000..2db0424 --- /dev/null +++ b/src/backend/storage/lmgr/lmgr.c @@ -0,0 +1,1196 @@ +/*------------------------------------------------------------------------- + * + * lmgr.c + * POSTGRES lock manager code + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/lmgr.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "commands/progress.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "utils/inval.h" + + +/* + * Per-backend counter for generating speculative insertion tokens. + * + * This may wrap around, but that's OK as it's only used for the short + * duration between inserting a tuple and checking that there are no (unique) + * constraint violations. It's theoretically possible that a backend sees a + * tuple that was speculatively inserted by another backend, but before it has + * started waiting on the token, the other backend completes its insertion, + * and then performs 2^32 unrelated insertions. And after all that, the + * first backend finally calls SpeculativeInsertionLockAcquire(), with the + * intention of waiting for the first insertion to complete, but ends up + * waiting for the latest unrelated insertion instead. Even then, nothing + * particularly bad happens: in the worst case they deadlock, causing one of + * the transactions to abort. + */ +static uint32 speculativeInsertionToken = 0; + + +/* + * Struct to hold context info for transaction lock waits. + * + * 'oper' is the operation that needs to wait for the other transaction; 'rel' + * and 'ctid' specify the address of the tuple being waited for. + */ +typedef struct XactLockTableWaitInfo +{ + XLTW_Oper oper; + Relation rel; + ItemPointer ctid; +} XactLockTableWaitInfo; + +static void XactLockTableWaitErrorCb(void *arg); + +/* + * RelationInitLockInfo + * Initializes the lock information in a relation descriptor. + * + * relcache.c must call this during creation of any reldesc. + */ +void +RelationInitLockInfo(Relation relation) +{ + Assert(RelationIsValid(relation)); + Assert(OidIsValid(RelationGetRelid(relation))); + + relation->rd_lockInfo.lockRelId.relId = RelationGetRelid(relation); + + if (relation->rd_rel->relisshared) + relation->rd_lockInfo.lockRelId.dbId = InvalidOid; + else + relation->rd_lockInfo.lockRelId.dbId = MyDatabaseId; +} + +/* + * SetLocktagRelationOid + * Set up a locktag for a relation, given only relation OID + */ +static inline void +SetLocktagRelationOid(LOCKTAG *tag, Oid relid) +{ + Oid dbid; + + if (IsSharedRelation(relid)) + dbid = InvalidOid; + else + dbid = MyDatabaseId; + + SET_LOCKTAG_RELATION(*tag, dbid, relid); +} + +/* + * LockRelationOid + * + * Lock a relation given only its OID. This should generally be used + * before attempting to open the relation's relcache entry. + */ +void +LockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SetLocktagRelationOid(&tag, relid); + + res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock); + + /* + * Now that we have the lock, check for invalidation messages, so that we + * will update or flush any stale relcache entry before we try to use it. + * RangeVarGetRelid() specifically relies on us for this. We can skip + * this in the not-uncommon case that we already had the same type of lock + * being requested, since then no one else could have modified the + * relcache entry in an undesirable way. (In the case where our own xact + * modifies the rel, the relcache update happens via + * CommandCounterIncrement, not here.) + * + * However, in corner cases where code acts on tables (usually catalogs) + * recursively, we might get here while still processing invalidation + * messages in some outer execution of this function or a sibling. The + * "cleared" status of the lock tells us whether we really are done + * absorbing relevant inval messages. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } +} + +/* + * ConditionalLockRelationOid + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + * + * NOTE: we do not currently need conditional versions of all the + * LockXXX routines in this file, but they could easily be added if needed. + */ +bool +ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SetLocktagRelationOid(&tag, relid); + + res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock); + + if (res == LOCKACQUIRE_NOT_AVAIL) + return false; + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } + + return true; +} + +/* + * UnlockRelationId + * + * Unlock, given a LockRelId. This is preferred over UnlockRelationOid + * for speed reasons. + */ +void +UnlockRelationId(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * UnlockRelationOid + * + * Unlock, given only a relation Oid. Use UnlockRelationId if you can. + */ +void +UnlockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SetLocktagRelationOid(&tag, relid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockRelation + * + * This is a convenience routine for acquiring an additional lock on an + * already-open relation. Never try to do "relation_open(foo, NoLock)" + * and then lock with this. + */ +void +LockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock); + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } +} + +/* + * ConditionalLockRelation + * + * This is a convenience routine for acquiring an additional lock on an + * already-open relation. Never try to do "relation_open(foo, NoLock)" + * and then lock with this. + */ +bool +ConditionalLockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock); + + if (res == LOCKACQUIRE_NOT_AVAIL) + return false; + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } + + return true; +} + +/* + * UnlockRelation + * + * This is a convenience routine for unlocking a relation without also + * closing it. + */ +void +UnlockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * CheckRelationLockedByMe + * + * Returns true if current transaction holds a lock on 'relation' of mode + * 'lockmode'. If 'orstronger' is true, a stronger lockmode is also OK. + * ("Stronger" is defined as "numerically higher", which is a bit + * semantically dubious but is OK for the purposes we use this for.) + */ +bool +CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + if (LockHeldByMe(&tag, lockmode)) + return true; + + if (orstronger) + { + LOCKMODE slockmode; + + for (slockmode = lockmode + 1; + slockmode <= MaxLockMode; + slockmode++) + { + if (LockHeldByMe(&tag, slockmode)) + { +#ifdef NOT_USED + /* Sometimes this might be useful for debugging purposes */ + elog(WARNING, "lock mode %s substituted for %s on relation %s", + GetLockmodeName(tag.locktag_lockmethodid, slockmode), + GetLockmodeName(tag.locktag_lockmethodid, lockmode), + RelationGetRelationName(relation)); +#endif + return true; + } + } + } + + return false; +} + +/* + * LockHasWaitersRelation + * + * This is a function to check whether someone else is waiting for a + * lock which we are currently holding. + */ +bool +LockHasWaitersRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return LockHasWaiters(&tag, lockmode, false); +} + +/* + * LockRelationIdForSession + * + * This routine grabs a session-level lock on the target relation. The + * session lock persists across transaction boundaries. It will be removed + * when UnlockRelationIdForSession() is called, or if an ereport(ERROR) occurs, + * or if the backend exits. + * + * Note that one should also grab a transaction-level lock on the rel + * in any transaction that actually uses the rel, to ensure that the + * relcache entry is up to date. + */ +void +LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + (void) LockAcquire(&tag, lockmode, true, false); +} + +/* + * UnlockRelationIdForSession + */ +void +UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + LockRelease(&tag, lockmode, true); +} + +/* + * LockRelationForExtension + * + * This lock tag is used to interlock addition of pages to relations. + * We need such locking because bufmgr/smgr definition of P_NEW is not + * race-condition-proof. + * + * We assume the caller is already holding some type of regular lock on + * the relation, so no AcceptInvalidationMessages call is needed here. + */ +void +LockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockRelationForExtension + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * RelationExtensionLockWaiterCount + * + * Count the number of processes waiting for the given relation extension lock. + */ +int +RelationExtensionLockWaiterCount(Relation relation) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return LockWaiterCount(&tag); +} + +/* + * UnlockRelationForExtension + */ +void +UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). + */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * LockPage + * + * Obtain a page-level lock. This is currently used by some index access + * methods to lock individual index pages. + */ +void +LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockPage + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * UnlockPage + */ +void +UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockTuple + * + * Obtain a tuple-level lock. This is used in a less-than-intuitive fashion + * because we can't afford to keep a separate lock in shared memory for every + * tuple. See heap_lock_tuple before using this! + */ +void +LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockTuple + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * UnlockTuple + */ +void +UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + LockRelease(&tag, lockmode, false); +} + +/* + * XactLockTableInsert + * + * Insert a lock showing that the given transaction ID is running --- + * this is done when an XID is acquired by a transaction or subtransaction. + * The lock can then be used to wait for the transaction to finish. + */ +void +XactLockTableInsert(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_TRANSACTION(tag, xid); + + (void) LockAcquire(&tag, ExclusiveLock, false, false); +} + +/* + * XactLockTableDelete + * + * Delete the lock showing that the given transaction ID is running. + * (This is never used for main transaction IDs; those locks are only + * released implicitly at transaction end. But we do use it for subtrans IDs.) + */ +void +XactLockTableDelete(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_TRANSACTION(tag, xid); + + LockRelease(&tag, ExclusiveLock, false); +} + +/* + * XactLockTableWait + * + * Wait for the specified transaction to commit or abort. If an operation + * is specified, an error context callback is set up. If 'oper' is passed as + * None, no error context callback is set up. + * + * Note that this does the right thing for subtransactions: if we wait on a + * subtransaction, we will exit as soon as it aborts or its top parent commits. + * It takes some extra work to ensure this, because to save on shared memory + * the XID lock of a subtransaction is released when it ends, whether + * successfully or unsuccessfully. So we have to check if it's "still running" + * and if so wait for its parent. + */ +void +XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, + XLTW_Oper oper) +{ + LOCKTAG tag; + XactLockTableWaitInfo info; + ErrorContextCallback callback; + bool first = true; + + /* + * If an operation is specified, set up our verbose error context + * callback. + */ + if (oper != XLTW_None) + { + Assert(RelationIsValid(rel)); + Assert(ItemPointerIsValid(ctid)); + + info.rel = rel; + info.ctid = ctid; + info.oper = oper; + + callback.callback = XactLockTableWaitErrorCb; + callback.arg = &info; + callback.previous = error_context_stack; + error_context_stack = &callback; + } + + for (;;) + { + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + SET_LOCKTAG_TRANSACTION(tag, xid); + + (void) LockAcquire(&tag, ShareLock, false, false); + + LockRelease(&tag, ShareLock, false); + + if (!TransactionIdIsInProgress(xid)) + break; + + /* + * If the Xid belonged to a subtransaction, then the lock would have + * gone away as soon as it was finished; for correct tuple visibility, + * the right action is to wait on its parent transaction to go away. + * But instead of going levels up one by one, we can just wait for the + * topmost transaction to finish with the same end result, which also + * incurs less locktable traffic. + * + * Some uses of this function don't involve tuple visibility -- such + * as when building snapshots for logical decoding. It is possible to + * see a transaction in ProcArray before it registers itself in the + * locktable. The topmost transaction in that case is the same xid, + * so we try again after a short sleep. (Don't sleep the first time + * through, to avoid slowing down the normal case.) + */ + if (!first) + pg_usleep(1000L); + first = false; + xid = SubTransGetTopmostTransaction(xid); + } + + if (oper != XLTW_None) + error_context_stack = callback.previous; +} + +/* + * ConditionalXactLockTableWait + * + * As above, but only lock if we can get the lock without blocking. + * Returns true if the lock was acquired. + */ +bool +ConditionalXactLockTableWait(TransactionId xid) +{ + LOCKTAG tag; + bool first = true; + + for (;;) + { + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + SET_LOCKTAG_TRANSACTION(tag, xid); + + if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL) + return false; + + LockRelease(&tag, ShareLock, false); + + if (!TransactionIdIsInProgress(xid)) + break; + + /* See XactLockTableWait about this case */ + if (!first) + pg_usleep(1000L); + first = false; + xid = SubTransGetTopmostTransaction(xid); + } + + return true; +} + +/* + * SpeculativeInsertionLockAcquire + * + * Insert a lock showing that the given transaction ID is inserting a tuple, + * but hasn't yet decided whether it's going to keep it. The lock can then be + * used to wait for the decision to go ahead with the insertion, or aborting + * it. + * + * The token is used to distinguish multiple insertions by the same + * transaction. It is returned to caller. + */ +uint32 +SpeculativeInsertionLockAcquire(TransactionId xid) +{ + LOCKTAG tag; + + speculativeInsertionToken++; + + /* + * Check for wrap-around. Zero means no token is held, so don't use that. + */ + if (speculativeInsertionToken == 0) + speculativeInsertionToken = 1; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + (void) LockAcquire(&tag, ExclusiveLock, false, false); + + return speculativeInsertionToken; +} + +/* + * SpeculativeInsertionLockRelease + * + * Delete the lock showing that the given transaction is speculatively + * inserting a tuple. + */ +void +SpeculativeInsertionLockRelease(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + LockRelease(&tag, ExclusiveLock, false); +} + +/* + * SpeculativeInsertionWait + * + * Wait for the specified transaction to finish or abort the insertion of a + * tuple. + */ +void +SpeculativeInsertionWait(TransactionId xid, uint32 token) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token); + + Assert(TransactionIdIsValid(xid)); + Assert(token != 0); + + (void) LockAcquire(&tag, ShareLock, false, false); + LockRelease(&tag, ShareLock, false); +} + +/* + * XactLockTableWaitErrorCb + * Error context callback for transaction lock waits. + */ +static void +XactLockTableWaitErrorCb(void *arg) +{ + XactLockTableWaitInfo *info = (XactLockTableWaitInfo *) arg; + + /* + * We would like to print schema name too, but that would require a + * syscache lookup. + */ + if (info->oper != XLTW_None && + ItemPointerIsValid(info->ctid) && RelationIsValid(info->rel)) + { + const char *cxt; + + switch (info->oper) + { + case XLTW_Update: + cxt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_Delete: + cxt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_Lock: + cxt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_LockUpdated: + cxt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\""); + break; + case XLTW_InsertIndex: + cxt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_InsertIndexUnique: + cxt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_FetchUpdated: + cxt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_RecheckExclusionConstr: + cxt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\""); + break; + + default: + return; + } + + errcontext(cxt, + ItemPointerGetBlockNumber(info->ctid), + ItemPointerGetOffsetNumber(info->ctid), + RelationGetRelationName(info->rel)); + } +} + +/* + * WaitForLockersMultiple + * Wait until no transaction holds locks that conflict with the given + * locktags at the given lockmode. + * + * To do this, obtain the current list of lockers, and wait on their VXIDs + * until they are finished. + * + * Note we don't try to acquire the locks on the given locktags, only the + * VXIDs and XIDs of their lock holders; if somebody grabs a conflicting lock + * on the objects after we obtained our initial list of lockers, we will not + * wait for them. + */ +void +WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress) +{ + List *holders = NIL; + ListCell *lc; + int total = 0; + int done = 0; + + /* Done if no locks to wait for */ + if (list_length(locktags) == 0) + return; + + /* Collect the transactions we need to wait on */ + foreach(lc, locktags) + { + LOCKTAG *locktag = lfirst(lc); + int count; + + holders = lappend(holders, + GetLockConflicts(locktag, lockmode, + progress ? &count : NULL)); + if (progress) + total += count; + } + + if (progress) + pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, total); + + /* + * Note: GetLockConflicts() never reports our own xid, hence we need not + * check for that. Also, prepared xacts are reported and awaited. + */ + + /* Finally wait for each such transaction to complete */ + foreach(lc, holders) + { + VirtualTransactionId *lockholders = lfirst(lc); + + while (VirtualTransactionIdIsValid(*lockholders)) + { + /* If requested, publish who we're going to wait for. */ + if (progress) + { + PGPROC *holder = BackendIdGetProc(lockholders->backendId); + + if (holder) + pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID, + holder->pid); + } + VirtualXactLock(*lockholders, true); + lockholders++; + + if (progress) + pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++done); + } + } + if (progress) + { + const int index[] = { + PROGRESS_WAITFOR_TOTAL, + PROGRESS_WAITFOR_DONE, + PROGRESS_WAITFOR_CURRENT_PID + }; + const int64 values[] = { + 0, 0, 0 + }; + + pgstat_progress_update_multi_param(3, index, values); + } + + list_free_deep(holders); +} + +/* + * WaitForLockers + * + * Same as WaitForLockersMultiple, for a single lock tag. + */ +void +WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress) +{ + List *l; + + l = list_make1(&heaplocktag); + WaitForLockersMultiple(l, lockmode, progress); + list_free(l); +} + + +/* + * LockDatabaseObject + * + * Obtain a lock on a general object of the current database. Don't use + * this for shared objects (such as tablespaces). It's unwise to apply it + * to relations, also, since a lock taken this way will NOT conflict with + * locks taken via LockRelation and friends. + */ +void +LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + MyDatabaseId, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, false, false); + + /* Make sure syscaches are up-to-date with any changes we waited for */ + AcceptInvalidationMessages(); +} + +/* + * UnlockDatabaseObject + */ +void +UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + MyDatabaseId, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockSharedObject + * + * Obtain a lock on a shared-across-databases object. + */ +void +LockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, false, false); + + /* Make sure syscaches are up-to-date with any changes we waited for */ + AcceptInvalidationMessages(); +} + +/* + * UnlockSharedObject + */ +void +UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockSharedObjectForSession + * + * Obtain a session-level lock on a shared-across-databases object. + * See LockRelationIdForSession for notes about session-level locks. + */ +void +LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, true, false); +} + +/* + * UnlockSharedObjectForSession + */ +void +UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, true); +} + + +/* + * Append a description of a lockable object to buf. + * + * Ideally we would print names for the numeric values, but that requires + * getting locks on system tables, which might cause problems since this is + * typically used to report deadlock situations. + */ +void +DescribeLockTag(StringInfo buf, const LOCKTAG *tag) +{ + switch ((LockTagType) tag->locktag_type) + { + case LOCKTAG_RELATION: + appendStringInfo(buf, + _("relation %u of database %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_RELATION_EXTEND: + appendStringInfo(buf, + _("extension of relation %u of database %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; + case LOCKTAG_PAGE: + appendStringInfo(buf, + _("page %u of relation %u of database %u"), + tag->locktag_field3, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_TUPLE: + appendStringInfo(buf, + _("tuple (%u,%u) of relation %u of database %u"), + tag->locktag_field3, + tag->locktag_field4, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_TRANSACTION: + appendStringInfo(buf, + _("transaction %u"), + tag->locktag_field1); + break; + case LOCKTAG_VIRTUALTRANSACTION: + appendStringInfo(buf, + _("virtual transaction %d/%u"), + tag->locktag_field1, + tag->locktag_field2); + break; + case LOCKTAG_SPECULATIVE_TOKEN: + appendStringInfo(buf, + _("speculative token %u of transaction %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_OBJECT: + appendStringInfo(buf, + _("object %u of class %u of database %u"), + tag->locktag_field3, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_USERLOCK: + /* reserved for old contrib code, now on pgfoundry */ + appendStringInfo(buf, + _("user lock [%u,%u,%u]"), + tag->locktag_field1, + tag->locktag_field2, + tag->locktag_field3); + break; + case LOCKTAG_ADVISORY: + appendStringInfo(buf, + _("advisory lock [%u,%u,%u,%u]"), + tag->locktag_field1, + tag->locktag_field2, + tag->locktag_field3, + tag->locktag_field4); + break; + default: + appendStringInfo(buf, + _("unrecognized locktag type %d"), + (int) tag->locktag_type); + break; + } +} + +/* + * GetLockNameFromTagType + * + * Given locktag type, return the corresponding lock name. + */ +const char * +GetLockNameFromTagType(uint16 locktag_type) +{ + if (locktag_type > LOCKTAG_LAST_TYPE) + return "???"; + return LockTagTypeNames[locktag_type]; +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c new file mode 100644 index 0000000..818666f --- /dev/null +++ b/src/backend/storage/lmgr/lock.c @@ -0,0 +1,4738 @@ +/*------------------------------------------------------------------------- + * + * lock.c + * POSTGRES primary lock mechanism + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/lock.c + * + * NOTES + * A lock table is a shared memory hash table. When + * a process tries to acquire a lock of a type that conflicts + * with existing locks, it is put to sleep using the routines + * in storage/lmgr/proc.c. + * + * For the most part, this code should be invoked via lmgr.c + * or another lock-management module, not directly. + * + * Interface: + * + * InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(), + * LockAcquire(), LockRelease(), LockReleaseAll(), + * LockCheckConflicts(), GrantLock() + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/resowner_private.h" + + +/* This configuration variable is used to set the lock table size */ +int max_locks_per_xact; /* set by guc.c */ + +#define NLOCKENTS() \ + mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + + +/* + * Data structures defining the semantics of the standard lock methods. + * + * The conflict table defines the semantics of the various lock modes. + */ +static const LOCKMASK LockConflicts[] = { + 0, + + /* AccessShareLock */ + LOCKBIT_ON(AccessExclusiveLock), + + /* RowShareLock */ + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* RowExclusiveLock */ + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareUpdateExclusiveLock */ + LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareLock */ + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareRowExclusiveLock */ + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ExclusiveLock */ + LOCKBIT_ON(RowShareLock) | + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* AccessExclusiveLock */ + LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) | + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock) + +}; + +/* Names of lock modes, for debug printouts */ +static const char *const lock_mode_names[] = +{ + "INVALID", + "AccessShareLock", + "RowShareLock", + "RowExclusiveLock", + "ShareUpdateExclusiveLock", + "ShareLock", + "ShareRowExclusiveLock", + "ExclusiveLock", + "AccessExclusiveLock" +}; + +#ifndef LOCK_DEBUG +static bool Dummy_trace = false; +#endif + +static const LockMethodData default_lockmethod = { + AccessExclusiveLock, /* highest valid lock mode number */ + LockConflicts, + lock_mode_names, +#ifdef LOCK_DEBUG + &Trace_locks +#else + &Dummy_trace +#endif +}; + +static const LockMethodData user_lockmethod = { + AccessExclusiveLock, /* highest valid lock mode number */ + LockConflicts, + lock_mode_names, +#ifdef LOCK_DEBUG + &Trace_userlocks +#else + &Dummy_trace +#endif +}; + +/* + * map from lock method id to the lock table data structures + */ +static const LockMethod LockMethods[] = { + NULL, + &default_lockmethod, + &user_lockmethod +}; + + +/* Record that's written to 2PC state file when a lock is persisted */ +typedef struct TwoPhaseLockRecord +{ + LOCKTAG locktag; + LOCKMODE lockmode; +} TwoPhaseLockRecord; + + +/* + * Count of the number of fast path lock slots we believe to be used. This + * might be higher than the real number if another backend has transferred + * our locks to the primary lock table, but it can never be lower than the + * real value, since only we can acquire locks on our own behalf. + */ +static int FastPathLocalUseCount = 0; + +/* + * Flag to indicate if the relation extension lock is held by this backend. + * This flag is used to ensure that while holding the relation extension lock + * we don't try to acquire a heavyweight lock on any other object. This + * restriction implies that the relation extension lock won't ever participate + * in the deadlock cycle because we can never wait for any other heavyweight + * lock after acquiring this lock. + * + * Such a restriction is okay for relation extension locks as unlike other + * heavyweight locks these are not held till the transaction end. These are + * taken for a short duration to extend a particular relation and then + * released. + */ +static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false; + +/* + * Flag to indicate if the page lock is held by this backend. We don't + * acquire any other heavyweight lock while holding the page lock except for + * relation extension. However, these locks are never taken in reverse order + * which implies that page locks will also never participate in the deadlock + * cycle. + * + * Similar to relation extension, page locks are also held for a short + * duration, so imposing such a restriction won't hurt. + */ +static bool IsPageLockHeld PG_USED_FOR_ASSERTS_ONLY = false; + +/* Macros for manipulating proc->fpLockBits */ +#define FAST_PATH_BITS_PER_SLOT 3 +#define FAST_PATH_LOCKNUMBER_OFFSET 1 +#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1) +#define FAST_PATH_GET_BITS(proc, n) \ + (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK) +#define FAST_PATH_BIT_POSITION(n, l) \ + (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \ + ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n))) +#define FAST_PATH_SET_LOCKMODE(proc, n, l) \ + (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l) +#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \ + (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)) +#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \ + ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))) + +/* + * The fast-path lock mechanism is concerned only with relation locks on + * unshared relations by backends bound to a database. The fast-path + * mechanism exists mostly to accelerate acquisition and release of locks + * that rarely conflict. Because ShareUpdateExclusiveLock is + * self-conflicting, it can't use the fast-path mechanism; but it also does + * not conflict with any of the locks that do, so we can ignore it completely. + */ +#define EligibleForRelationFastPath(locktag, mode) \ + ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 == MyDatabaseId && \ + MyDatabaseId != InvalidOid && \ + (mode) < ShareUpdateExclusiveLock) +#define ConflictsWithRelationFastPath(locktag, mode) \ + ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 != InvalidOid && \ + (mode) > ShareUpdateExclusiveLock) + +static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode); +static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode); +static bool FastPathTransferRelationLocks(LockMethod lockMethodTable, + const LOCKTAG *locktag, uint32 hashcode); +static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock); + +/* + * To make the fast-path lock mechanism work, we must have some way of + * preventing the use of the fast-path when a conflicting lock might be present. + * We partition* the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS, + * and maintain an integer count of the number of "strong" lockers + * in each partition. When any "strong" lockers are present (which is + * hopefully not very often), the fast-path mechanism can't be used, and we + * must fall back to the slower method of pushing matching locks directly + * into the main lock tables. + * + * The deadlock detector does not know anything about the fast path mechanism, + * so any locks that might be involved in a deadlock must be transferred from + * the fast-path queues to the main lock table. + */ + +#define FAST_PATH_STRONG_LOCK_HASH_BITS 10 +#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \ + (1 << FAST_PATH_STRONG_LOCK_HASH_BITS) +#define FastPathStrongLockHashPartition(hashcode) \ + ((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS) + +typedef struct +{ + slock_t mutex; + uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS]; +} FastPathStrongRelationLockData; + +static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks; + + +/* + * Pointers to hash tables containing lock state + * + * The LockMethodLockHash and LockMethodProcLockHash hash tables are in + * shared memory; LockMethodLocalHash is local to each backend. + */ +static HTAB *LockMethodLockHash; +static HTAB *LockMethodProcLockHash; +static HTAB *LockMethodLocalHash; + + +/* private state for error cleanup */ +static LOCALLOCK *StrongLockInProgress; +static LOCALLOCK *awaitedLock; +static ResourceOwner awaitedOwner; + + +#ifdef LOCK_DEBUG + +/*------ + * The following configuration options are available for lock debugging: + * + * TRACE_LOCKS -- give a bunch of output what's going on in this file + * TRACE_USERLOCKS -- same but for user locks + * TRACE_LOCK_OIDMIN-- do not trace locks for tables below this oid + * (use to avoid output on system tables) + * TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally + * DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;) + * + * Furthermore, but in storage/lmgr/lwlock.c: + * TRACE_LWLOCKS -- trace lightweight locks (pretty useless) + * + * Define LOCK_DEBUG at compile time to get all these enabled. + * -------- + */ + +int Trace_lock_oidmin = FirstNormalObjectId; +bool Trace_locks = false; +bool Trace_userlocks = false; +int Trace_lock_table = 0; +bool Debug_deadlocks = false; + + +inline static bool +LOCK_DEBUG_ENABLED(const LOCKTAG *tag) +{ + return + (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) && + ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin)) + || (Trace_lock_table && + (tag->locktag_field2 == Trace_lock_table)); +} + + +inline static void +LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type) +{ + if (LOCK_DEBUG_ENABLED(&lock->tag)) + elog(LOG, + "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) " + "req(%d,%d,%d,%d,%d,%d,%d)=%d " + "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)", + where, lock, + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3, lock->tag.locktag_field4, + lock->tag.locktag_type, lock->tag.locktag_lockmethodid, + lock->grantMask, + lock->requested[1], lock->requested[2], lock->requested[3], + lock->requested[4], lock->requested[5], lock->requested[6], + lock->requested[7], lock->nRequested, + lock->granted[1], lock->granted[2], lock->granted[3], + lock->granted[4], lock->granted[5], lock->granted[6], + lock->granted[7], lock->nGranted, + lock->waitProcs.size, + LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]); +} + + +inline static void +PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP) +{ + if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag)) + elog(LOG, + "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)", + where, proclockP, proclockP->tag.myLock, + PROCLOCK_LOCKMETHOD(*(proclockP)), + proclockP->tag.myProc, (int) proclockP->holdMask); +} +#else /* not LOCK_DEBUG */ + +#define LOCK_PRINT(where, lock, type) ((void) 0) +#define PROCLOCK_PRINT(where, proclockP) ((void) 0) +#endif /* not LOCK_DEBUG */ + + +static uint32 proclock_hash(const void *key, Size keysize); +static void RemoveLocalLock(LOCALLOCK *locallock); +static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, + const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode); +static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner); +static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode); +static void FinishStrongLockAcquire(void); +static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner); +static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock); +static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent); +static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode, + PROCLOCK *proclock, LockMethod lockMethodTable); +static void CleanUpLock(LOCK *lock, PROCLOCK *proclock, + LockMethod lockMethodTable, uint32 hashcode, + bool wakeupNeeded); +static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, + LOCKTAG *locktag, LOCKMODE lockmode, + bool decrement_strong_lock_count); +static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc, + BlockedProcsData *data); + + +/* + * InitLocks -- Initialize the lock manager's data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here, as well as a locallock hash table that will remain + * unused and empty in the postmaster itself. Backends inherit the pointers + * to the shared tables via fork(), and also inherit an image of the locallock + * hash table, which they proceed to use. In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables and to create its locallock hash table. + */ +void +InitLocks(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + bool found; + + /* + * Compute init/max size to request for lock hashtables. Note these + * calculations must agree with LockShmemSize! + */ + max_table_size = NLOCKENTS(); + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for LOCK structs. This stores per-locked-object + * information. + */ + info.keysize = sizeof(LOCKTAG); + info.entrysize = sizeof(LOCK); + info.num_partitions = NUM_LOCK_PARTITIONS; + + LockMethodLockHash = ShmemInitHash("LOCK hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); + + /* Assume an average of 2 holders per lock */ + max_table_size *= 2; + init_table_size *= 2; + + /* + * Allocate hash table for PROCLOCK structs. This stores + * per-lock-per-holder information. + */ + info.keysize = sizeof(PROCLOCKTAG); + info.entrysize = sizeof(PROCLOCK); + info.hash = proclock_hash; + info.num_partitions = NUM_LOCK_PARTITIONS; + + LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + /* + * Allocate fast-path structures. + */ + FastPathStrongRelationLocks = + ShmemInitStruct("Fast Path Strong Relation Lock Data", + sizeof(FastPathStrongRelationLockData), &found); + if (!found) + SpinLockInit(&FastPathStrongRelationLocks->mutex); + + /* + * Allocate non-shared hash table for LOCALLOCK structs. This stores lock + * counts and resource owner information. + * + * The non-shared table could already exist in this process (this occurs + * when the postmaster is recreating shared memory after a backend crash). + * If so, delete and recreate it. (We could simply leave it, since it + * ought to be empty in the postmaster, but for safety let's zap it.) + */ + if (LockMethodLocalHash) + hash_destroy(LockMethodLocalHash); + + info.keysize = sizeof(LOCALLOCKTAG); + info.entrysize = sizeof(LOCALLOCK); + + LockMethodLocalHash = hash_create("LOCALLOCK hash", + 16, + &info, + HASH_ELEM | HASH_BLOBS); +} + + +/* + * Fetch the lock method table associated with a given lock + */ +LockMethod +GetLocksMethodTable(const LOCK *lock) +{ + LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*lock); + + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + return LockMethods[lockmethodid]; +} + +/* + * Fetch the lock method table associated with a given locktag + */ +LockMethod +GetLockTagsMethodTable(const LOCKTAG *locktag) +{ + LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid; + + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + return LockMethods[lockmethodid]; +} + + +/* + * Compute the hash code associated with a LOCKTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ +uint32 +LockTagHashCode(const LOCKTAG *locktag) +{ + return get_hash_value(LockMethodLockHash, (const void *) locktag); +} + +/* + * Compute the hash code associated with a PROCLOCKTAG. + * + * Because we want to use just one set of partition locks for both the + * LOCK and PROCLOCK hash tables, we have to make sure that PROCLOCKs + * fall into the same partition number as their associated LOCKs. + * dynahash.c expects the partition number to be the low-order bits of + * the hash code, and therefore a PROCLOCKTAG's hash code must have the + * same low-order bits as the associated LOCKTAG's hash code. We achieve + * this with this specialized hash function. + */ +static uint32 +proclock_hash(const void *key, Size keysize) +{ + const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key; + uint32 lockhash; + Datum procptr; + + Assert(keysize == sizeof(PROCLOCKTAG)); + + /* Look into the associated LOCK object, and compute its hash code */ + lockhash = LockTagHashCode(&proclocktag->myLock->tag); + + /* + * To make the hash code also depend on the PGPROC, we xor the proc + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ + procptr = PointerGetDatum(proclocktag->myProc); + lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + + return lockhash; +} + +/* + * Compute the hash code associated with a PROCLOCKTAG, given the hashcode + * for its underlying LOCK. + * + * We use this just to avoid redundant calls of LockTagHashCode(). + */ +static inline uint32 +ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) +{ + uint32 lockhash = hashcode; + Datum procptr; + + /* + * This must match proclock_hash()! + */ + procptr = PointerGetDatum(proclocktag->myProc); + lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + + return lockhash; +} + +/* + * Given two lock modes, return whether they would conflict. + */ +bool +DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) +{ + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + + if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2)) + return true; + + return false; +} + +/* + * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode' + * by the current transaction + */ +bool +LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode) +{ + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + + /* + * See if there is a LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + (void *) &localtag, + HASH_FIND, NULL); + + return (locallock && locallock->nLocks > 0); +} + +#ifdef USE_ASSERT_CHECKING +/* + * GetLockMethodLocalHash -- return the hash of local locks, for modules that + * evaluate assertions based on all locks held. + */ +HTAB * +GetLockMethodLocalHash(void) +{ + return LockMethodLocalHash; +} +#endif + +/* + * LockHasWaiters -- look up 'locktag' and check if releasing this + * lock would wake up other processes waiting for it. + */ +bool +LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool hasWaiters = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockHasWaiters: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* + * Find the LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + (void *) &localtag, + HASH_FIND, NULL); + + /* + * let the caller print its own error message, too. Do not ereport(ERROR). + */ + if (!locallock || locallock->nLocks <= 0) + { + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + + /* + * Check the shared lock table. + */ + partitionLock = LockHashPartitionLock(locallock->hashcode); + + LWLockAcquire(partitionLock, LW_SHARED); + + /* + * We don't need to re-find the lock or proclock, since we kept their + * addresses in the locallock table, and they couldn't have been removed + * while we were holding a lock on them. + */ + lock = locallock->lock; + LOCK_PRINT("LockHasWaiters: found", lock, lockmode); + proclock = locallock->proclock; + PROCLOCK_PRINT("LockHasWaiters: found", proclock); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + RemoveLocalLock(locallock); + return false; + } + + /* + * Do the checking. + */ + if ((lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0) + hasWaiters = true; + + LWLockRelease(partitionLock); + + return hasWaiters; +} + +/* + * LockAcquire -- Check for lock conflicts, sleep if conflict found, + * set lock if/when no conflicts. + * + * Inputs: + * locktag: unique identifier for the lockable object + * lockmode: lock mode to acquire + * sessionLock: if true, acquire lock for session not current transaction + * dontWait: if true, don't wait to acquire lock + * + * Returns one of: + * LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true + * LOCKACQUIRE_OK lock successfully acquired + * LOCKACQUIRE_ALREADY_HELD incremented count for lock already held + * LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear + * + * In the normal case where dontWait=false and the caller doesn't need to + * distinguish a freshly acquired lock from one already taken earlier in + * this same transaction, there is no need to examine the return value. + * + * Side Effects: The lock is acquired and recorded in lock tables. + * + * NOTE: if we wait for the lock, there is no way to abort the wait + * short of aborting the transaction. + */ +LockAcquireResult +LockAcquire(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait) +{ + return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, + true, NULL); +} + +/* + * LockAcquireExtended - allows us to specify additional options + * + * reportMemoryError specifies whether a lock request that fills the lock + * table should generate an ERROR or not. Passing "false" allows the caller + * to attempt to recover from lock-table-full situations, perhaps by forcibly + * canceling other lock holders and then retrying. Note, however, that the + * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use + * in combination with dontWait = true, as the cause of failure couldn't be + * distinguished. + * + * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK + * table entry if a lock is successfully acquired, or NULL if not. + */ +LockAcquireResult +LockAcquireExtended(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait, + bool reportMemoryError, + LOCALLOCK **locallockp) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + bool found; + ResourceOwner owner; + uint32 hashcode; + LWLock *partitionLock; + bool found_conflict; + bool log_lock = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + + if (RecoveryInProgress() && !InRecovery && + (locktag->locktag_type == LOCKTAG_OBJECT || + locktag->locktag_type == LOCKTAG_RELATION) && + lockmode > RowExclusiveLock) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot acquire lock mode %s on database objects while recovery is in progress", + lockMethodTable->lockModeNames[lockmode]), + errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery."))); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockAcquire: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* Identify owner for lock */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + /* + * Find or create a LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + (void *) &localtag, + HASH_ENTER, &found); + + /* + * if it's a new locallock object, initialize it + */ + if (!found) + { + locallock->lock = NULL; + locallock->proclock = NULL; + locallock->hashcode = LockTagHashCode(&(localtag.lock)); + locallock->nLocks = 0; + locallock->holdsStrongLockCount = false; + locallock->lockCleared = false; + locallock->numLockOwners = 0; + locallock->maxLockOwners = 8; + locallock->lockOwners = NULL; /* in case next line fails */ + locallock->lockOwners = (LOCALLOCKOWNER *) + MemoryContextAlloc(TopMemoryContext, + locallock->maxLockOwners * sizeof(LOCALLOCKOWNER)); + } + else + { + /* Make sure there will be room to remember the lock */ + if (locallock->numLockOwners >= locallock->maxLockOwners) + { + int newsize = locallock->maxLockOwners * 2; + + locallock->lockOwners = (LOCALLOCKOWNER *) + repalloc(locallock->lockOwners, + newsize * sizeof(LOCALLOCKOWNER)); + locallock->maxLockOwners = newsize; + } + } + hashcode = locallock->hashcode; + + if (locallockp) + *locallockp = locallock; + + /* + * If we already hold the lock, we can just increase the count locally. + * + * If lockCleared is already set, caller need not worry about absorbing + * sinval messages related to the lock's object. + */ + if (locallock->nLocks > 0) + { + GrantLockLocal(locallock, owner); + if (locallock->lockCleared) + return LOCKACQUIRE_ALREADY_CLEAR; + else + return LOCKACQUIRE_ALREADY_HELD; + } + + /* + * We don't acquire any other heavyweight lock while holding the relation + * extension lock. We do allow to acquire the same relation extension + * lock more than once but that case won't reach here. + */ + Assert(!IsRelationExtensionLockHeld); + + /* + * We don't acquire any other heavyweight lock while holding the page lock + * except for relation extension. + */ + Assert(!IsPageLockHeld || + (locktag->locktag_type == LOCKTAG_RELATION_EXTEND)); + + /* + * Prepare to emit a WAL record if acquisition of this lock needs to be + * replayed in a standby server. + * + * Here we prepare to log; after lock is acquired we'll issue log record. + * This arrangement simplifies error recovery in case the preparation step + * fails. + * + * Only AccessExclusiveLocks can conflict with lock types that read-only + * transactions can acquire in a standby server. Make sure this definition + * matches the one in GetRunningTransactionLocks(). + */ + if (lockmode >= AccessExclusiveLock && + locktag->locktag_type == LOCKTAG_RELATION && + !RecoveryInProgress() && + XLogStandbyInfoActive()) + { + LogAccessExclusiveLockPrepare(); + log_lock = true; + } + + /* + * Attempt to take lock via fast path, if eligible. But if we remember + * having filled up the fast path array, we don't attempt to make any + * further use of it until we release some locks. It's possible that some + * other backend has transferred some of those locks to the shared hash + * table, leaving space free, but it's not worth acquiring the LWLock just + * to check. It's also possible that we're acquiring a second or third + * lock type on a relation we have already locked using the fast-path, but + * for now we don't worry about that case either. + */ + if (EligibleForRelationFastPath(locktag, lockmode) && + FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + bool acquired; + + /* + * LWLockAcquire acts as a memory sequencing point, so it's safe to + * assume that any strong locker whose increment to + * FastPathStrongRelationLocks->counts becomes visible after we test + * it has yet to begin to transfer fast-path locks. + */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + if (FastPathStrongRelationLocks->count[fasthashcode] != 0) + acquired = false; + else + acquired = FastPathGrantRelationLock(locktag->locktag_field2, + lockmode); + LWLockRelease(&MyProc->fpInfoLock); + if (acquired) + { + /* + * The locallock might contain stale pointers to some old shared + * objects; we MUST reset these to null before considering the + * lock to be acquired via fast-path. + */ + locallock->lock = NULL; + locallock->proclock = NULL; + GrantLockLocal(locallock, owner); + return LOCKACQUIRE_OK; + } + } + + /* + * If this lock could potentially have been taken via the fast-path by + * some other backend, we must (temporarily) disable further use of the + * fast-path for this lock tag, and migrate any locks already taken via + * this method to the main lock table. + */ + if (ConflictsWithRelationFastPath(locktag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + BeginStrongLockAcquire(locallock, fasthashcode); + if (!FastPathTransferRelationLocks(lockMethodTable, locktag, + hashcode)) + { + AbortStrongLockAcquire(); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + if (reportMemoryError) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + else + return LOCKACQUIRE_NOT_AVAIL; + } + } + + /* + * We didn't find the lock in our LOCALLOCK table, and we didn't manage to + * take it via the fast-path, either, so we've got to mess with the shared + * lock table. + */ + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Find or create lock and proclock entries with this tag + * + * Note: if the locallock object already existed, it might have a pointer + * to the lock already ... but we should not assume that that pointer is + * valid, since a lock object with zero hold and request counts can go + * away anytime. So we have to use SetupLockInTable() to recompute the + * lock and proclock pointers, even if they're already set. + */ + proclock = SetupLockInTable(lockMethodTable, MyProc, locktag, + hashcode, lockmode); + if (!proclock) + { + AbortStrongLockAcquire(); + LWLockRelease(partitionLock); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + if (reportMemoryError) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + else + return LOCKACQUIRE_NOT_AVAIL; + } + locallock->proclock = proclock; + lock = proclock->tag.myLock; + locallock->lock = lock; + + /* + * If lock requested conflicts with locks requested by waiters, must join + * wait queue. Otherwise, check for conflict with already-held locks. + * (That's last because most complex check.) + */ + if (lockMethodTable->conflictTab[lockmode] & lock->waitMask) + found_conflict = true; + else + found_conflict = LockCheckConflicts(lockMethodTable, lockmode, + lock, proclock); + + if (!found_conflict) + { + /* No conflict with held or previously requested locks */ + GrantLock(lock, proclock, lockmode); + GrantLockLocal(locallock, owner); + } + else + { + /* + * We can't acquire the lock immediately. If caller specified no + * blocking, remove useless table entries and return + * LOCKACQUIRE_NOT_AVAIL without waiting. + */ + if (dontWait) + { + AbortStrongLockAcquire(); + if (proclock->holdMask == 0) + { + uint32 proclock_hashcode; + + proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); + SHMQueueDelete(&proclock->lockLink); + SHMQueueDelete(&proclock->procLink); + if (!hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &(proclock->tag), + proclock_hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "proclock table corrupted"); + } + else + PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock); + lock->nRequested--; + lock->requested[lockmode]--; + LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode); + Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + LWLockRelease(partitionLock); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + return LOCKACQUIRE_NOT_AVAIL; + } + + /* + * Set bitmask of locks this process already holds on this object. + */ + MyProc->heldLocks = proclock->holdMask; + + /* + * Sleep till someone wakes me up. + */ + + TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1, + locktag->locktag_field2, + locktag->locktag_field3, + locktag->locktag_field4, + locktag->locktag_type, + lockmode); + + WaitOnLock(locallock, owner); + + TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1, + locktag->locktag_field2, + locktag->locktag_field3, + locktag->locktag_field4, + locktag->locktag_type, + lockmode); + + /* + * NOTE: do not do any material change of state between here and + * return. All required changes in locktable state must have been + * done when the lock was granted to us --- see notes in WaitOnLock. + */ + + /* + * Check the proclock entry status, in case something in the ipc + * communication doesn't work correctly. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + AbortStrongLockAcquire(); + PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); + LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); + /* Should we retry ? */ + LWLockRelease(partitionLock); + elog(ERROR, "LockAcquire failed"); + } + PROCLOCK_PRINT("LockAcquire: granted", proclock); + LOCK_PRINT("LockAcquire: granted", lock, lockmode); + } + + /* + * Lock state is fully up-to-date now; if we error out after this, no + * special error cleanup is required. + */ + FinishStrongLockAcquire(); + + LWLockRelease(partitionLock); + + /* + * Emit a WAL record if acquisition of this lock needs to be replayed in a + * standby server. + */ + if (log_lock) + { + /* + * Decode the locktag back to the original values, to avoid sending + * lots of empty bytes with every message. See lock.h to check how a + * locktag is defined for LOCKTAG_RELATION + */ + LogAccessExclusiveLock(locktag->locktag_field1, + locktag->locktag_field2); + } + + return LOCKACQUIRE_OK; +} + +/* + * Find or create LOCK and PROCLOCK objects as needed for a new lock + * request. + * + * Returns the PROCLOCK object, or NULL if we failed to create the objects + * for lack of shared memory. + * + * The appropriate partition lock must be held at entry, and will be + * held at exit. + */ +static PROCLOCK * +SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, + const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode) +{ + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + uint32 proclock_hashcode; + bool found; + + /* + * Find or create a lock with this tag. + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (const void *) locktag, + hashcode, + HASH_ENTER_NULL, + &found); + if (!lock) + return NULL; + + /* + * if it's a new lock object, initialize it + */ + if (!found) + { + lock->grantMask = 0; + lock->waitMask = 0; + SHMQueueInit(&(lock->procLocks)); + ProcQueueInit(&(lock->waitProcs)); + lock->nRequested = 0; + lock->nGranted = 0; + MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES); + MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES); + LOCK_PRINT("LockAcquire: new", lock, lockmode); + } + else + { + LOCK_PRINT("LockAcquire: found", lock, lockmode); + Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0)); + Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + } + + /* + * Create the hash key for the proclock table. + */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + /* + * Find or create a proclock entry with this tag + */ + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); + if (!proclock) + { + /* Oops, not enough shmem for the proclock */ + if (lock->nRequested == 0) + { + /* + * There are no other requestors of this lock, so garbage-collect + * the lock object. We *must* do this to avoid a permanent leak + * of shared memory, because there won't be anything to cause + * anyone to release the lock object later. + */ + Assert(SHMQueueEmpty(&(lock->procLocks))); + if (!hash_search_with_hash_value(LockMethodLockHash, + (void *) &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + return NULL; + } + + /* + * If new, initialize the new entry + */ + if (!found) + { + uint32 partition = LockHashPartition(hashcode); + + /* + * It might seem unsafe to access proclock->groupLeader without a + * lock, but it's not really. Either we are initializing a proclock + * on our own behalf, in which case our group leader isn't changing + * because the group leader for a process can only ever be changed by + * the process itself; or else we are transferring a fast-path lock to + * the main lock table, in which case that process can't change it's + * lock group leader without first releasing all of its locks (and in + * particular the one we are currently transferring). + */ + proclock->groupLeader = proc->lockGroupLeader != NULL ? + proc->lockGroupLeader : proc; + proclock->holdMask = 0; + proclock->releaseMask = 0; + /* Add proclock to appropriate lists */ + SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink); + SHMQueueInsertBefore(&(proc->myProcLocks[partition]), + &proclock->procLink); + PROCLOCK_PRINT("LockAcquire: new", proclock); + } + else + { + PROCLOCK_PRINT("LockAcquire: found", proclock); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + +#ifdef CHECK_DEADLOCK_RISK + + /* + * Issue warning if we already hold a lower-level lock on this object + * and do not hold a lock of the requested level or higher. This + * indicates a deadlock-prone coding practice (eg, we'd have a + * deadlock if another backend were following the same code path at + * about the same time). + * + * This is not enabled by default, because it may generate log entries + * about user-level coding practices that are in fact safe in context. + * It can be enabled to help find system-level problems. + * + * XXX Doing numeric comparison on the lockmodes is a hack; it'd be + * better to use a table. For now, though, this works. + */ + { + int i; + + for (i = lockMethodTable->numLockModes; i > 0; i--) + { + if (proclock->holdMask & LOCKBIT_ON(i)) + { + if (i >= (int) lockmode) + break; /* safe: we have a lock >= req level */ + elog(LOG, "deadlock risk: raising lock level" + " from %s to %s on object %u/%u/%u", + lockMethodTable->lockModeNames[i], + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + break; + } + } + } +#endif /* CHECK_DEADLOCK_RISK */ + } + + /* + * lock->nRequested and lock->requested[] count the total number of + * requests, whether granted or waiting, so increment those immediately. + * The other counts don't increment till we get the lock. + */ + lock->nRequested++; + lock->requested[lockmode]++; + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + + /* + * We shouldn't already hold the desired lock; else locallock table is + * broken. + */ + if (proclock->holdMask & LOCKBIT_ON(lockmode)) + elog(ERROR, "lock %s on object %u/%u/%u is already held", + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + + return proclock; +} + +/* + * Check and set/reset the flag that we hold the relation extension/page lock. + * + * It is callers responsibility that this function is called after + * acquiring/releasing the relation extension/page lock. + * + * Pass acquired as true if lock is acquired, false otherwise. + */ +static inline void +CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired) +{ +#ifdef USE_ASSERT_CHECKING + if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_RELATION_EXTEND) + IsRelationExtensionLockHeld = acquired; + else if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_PAGE) + IsPageLockHeld = acquired; + +#endif +} + +/* + * Subroutine to free a locallock entry + */ +static void +RemoveLocalLock(LOCALLOCK *locallock) +{ + int i; + + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (locallock->lockOwners[i].owner != NULL) + ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock); + } + locallock->numLockOwners = 0; + if (locallock->lockOwners != NULL) + pfree(locallock->lockOwners); + locallock->lockOwners = NULL; + + if (locallock->holdsStrongLockCount) + { + uint32 fasthashcode; + + fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + locallock->holdsStrongLockCount = false; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } + + if (!hash_search(LockMethodLocalHash, + (void *) &(locallock->tag), + HASH_REMOVE, NULL)) + elog(WARNING, "locallock table corrupted"); + + /* + * Indicate that the lock is released for certain types of locks + */ + CheckAndSetLockHeld(locallock, false); +} + +/* + * LockCheckConflicts -- test whether requested lock conflicts + * with those already granted + * + * Returns true if conflict, false if no conflict. + * + * NOTES: + * Here's what makes this complicated: one process's locks don't + * conflict with one another, no matter what purpose they are held for + * (eg, session and transaction locks do not conflict). Nor do the locks + * of one process in a lock group conflict with those of another process in + * the same group. So, we must subtract off these locks when determining + * whether the requested new lock conflicts with those already held. + */ +bool +LockCheckConflicts(LockMethod lockMethodTable, + LOCKMODE lockmode, + LOCK *lock, + PROCLOCK *proclock) +{ + int numLockModes = lockMethodTable->numLockModes; + LOCKMASK myLocks; + int conflictMask = lockMethodTable->conflictTab[lockmode]; + int conflictsRemaining[MAX_LOCKMODES]; + int totalConflictsRemaining = 0; + int i; + SHM_QUEUE *procLocks; + PROCLOCK *otherproclock; + + /* + * first check for global conflicts: If no locks conflict with my request, + * then I get the lock. + * + * Checking for conflict: lock->grantMask represents the types of + * currently held locks. conflictTable[lockmode] has a bit set for each + * type of lock that conflicts with request. Bitwise compare tells if + * there is a conflict. + */ + if (!(conflictMask & lock->grantMask)) + { + PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock); + return false; + } + + /* + * Rats. Something conflicts. But it could still be my own lock, or a + * lock held by another member of my locking group. First, figure out how + * many conflicts remain after subtracting out any locks I hold myself. + */ + myLocks = proclock->holdMask; + for (i = 1; i <= numLockModes; i++) + { + if ((conflictMask & LOCKBIT_ON(i)) == 0) + { + conflictsRemaining[i] = 0; + continue; + } + conflictsRemaining[i] = lock->granted[i]; + if (myLocks & LOCKBIT_ON(i)) + --conflictsRemaining[i]; + totalConflictsRemaining += conflictsRemaining[i]; + } + + /* If no conflicts remain, we get the lock. */ + if (totalConflictsRemaining == 0) + { + PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock); + return false; + } + + /* If no group locking, it's definitely a conflict. */ + if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL) + { + Assert(proclock->tag.myProc == MyProc); + PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)", + proclock); + return true; + } + + /* + * The relation extension or page lock conflict even between the group + * members. + */ + if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND || + (LOCK_LOCKTAG(*lock) == LOCKTAG_PAGE)) + { + PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", + proclock); + return true; + } + + /* + * Locks held in conflicting modes by members of our own lock group are + * not real conflicts; we can subtract those out and see if we still have + * a conflict. This is O(N) in the number of processes holding or + * awaiting locks on this object. We could improve that by making the + * shared memory state more complex (and larger) but it doesn't seem worth + * it. + */ + procLocks = &(lock->procLocks); + otherproclock = (PROCLOCK *) + SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink)); + while (otherproclock != NULL) + { + if (proclock != otherproclock && + proclock->groupLeader == otherproclock->groupLeader && + (otherproclock->holdMask & conflictMask) != 0) + { + int intersectMask = otherproclock->holdMask & conflictMask; + + for (i = 1; i <= numLockModes; i++) + { + if ((intersectMask & LOCKBIT_ON(i)) != 0) + { + if (conflictsRemaining[i] <= 0) + elog(PANIC, "proclocks held do not match lock"); + conflictsRemaining[i]--; + totalConflictsRemaining--; + } + } + + if (totalConflictsRemaining == 0) + { + PROCLOCK_PRINT("LockCheckConflicts: resolved (group)", + proclock); + return false; + } + } + otherproclock = (PROCLOCK *) + SHMQueueNext(procLocks, &otherproclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + /* Nope, it's a real conflict. */ + PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock); + return true; +} + +/* + * GrantLock -- update the lock and proclock data structures to show + * the lock request has been granted. + * + * NOTE: if proc was blocked, it also needs to be removed from the wait list + * and have its waitLock/waitProcLock fields cleared. That's not done here. + * + * NOTE: the lock grant also has to be recorded in the associated LOCALLOCK + * table entry; but since we may be awaking some other process, we can't do + * that here; it's done by GrantLockLocal, instead. + */ +void +GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode) +{ + lock->nGranted++; + lock->granted[lockmode]++; + lock->grantMask |= LOCKBIT_ON(lockmode); + if (lock->granted[lockmode] == lock->requested[lockmode]) + lock->waitMask &= LOCKBIT_OFF(lockmode); + proclock->holdMask |= LOCKBIT_ON(lockmode); + LOCK_PRINT("GrantLock", lock, lockmode); + Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0)); + Assert(lock->nGranted <= lock->nRequested); +} + +/* + * UnGrantLock -- opposite of GrantLock. + * + * Updates the lock and proclock data structures to show that the lock + * is no longer held nor requested by the current holder. + * + * Returns true if there were any waiters waiting on the lock that + * should now be woken up with ProcLockWakeup. + */ +static bool +UnGrantLock(LOCK *lock, LOCKMODE lockmode, + PROCLOCK *proclock, LockMethod lockMethodTable) +{ + bool wakeupNeeded = false; + + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0)); + Assert(lock->nGranted <= lock->nRequested); + + /* + * fix the general lock stats + */ + lock->nRequested--; + lock->requested[lockmode]--; + lock->nGranted--; + lock->granted[lockmode]--; + + if (lock->granted[lockmode] == 0) + { + /* change the conflict mask. No more of this lock type. */ + lock->grantMask &= LOCKBIT_OFF(lockmode); + } + + LOCK_PRINT("UnGrantLock: updated", lock, lockmode); + + /* + * We need only run ProcLockWakeup if the released lock conflicts with at + * least one of the lock types requested by waiter(s). Otherwise whatever + * conflict made them wait must still exist. NOTE: before MVCC, we could + * skip wakeup if lock->granted[lockmode] was still positive. But that's + * not true anymore, because the remaining granted locks might belong to + * some waiter, who could now be awakened because he doesn't conflict with + * his own locks. + */ + if (lockMethodTable->conflictTab[lockmode] & lock->waitMask) + wakeupNeeded = true; + + /* + * Now fix the per-proclock state. + */ + proclock->holdMask &= LOCKBIT_OFF(lockmode); + PROCLOCK_PRINT("UnGrantLock: updated", proclock); + + return wakeupNeeded; +} + +/* + * CleanUpLock -- clean up after releasing a lock. We garbage-collect the + * proclock and lock objects if possible, and call ProcLockWakeup if there + * are remaining requests and the caller says it's OK. (Normally, this + * should be called after UnGrantLock, and wakeupNeeded is the result from + * UnGrantLock.) + * + * The appropriate partition lock must be held at entry, and will be + * held at exit. + */ +static void +CleanUpLock(LOCK *lock, PROCLOCK *proclock, + LockMethod lockMethodTable, uint32 hashcode, + bool wakeupNeeded) +{ + /* + * If this was my last hold on this lock, delete my entry in the proclock + * table. + */ + if (proclock->holdMask == 0) + { + uint32 proclock_hashcode; + + PROCLOCK_PRINT("CleanUpLock: deleting", proclock); + SHMQueueDelete(&proclock->lockLink); + SHMQueueDelete(&proclock->procLink); + proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); + if (!hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &(proclock->tag), + proclock_hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "proclock table corrupted"); + } + + if (lock->nRequested == 0) + { + /* + * The caller just released the last lock, so garbage-collect the lock + * object. + */ + LOCK_PRINT("CleanUpLock: deleting", lock, 0); + Assert(SHMQueueEmpty(&(lock->procLocks))); + if (!hash_search_with_hash_value(LockMethodLockHash, + (void *) &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + else if (wakeupNeeded) + { + /* There are waiters on this lock, so wake them up. */ + ProcLockWakeup(lockMethodTable, lock); + } +} + +/* + * GrantLockLocal -- update the locallock data structures to show + * the lock request has been granted. + * + * We expect that LockAcquire made sure there is room to add a new + * ResourceOwner entry. + */ +static void +GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner) +{ + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + int i; + + Assert(locallock->numLockOwners < locallock->maxLockOwners); + /* Count the total */ + locallock->nLocks++; + /* Count the per-owner lock */ + for (i = 0; i < locallock->numLockOwners; i++) + { + if (lockOwners[i].owner == owner) + { + lockOwners[i].nLocks++; + return; + } + } + lockOwners[i].owner = owner; + lockOwners[i].nLocks = 1; + locallock->numLockOwners++; + if (owner != NULL) + ResourceOwnerRememberLock(owner, locallock); + + /* Indicate that the lock is acquired for certain types of locks. */ + CheckAndSetLockHeld(locallock, true); +} + +/* + * BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK, + * and arrange for error cleanup if it fails + */ +static void +BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode) +{ + Assert(StrongLockInProgress == NULL); + Assert(locallock->holdsStrongLockCount == false); + + /* + * Adding to a memory location is not atomic, so we take a spinlock to + * ensure we don't collide with someone else trying to bump the count at + * the same time. + * + * XXX: It might be worth considering using an atomic fetch-and-add + * instruction here, on architectures where that is supported. + */ + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + FastPathStrongRelationLocks->count[fasthashcode]++; + locallock->holdsStrongLockCount = true; + StrongLockInProgress = locallock; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); +} + +/* + * FinishStrongLockAcquire - cancel pending cleanup for a strong lock + * acquisition once it's no longer needed + */ +static void +FinishStrongLockAcquire(void) +{ + StrongLockInProgress = NULL; +} + +/* + * AbortStrongLockAcquire - undo strong lock state changes performed by + * BeginStrongLockAcquire. + */ +void +AbortStrongLockAcquire(void) +{ + uint32 fasthashcode; + LOCALLOCK *locallock = StrongLockInProgress; + + if (locallock == NULL) + return; + + fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode); + Assert(locallock->holdsStrongLockCount == true); + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + locallock->holdsStrongLockCount = false; + StrongLockInProgress = NULL; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); +} + +/* + * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing + * WaitOnLock on. + * + * proc.c needs this for the case where we are booted off the lock by + * timeout, but discover that someone granted us the lock anyway. + * + * We could just export GrantLockLocal, but that would require including + * resowner.h in lock.h, which creates circularity. + */ +void +GrantAwaitedLock(void) +{ + GrantLockLocal(awaitedLock, awaitedOwner); +} + +/* + * MarkLockClear -- mark an acquired lock as "clear" + * + * This means that we know we have absorbed all sinval messages that other + * sessions generated before we acquired this lock, and so we can confidently + * assume we know about any catalog changes protected by this lock. + */ +void +MarkLockClear(LOCALLOCK *locallock) +{ + Assert(locallock->nLocks > 0); + locallock->lockCleared = true; +} + +/* + * WaitOnLock -- wait to acquire a lock + * + * Caller must have set MyProc->heldLocks to reflect locks already held + * on the lockable object by this process. + * + * The appropriate partition lock must be held at entry. + */ +static void +WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) +{ + LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock); + LockMethod lockMethodTable = LockMethods[lockmethodid]; + char *volatile new_status = NULL; + + LOCK_PRINT("WaitOnLock: sleeping on lock", + locallock->lock, locallock->tag.mode); + + /* Report change to waiting status */ + if (update_process_title) + { + const char *old_status; + int len; + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 8 + 1); + memcpy(new_status, old_status, len); + strcpy(new_status + len, " waiting"); + set_ps_display(new_status); + new_status[len] = '\0'; /* truncate off " waiting" */ + } + + awaitedLock = locallock; + awaitedOwner = owner; + + /* + * NOTE: Think not to put any shared-state cleanup after the call to + * ProcSleep, in either the normal or failure path. The lock state must + * be fully set by the lock grantor, or by CheckDeadLock if we give up + * waiting for the lock. This is necessary because of the possibility + * that a cancel/die interrupt will interrupt ProcSleep after someone else + * grants us the lock, but before we've noticed it. Hence, after granting, + * the locktable state must fully reflect the fact that we own the lock; + * we can't do additional work on return. + * + * We can and do use a PG_TRY block to try to clean up after failure, but + * this still has a major limitation: elog(FATAL) can occur while waiting + * (eg, a "die" interrupt), and then control won't come back here. So all + * cleanup of essential state should happen in LockErrorCleanup, not here. + * We can use PG_TRY to clear the "waiting" status flags, since doing that + * is unimportant if the process exits. + */ + PG_TRY(); + { + if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK) + { + /* + * We failed as a result of a deadlock, see CheckDeadLock(). Quit + * now. + */ + awaitedLock = NULL; + LOCK_PRINT("WaitOnLock: aborting on lock", + locallock->lock, locallock->tag.mode); + LWLockRelease(LockHashPartitionLock(locallock->hashcode)); + + /* + * Now that we aren't holding the partition lock, we can give an + * error report including details about the detected deadlock. + */ + DeadLockReport(); + /* not reached */ + } + } + PG_CATCH(); + { + /* In this path, awaitedLock remains set until LockErrorCleanup */ + + /* Report change to non-waiting status */ + if (update_process_title) + { + set_ps_display(new_status); + pfree(new_status); + } + + /* and propagate the error */ + PG_RE_THROW(); + } + PG_END_TRY(); + + awaitedLock = NULL; + + /* Report change to non-waiting status */ + if (update_process_title) + { + set_ps_display(new_status); + pfree(new_status); + } + + LOCK_PRINT("WaitOnLock: wakeup on lock", + locallock->lock, locallock->tag.mode); +} + +/* + * Remove a proc from the wait-queue it is on (caller must know it is on one). + * This is only used when the proc has failed to get the lock, so we set its + * waitStatus to PROC_WAIT_STATUS_ERROR. + * + * Appropriate partition lock must be held by caller. Also, caller is + * responsible for signaling the proc if needed. + * + * NB: this does not clean up any locallock object that may exist for the lock. + */ +void +RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode) +{ + LOCK *waitLock = proc->waitLock; + PROCLOCK *proclock = proc->waitProcLock; + LOCKMODE lockmode = proc->waitLockMode; + LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock); + + /* Make sure proc is waiting */ + Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING); + Assert(proc->links.next != NULL); + Assert(waitLock); + Assert(waitLock->waitProcs.size > 0); + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + + /* Remove proc from lock's wait queue */ + SHMQueueDelete(&(proc->links)); + waitLock->waitProcs.size--; + + /* Undo increments of request counts by waiting process */ + Assert(waitLock->nRequested > 0); + Assert(waitLock->nRequested > proc->waitLock->nGranted); + waitLock->nRequested--; + Assert(waitLock->requested[lockmode] > 0); + waitLock->requested[lockmode]--; + /* don't forget to clear waitMask bit if appropriate */ + if (waitLock->granted[lockmode] == waitLock->requested[lockmode]) + waitLock->waitMask &= LOCKBIT_OFF(lockmode); + + /* Clean up the proc's own state, and pass it the ok/fail signal */ + proc->waitLock = NULL; + proc->waitProcLock = NULL; + proc->waitStatus = PROC_WAIT_STATUS_ERROR; + + /* + * Delete the proclock immediately if it represents no already-held locks. + * (This must happen now because if the owner of the lock decides to + * release it, and the requested/granted counts then go to zero, + * LockRelease expects there to be no remaining proclocks.) Then see if + * any other waiters for the lock can be woken up now. + */ + CleanUpLock(waitLock, proclock, + LockMethods[lockmethodid], hashcode, + true); +} + +/* + * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it. + * Release a session lock if 'sessionLock' is true, else release a + * regular transaction lock. + * + * Side Effects: find any waiting processes that are now wakable, + * grant them their requested locks and awaken them. + * (We have to grant the lock here to avoid a race between + * the waking process and any new process to + * come along and request the lock.) + */ +bool +LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool wakeupNeeded; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockRelease: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* + * Find the LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + (void *) &localtag, + HASH_FIND, NULL); + + /* + * let the caller print its own error message, too. Do not ereport(ERROR). + */ + if (!locallock || locallock->nLocks <= 0) + { + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + + /* + * Decrease the count for the resource owner. + */ + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + ResourceOwner owner; + int i; + + /* Identify owner for lock */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == owner) + { + Assert(lockOwners[i].nLocks > 0); + if (--lockOwners[i].nLocks == 0) + { + if (owner != NULL) + ResourceOwnerForgetLock(owner, locallock); + /* compact out unused slot */ + locallock->numLockOwners--; + if (i < locallock->numLockOwners) + lockOwners[i] = lockOwners[locallock->numLockOwners]; + } + break; + } + } + if (i < 0) + { + /* don't release a lock belonging to another owner */ + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + } + + /* + * Decrease the total local count. If we're still holding the lock, we're + * done. + */ + locallock->nLocks--; + + if (locallock->nLocks > 0) + return true; + + /* + * At this point we can no longer suppose we are clear of invalidation + * messages related to this lock. Although we'll delete the LOCALLOCK + * object before any intentional return from this routine, it seems worth + * the trouble to explicitly reset lockCleared right now, just in case + * some error prevents us from deleting the LOCALLOCK. + */ + locallock->lockCleared = false; + + /* Attempt fast release of any lock eligible for the fast path. */ + if (EligibleForRelationFastPath(locktag, lockmode) && + FastPathLocalUseCount > 0) + { + bool released; + + /* + * We might not find the lock here, even if we originally entered it + * here. Another backend may have moved it to the main table. + */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + released = FastPathUnGrantRelationLock(locktag->locktag_field2, + lockmode); + LWLockRelease(&MyProc->fpInfoLock); + if (released) + { + RemoveLocalLock(locallock); + return true; + } + } + + /* + * Otherwise we've got to mess with the shared lock table. + */ + partitionLock = LockHashPartitionLock(locallock->hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Normally, we don't need to re-find the lock or proclock, since we kept + * their addresses in the locallock table, and they couldn't have been + * removed while we were holding a lock on them. But it's possible that + * the lock was taken fast-path and has since been moved to the main hash + * table by another backend, in which case we will need to look up the + * objects here. We assume the lock field is NULL if so. + */ + lock = locallock->lock; + if (!lock) + { + PROCLOCKTAG proclocktag; + + Assert(EligibleForRelationFastPath(locktag, lockmode)); + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (const void *) locktag, + locallock->hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(ERROR, "failed to re-find shared lock object"); + locallock->lock = lock; + + proclocktag.myLock = lock; + proclocktag.myProc = MyProc; + locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash, + (void *) &proclocktag, + HASH_FIND, + NULL); + if (!locallock->proclock) + elog(ERROR, "failed to re-find shared proclock object"); + } + LOCK_PRINT("LockRelease: found", lock, lockmode); + proclock = locallock->proclock; + PROCLOCK_PRINT("LockRelease: found", proclock); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + RemoveLocalLock(locallock); + return false; + } + + /* + * Do the releasing. CleanUpLock will waken any now-wakable waiters. + */ + wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable); + + CleanUpLock(lock, proclock, + lockMethodTable, locallock->hashcode, + wakeupNeeded); + + LWLockRelease(partitionLock); + + RemoveLocalLock(locallock); + return true; +} + +/* + * LockReleaseAll -- Release all locks of the specified lock method that + * are held by the current process. + * + * Well, not necessarily *all* locks. The available behaviors are: + * allLocks == true: release all locks including session locks. + * allLocks == false: release all non-session locks. + */ +void +LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) +{ + HASH_SEQ_STATUS status; + LockMethod lockMethodTable; + int i, + numLockModes; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + int partition; + bool have_fast_path_lwlock = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + +#ifdef LOCK_DEBUG + if (*(lockMethodTable->trace_flag)) + elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid); +#endif + + /* + * Get rid of our fast-path VXID lock, if appropriate. Note that this is + * the only way that the lock we hold on our own VXID can ever get + * released: it is always and only released when a toplevel transaction + * ends. + */ + if (lockmethodid == DEFAULT_LOCKMETHOD) + VirtualXactLockTableCleanup(); + + numLockModes = lockMethodTable->numLockModes; + + /* + * First we run through the locallock table and get rid of unwanted + * entries, then we scan the process's proclocks and get rid of those. We + * do this separately because we may have multiple locallock entries + * pointing to the same proclock, and we daren't end up with any dangling + * pointers. Fast-path locks are cleaned up during the locallock table + * scan, though. + */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + /* + * If the LOCALLOCK entry is unused, we must've run out of shared + * memory while trying to set up this lock. Just forget the local + * entry. + */ + if (locallock->nLocks == 0) + { + RemoveLocalLock(locallock); + continue; + } + + /* Ignore items that are not of the lockmethod to be removed */ + if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid) + continue; + + /* + * If we are asked to release all locks, we can just zap the entry. + * Otherwise, must scan to see if there are session locks. We assume + * there is at most one lockOwners entry for session locks. + */ + if (!allLocks) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + + /* If session lock is above array position 0, move it down to 0 */ + for (i = 0; i < locallock->numLockOwners; i++) + { + if (lockOwners[i].owner == NULL) + lockOwners[0] = lockOwners[i]; + else + ResourceOwnerForgetLock(lockOwners[i].owner, locallock); + } + + if (locallock->numLockOwners > 0 && + lockOwners[0].owner == NULL && + lockOwners[0].nLocks > 0) + { + /* Fix the locallock to show just the session locks */ + locallock->nLocks = lockOwners[0].nLocks; + locallock->numLockOwners = 1; + /* We aren't deleting this locallock, so done */ + continue; + } + else + locallock->numLockOwners = 0; + } + + /* + * If the lock or proclock pointers are NULL, this lock was taken via + * the relation fast-path (and is not known to have been transferred). + */ + if (locallock->proclock == NULL || locallock->lock == NULL) + { + LOCKMODE lockmode = locallock->tag.mode; + Oid relid; + + /* Verify that a fast-path lock is what we've got. */ + if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode)) + elog(PANIC, "locallock table corrupted"); + + /* + * If we don't currently hold the LWLock that protects our + * fast-path data structures, we must acquire it before attempting + * to release the lock via the fast-path. We will continue to + * hold the LWLock until we're done scanning the locallock table, + * unless we hit a transferred fast-path lock. (XXX is this + * really such a good idea? There could be a lot of entries ...) + */ + if (!have_fast_path_lwlock) + { + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + have_fast_path_lwlock = true; + } + + /* Attempt fast-path release. */ + relid = locallock->tag.lock.locktag_field2; + if (FastPathUnGrantRelationLock(relid, lockmode)) + { + RemoveLocalLock(locallock); + continue; + } + + /* + * Our lock, originally taken via the fast path, has been + * transferred to the main lock table. That's going to require + * some extra work, so release our fast-path lock before starting. + */ + LWLockRelease(&MyProc->fpInfoLock); + have_fast_path_lwlock = false; + + /* + * Now dump the lock. We haven't got a pointer to the LOCK or + * PROCLOCK in this case, so we have to handle this a bit + * differently than a normal lock release. Unfortunately, this + * requires an extra LWLock acquire-and-release cycle on the + * partitionLock, but hopefully it shouldn't happen often. + */ + LockRefindAndRelease(lockMethodTable, MyProc, + &locallock->tag.lock, lockmode, false); + RemoveLocalLock(locallock); + continue; + } + + /* Mark the proclock to show we need to release this lockmode */ + if (locallock->nLocks > 0) + locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode); + + /* And remove the locallock hashtable entry */ + RemoveLocalLock(locallock); + } + + /* Done with the fast-path data structures */ + if (have_fast_path_lwlock) + LWLockRelease(&MyProc->fpInfoLock); + + /* + * Now, scan each lock partition separately. + */ + for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) + { + LWLock *partitionLock; + SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); + PROCLOCK *nextplock; + + partitionLock = LockHashPartitionLockByIndex(partition); + + /* + * If the proclock list for this partition is empty, we can skip + * acquiring the partition lock. This optimization is trickier than + * it looks, because another backend could be in process of adding + * something to our proclock list due to promoting one of our + * fast-path locks. However, any such lock must be one that we + * decided not to delete above, so it's okay to skip it again now; + * we'd just decide not to delete it again. We must, however, be + * careful to re-fetch the list header once we've acquired the + * partition lock, to be sure we have a valid, up-to-date pointer. + * (There is probably no significant risk if pointer fetch/store is + * atomic, but we don't wish to assume that.) + * + * XXX This argument assumes that the locallock table correctly + * represents all of our fast-path locks. While allLocks mode + * guarantees to clean up all of our normal locks regardless of the + * locallock situation, we lose that guarantee for fast-path locks. + * This is not ideal. + */ + if (SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)) == NULL) + continue; /* needn't examine this partition */ + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); + proclock; + proclock = nextplock) + { + bool wakeupNeeded = false; + + /* Get link first, since we may unlink/delete this proclock */ + nextplock = (PROCLOCK *) + SHMQueueNext(procLocks, &proclock->procLink, + offsetof(PROCLOCK, procLink)); + + Assert(proclock->tag.myProc == MyProc); + + lock = proclock->tag.myLock; + + /* Ignore items that are not of the lockmethod to be removed */ + if (LOCK_LOCKMETHOD(*lock) != lockmethodid) + continue; + + /* + * In allLocks mode, force release of all locks even if locallock + * table had problems + */ + if (allLocks) + proclock->releaseMask = proclock->holdMask; + else + Assert((proclock->releaseMask & ~proclock->holdMask) == 0); + + /* + * Ignore items that have nothing to be released, unless they have + * holdMask == 0 and are therefore recyclable + */ + if (proclock->releaseMask == 0 && proclock->holdMask != 0) + continue; + + PROCLOCK_PRINT("LockReleaseAll", proclock); + LOCK_PRINT("LockReleaseAll", lock, 0); + Assert(lock->nRequested >= 0); + Assert(lock->nGranted >= 0); + Assert(lock->nGranted <= lock->nRequested); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + + /* + * Release the previously-marked lock modes + */ + for (i = 1; i <= numLockModes; i++) + { + if (proclock->releaseMask & LOCKBIT_ON(i)) + wakeupNeeded |= UnGrantLock(lock, i, proclock, + lockMethodTable); + } + Assert((lock->nRequested >= 0) && (lock->nGranted >= 0)); + Assert(lock->nGranted <= lock->nRequested); + LOCK_PRINT("LockReleaseAll: updated", lock, 0); + + proclock->releaseMask = 0; + + /* CleanUpLock will wake up waiters if needed. */ + CleanUpLock(lock, proclock, + lockMethodTable, + LockTagHashCode(&lock->tag), + wakeupNeeded); + } /* loop over PROCLOCKs within this partition */ + + LWLockRelease(partitionLock); + } /* loop over partitions */ + +#ifdef LOCK_DEBUG + if (*(lockMethodTable->trace_flag)) + elog(LOG, "LockReleaseAll done"); +#endif +} + +/* + * LockReleaseSession -- Release all session locks of the specified lock method + * that are held by the current process. + */ +void +LockReleaseSession(LOCKMETHODID lockmethodid) +{ + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + /* Ignore items that are not of the specified lock method */ + if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid) + continue; + + ReleaseLockIfHeld(locallock, true); + } +} + +/* + * LockReleaseCurrentOwner + * Release all locks belonging to CurrentResourceOwner + * + * If the caller knows what those locks are, it can pass them as an array. + * That speeds up the call significantly, when a lot of locks are held. + * Otherwise, pass NULL for locallocks, and we'll traverse through our hash + * table to find them. + */ +void +LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks) +{ + if (locallocks == NULL) + { + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + ReleaseLockIfHeld(locallock, false); + } + else + { + int i; + + for (i = nlocks - 1; i >= 0; i--) + ReleaseLockIfHeld(locallocks[i], false); + } +} + +/* + * ReleaseLockIfHeld + * Release any session-level locks on this lockable object if sessionLock + * is true; else, release any locks held by CurrentResourceOwner. + * + * It is tempting to pass this a ResourceOwner pointer (or NULL for session + * locks), but without refactoring LockRelease() we cannot support releasing + * locks belonging to resource owners other than CurrentResourceOwner. + * If we were to refactor, it'd be a good idea to fix it so we don't have to + * do a hashtable lookup of the locallock, too. However, currently this + * function isn't used heavily enough to justify refactoring for its + * convenience. + */ +static void +ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock) +{ + ResourceOwner owner; + LOCALLOCKOWNER *lockOwners; + int i; + + /* Identify owner for lock (must match LockRelease!) */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + /* Scan to see if there are any locks belonging to the target owner */ + lockOwners = locallock->lockOwners; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == owner) + { + Assert(lockOwners[i].nLocks > 0); + if (lockOwners[i].nLocks < locallock->nLocks) + { + /* + * We will still hold this lock after forgetting this + * ResourceOwner. + */ + locallock->nLocks -= lockOwners[i].nLocks; + /* compact out unused slot */ + locallock->numLockOwners--; + if (owner != NULL) + ResourceOwnerForgetLock(owner, locallock); + if (i < locallock->numLockOwners) + lockOwners[i] = lockOwners[locallock->numLockOwners]; + } + else + { + Assert(lockOwners[i].nLocks == locallock->nLocks); + /* We want to call LockRelease just once */ + lockOwners[i].nLocks = 1; + locallock->nLocks = 1; + if (!LockRelease(&locallock->tag.lock, + locallock->tag.mode, + sessionLock)) + elog(WARNING, "ReleaseLockIfHeld: failed??"); + } + break; + } + } +} + +/* + * LockReassignCurrentOwner + * Reassign all locks belonging to CurrentResourceOwner to belong + * to its parent resource owner. + * + * If the caller knows what those locks are, it can pass them as an array. + * That speeds up the call significantly, when a lot of locks are held + * (e.g pg_dump with a large schema). Otherwise, pass NULL for locallocks, + * and we'll traverse through our hash table to find them. + */ +void +LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks) +{ + ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner); + + Assert(parent != NULL); + + if (locallocks == NULL) + { + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + LockReassignOwner(locallock, parent); + } + else + { + int i; + + for (i = nlocks - 1; i >= 0; i--) + LockReassignOwner(locallocks[i], parent); + } +} + +/* + * Subroutine of LockReassignCurrentOwner. Reassigns a given lock belonging to + * CurrentResourceOwner to its parent. + */ +static void +LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent) +{ + LOCALLOCKOWNER *lockOwners; + int i; + int ic = -1; + int ip = -1; + + /* + * Scan to see if there are any locks belonging to current owner or its + * parent + */ + lockOwners = locallock->lockOwners; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == CurrentResourceOwner) + ic = i; + else if (lockOwners[i].owner == parent) + ip = i; + } + + if (ic < 0) + return; /* no current locks */ + + if (ip < 0) + { + /* Parent has no slot, so just give it the child's slot */ + lockOwners[ic].owner = parent; + ResourceOwnerRememberLock(parent, locallock); + } + else + { + /* Merge child's count with parent's */ + lockOwners[ip].nLocks += lockOwners[ic].nLocks; + /* compact out unused slot */ + locallock->numLockOwners--; + if (ic < locallock->numLockOwners) + lockOwners[ic] = lockOwners[locallock->numLockOwners]; + } + ResourceOwnerForgetLock(CurrentResourceOwner, locallock); +} + +/* + * FastPathGrantRelationLock + * Grant lock using per-backend fast-path array, if there is space. + */ +static bool +FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode) +{ + uint32 f; + uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND; + + /* Scan for existing entry for this relid, remembering empty slot. */ + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + if (FAST_PATH_GET_BITS(MyProc, f) == 0) + unused_slot = f; + else if (MyProc->fpRelId[f] == relid) + { + Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)); + FAST_PATH_SET_LOCKMODE(MyProc, f, lockmode); + return true; + } + } + + /* If no existing entry, use any empty slot. */ + if (unused_slot < FP_LOCK_SLOTS_PER_BACKEND) + { + MyProc->fpRelId[unused_slot] = relid; + FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode); + ++FastPathLocalUseCount; + return true; + } + + /* No existing entry, and no empty slot. */ + return false; +} + +/* + * FastPathUnGrantRelationLock + * Release fast-path lock, if present. Update backend-private local + * use count, while we're at it. + */ +static bool +FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode) +{ + uint32 f; + bool result = false; + + FastPathLocalUseCount = 0; + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + if (MyProc->fpRelId[f] == relid + && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) + { + Assert(!result); + FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode); + result = true; + /* we continue iterating so as to update FastPathLocalUseCount */ + } + if (FAST_PATH_GET_BITS(MyProc, f) != 0) + ++FastPathLocalUseCount; + } + return result; +} + +/* + * FastPathTransferRelationLocks + * Transfer locks matching the given lock tag from per-backend fast-path + * arrays to the shared hash table. + * + * Returns true if successful, false if ran out of shared memory. + */ +static bool +FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag, + uint32 hashcode) +{ + LWLock *partitionLock = LockHashPartitionLock(hashcode); + Oid relid = locktag->locktag_field2; + uint32 i; + + /* + * Every PGPROC that can potentially hold a fast-path lock is present in + * ProcGlobal->allProcs. Prepared transactions are not, but any + * outstanding fast-path locks held by prepared transactions are + * transferred to the main lock table. + */ + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); + + /* + * If the target backend isn't referencing the same database as the + * lock, then we needn't examine the individual relation IDs at all; + * none of them can be relevant. + * + * proc->databaseId is set at backend startup time and never changes + * thereafter, so it might be safe to perform this test before + * acquiring &proc->fpInfoLock. In particular, it's certainly safe to + * assume that if the target backend holds any fast-path locks, it + * must have performed a memory-fencing operation (in particular, an + * LWLock acquisition) since setting proc->databaseId. However, it's + * less clear that our backend is certain to have performed a memory + * fencing operation since the other backend set proc->databaseId. So + * for now, we test it after acquiring the LWLock just to be safe. + */ + if (proc->databaseId != locktag->locktag_field1) + { + LWLockRelease(&proc->fpInfoLock); + continue; + } + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmode; + + /* Look for an allocated slot matching the given relid. */ + if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0) + continue; + + /* Find or create lock object. */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET; + lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT; + ++lockmode) + { + PROCLOCK *proclock; + + if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode)) + continue; + proclock = SetupLockInTable(lockMethodTable, proc, locktag, + hashcode, lockmode); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&proc->fpInfoLock); + return false; + } + GrantLock(proclock->tag.myLock, proclock, lockmode); + FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode); + } + LWLockRelease(partitionLock); + + /* No need to examine remaining slots. */ + break; + } + LWLockRelease(&proc->fpInfoLock); + } + return true; +} + +/* + * FastPathGetRelationLockEntry + * Return the PROCLOCK for a lock originally taken via the fast-path, + * transferring it to the primary lock table if necessary. + * + * Note: caller takes care of updating the locallock object. + */ +static PROCLOCK * +FastPathGetRelationLockEntry(LOCALLOCK *locallock) +{ + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + LOCKTAG *locktag = &locallock->tag.lock; + PROCLOCK *proclock = NULL; + LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode); + Oid relid = locktag->locktag_field2; + uint32 f; + + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmode; + + /* Look for an allocated slot matching the given relid. */ + if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0) + continue; + + /* If we don't have a lock of the given mode, forget it! */ + lockmode = locallock->tag.mode; + if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) + break; + + /* Find or create lock object. */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + proclock = SetupLockInTable(lockMethodTable, MyProc, locktag, + locallock->hashcode, lockmode); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&MyProc->fpInfoLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + } + GrantLock(proclock->tag.myLock, proclock, lockmode); + FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode); + + LWLockRelease(partitionLock); + + /* No need to examine remaining slots. */ + break; + } + + LWLockRelease(&MyProc->fpInfoLock); + + /* Lock may have already been transferred by some other backend. */ + if (proclock == NULL) + { + LOCK *lock; + PROCLOCKTAG proclocktag; + uint32 proclock_hashcode; + + LWLockAcquire(partitionLock, LW_SHARED); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (void *) locktag, + locallock->hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(ERROR, "failed to re-find shared lock object"); + + proclocktag.myLock = lock; + proclocktag.myProc = MyProc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode); + proclock = (PROCLOCK *) + hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &proclocktag, + proclock_hashcode, + HASH_FIND, + NULL); + if (!proclock) + elog(ERROR, "failed to re-find shared proclock object"); + LWLockRelease(partitionLock); + } + + return proclock; +} + +/* + * GetLockConflicts + * Get an array of VirtualTransactionIds of xacts currently holding locks + * that would conflict with the specified lock/lockmode. + * xacts merely awaiting such a lock are NOT reported. + * + * The result array is palloc'd and is terminated with an invalid VXID. + * *countp, if not null, is updated to the number of items set. + * + * Of course, the result could be out of date by the time it's returned, so + * use of this function has to be thought about carefully. Similarly, a + * PGPROC with no "lxid" will be considered non-conflicting regardless of any + * lock it holds. Existing callers don't care about a locker after that + * locker's pg_xact updates complete. CommitTransaction() clears "lxid" after + * pg_xact updates and before releasing locks. + * + * Note we never include the current xact's vxid in the result array, + * since an xact never blocks itself. + */ +VirtualTransactionId * +GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) +{ + static VirtualTransactionId *vxids; + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCK *lock; + LOCKMASK conflictMask; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; + int count = 0; + int fast_count = 0; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + + /* + * Allocate memory to store results, and fill with InvalidVXID. We only + * need enough space for MaxBackends + max_prepared_xacts + a terminator. + * InHotStandby allocate once in TopMemoryContext. + */ + if (InHotStandby) + { + if (vxids == NULL) + vxids = (VirtualTransactionId *) + MemoryContextAlloc(TopMemoryContext, + sizeof(VirtualTransactionId) * + (MaxBackends + max_prepared_xacts + 1)); + } + else + vxids = (VirtualTransactionId *) + palloc0(sizeof(VirtualTransactionId) * + (MaxBackends + max_prepared_xacts + 1)); + + /* Compute hash code and partition lock, and look up conflicting modes. */ + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + conflictMask = lockMethodTable->conflictTab[lockmode]; + + /* + * Fast path locks might not have been entered in the primary lock table. + * If the lock we're dealing with could conflict with such a lock, we must + * examine each backend's fast-path array for conflicts. + */ + if (ConflictsWithRelationFastPath(locktag, lockmode)) + { + int i; + Oid relid = locktag->locktag_field2; + VirtualTransactionId vxid; + + /* + * Iterate over relevant PGPROCs. Anything held by a prepared + * transaction will have been transferred to the primary lock table, + * so we need not worry about those. This is all a bit fuzzy, because + * new locks could be taken after we've visited a particular + * partition, but the callers had better be prepared to deal with that + * anyway, since the locks could equally well be taken between the + * time we return the value and the time the caller does something + * with it. + */ + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + /* A backend never blocks itself */ + if (proc == MyProc) + continue; + + LWLockAcquire(&proc->fpInfoLock, LW_SHARED); + + /* + * If the target backend isn't referencing the same database as + * the lock, then we needn't examine the individual relation IDs + * at all; none of them can be relevant. + * + * See FastPathTransferRelationLocks() for discussion of why we do + * this test after acquiring the lock. + */ + if (proc->databaseId != locktag->locktag_field1) + { + LWLockRelease(&proc->fpInfoLock); + continue; + } + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmask; + + /* Look for an allocated slot matching the given relid. */ + if (relid != proc->fpRelId[f]) + continue; + lockmask = FAST_PATH_GET_BITS(proc, f); + if (!lockmask) + continue; + lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET; + + /* + * There can only be one entry per relation, so if we found it + * and it doesn't conflict, we can skip the rest of the slots. + */ + if ((lockmask & conflictMask) == 0) + break; + + /* Conflict! */ + GET_VXID_FROM_PGPROC(vxid, *proc); + + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + /* else, xact already committed or aborted */ + + /* No need to examine remaining slots. */ + break; + } + + LWLockRelease(&proc->fpInfoLock); + } + } + + /* Remember how many fast-path conflicts we found. */ + fast_count = count; + + /* + * Look up the lock object matching the tag. + */ + LWLockAcquire(partitionLock, LW_SHARED); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (const void *) locktag, + hashcode, + HASH_FIND, + NULL); + if (!lock) + { + /* + * If the lock object doesn't exist, there is nothing holding a lock + * on this lockable object. + */ + LWLockRelease(partitionLock); + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + if (countp) + *countp = count; + return vxids; + } + + /* + * Examine each existing holder (or awaiter) of the lock. + */ + + procLocks = &(lock->procLocks); + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); + + while (proclock) + { + if (conflictMask & proclock->holdMask) + { + PGPROC *proc = proclock->tag.myProc; + + /* A backend never blocks itself */ + if (proc != MyProc) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + + if (VirtualTransactionIdIsValid(vxid)) + { + int i; + + /* Avoid duplicate entries. */ + for (i = 0; i < fast_count; ++i) + if (VirtualTransactionIdEquals(vxids[i], vxid)) + break; + if (i >= fast_count) + vxids[count++] = vxid; + } + /* else, xact already committed or aborted */ + } + } + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + LWLockRelease(partitionLock); + + if (count > MaxBackends + max_prepared_xacts) /* should never happen */ + elog(PANIC, "too many conflicting locks found"); + + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + if (countp) + *countp = count; + return vxids; +} + +/* + * Find a lock in the shared lock table and release it. It is the caller's + * responsibility to verify that this is a sane thing to do. (For example, it + * would be bad to release a lock here if there might still be a LOCALLOCK + * object with pointers to it.) + * + * We currently use this in two situations: first, to release locks held by + * prepared transactions on commit (see lock_twophase_postcommit); and second, + * to release locks taken via the fast-path, transferred to the main hash + * table, and then released (see LockReleaseAll). + */ +static void +LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, + LOCKTAG *locktag, LOCKMODE lockmode, + bool decrement_strong_lock_count) +{ + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + uint32 hashcode; + uint32 proclock_hashcode; + LWLock *partitionLock; + bool wakeupNeeded; + + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Re-find the lock object (it had better be there). + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (void *) locktag, + hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(PANIC, "failed to re-find shared lock object"); + + /* + * Re-find the proclock object (ditto). + */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &proclocktag, + proclock_hashcode, + HASH_FIND, + NULL); + if (!proclock) + elog(PANIC, "failed to re-find shared proclock object"); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return; + } + + /* + * Do the releasing. CleanUpLock will waken any now-wakable waiters. + */ + wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable); + + CleanUpLock(lock, proclock, + lockMethodTable, hashcode, + wakeupNeeded); + + LWLockRelease(partitionLock); + + /* + * Decrement strong lock count. This logic is needed only for 2PC. + */ + if (decrement_strong_lock_count + && ConflictsWithRelationFastPath(locktag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } +} + +/* + * CheckForSessionAndXactLocks + * Check to see if transaction holds both session-level and xact-level + * locks on the same object; if so, throw an error. + * + * If we have both session- and transaction-level locks on the same object, + * PREPARE TRANSACTION must fail. This should never happen with regular + * locks, since we only take those at session level in some special operations + * like VACUUM. It's possible to hit this with advisory locks, though. + * + * It would be nice if we could keep the session hold and give away the + * transactional hold to the prepared xact. However, that would require two + * PROCLOCK objects, and we cannot be sure that another PROCLOCK will be + * available when it comes time for PostPrepare_Locks to do the deed. + * So for now, we error out while we can still do so safely. + * + * Since the LOCALLOCK table stores a separate entry for each lockmode, + * we can't implement this check by examining LOCALLOCK entries in isolation. + * We must build a transient hashtable that is indexed by locktag only. + */ +static void +CheckForSessionAndXactLocks(void) +{ + typedef struct + { + LOCKTAG lock; /* identifies the lockable object */ + bool sessLock; /* is any lockmode held at session level? */ + bool xactLock; /* is any lockmode held at xact level? */ + } PerLockTagEntry; + + HASHCTL hash_ctl; + HTAB *lockhtab; + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + /* Create a local hash table keyed by LOCKTAG only */ + hash_ctl.keysize = sizeof(LOCKTAG); + hash_ctl.entrysize = sizeof(PerLockTagEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + lockhtab = hash_create("CheckForSessionAndXactLocks table", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Scan local lock table to find entries for each LOCKTAG */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + PerLockTagEntry *hentry; + bool found; + int i; + + /* + * Ignore VXID locks. We don't want those to be held by prepared + * transactions, since they aren't meaningful after a restart. + */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Ignore it if we don't actually hold the lock */ + if (locallock->nLocks <= 0) + continue; + + /* Otherwise, find or make an entry in lockhtab */ + hentry = (PerLockTagEntry *) hash_search(lockhtab, + (void *) &locallock->tag.lock, + HASH_ENTER, &found); + if (!found) /* initialize, if newly created */ + hentry->sessLock = hentry->xactLock = false; + + /* Scan to see if we hold lock at session or xact level or both */ + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + hentry->sessLock = true; + else + hentry->xactLock = true; + } + + /* + * We can throw error immediately when we see both types of locks; no + * need to wait around to see if there are more violations. + */ + if (hentry->sessLock && hentry->xactLock) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + } + + /* Success, so clean up */ + hash_destroy(lockhtab); +} + +/* + * AtPrepare_Locks + * Do the preparatory work for a PREPARE: make 2PC state file records + * for all locks currently held. + * + * Session-level locks are ignored, as are VXID locks. + * + * For the most part, we don't need to touch shared memory for this --- + * all the necessary state information is in the locallock table. + * Fast-path locks are an exception, however: we move any such locks to + * the main table before allowing PREPARE TRANSACTION to succeed. + */ +void +AtPrepare_Locks(void) +{ + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + /* First, verify there aren't locks of both xact and session level */ + CheckForSessionAndXactLocks(); + + /* Now do the per-locallock cleanup work */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + TwoPhaseLockRecord record; + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + bool haveSessionLock; + bool haveXactLock; + int i; + + /* + * Ignore VXID locks. We don't want those to be held by prepared + * transactions, since they aren't meaningful after a restart. + */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Ignore it if we don't actually hold the lock */ + if (locallock->nLocks <= 0) + continue; + + /* Scan to see whether we hold it at session or transaction level */ + haveSessionLock = haveXactLock = false; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + haveSessionLock = true; + else + haveXactLock = true; + } + + /* Ignore it if we have only session lock */ + if (!haveXactLock) + continue; + + /* This can't happen, because we already checked it */ + if (haveSessionLock) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + + /* + * If the local lock was taken via the fast-path, we need to move it + * to the primary lock table, or just get a pointer to the existing + * primary lock table entry if by chance it's already been + * transferred. + */ + if (locallock->proclock == NULL) + { + locallock->proclock = FastPathGetRelationLockEntry(locallock); + locallock->lock = locallock->proclock->tag.myLock; + } + + /* + * Arrange to not release any strong lock count held by this lock + * entry. We must retain the count until the prepared transaction is + * committed or rolled back. + */ + locallock->holdsStrongLockCount = false; + + /* + * Create a 2PC record. + */ + memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG)); + record.lockmode = locallock->tag.mode; + + RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0, + &record, sizeof(TwoPhaseLockRecord)); + } +} + +/* + * PostPrepare_Locks + * Clean up after successful PREPARE + * + * Here, we want to transfer ownership of our locks to a dummy PGPROC + * that's now associated with the prepared transaction, and we want to + * clean out the corresponding entries in the LOCALLOCK table. + * + * Note: by removing the LOCALLOCK entries, we are leaving dangling + * pointers in the transaction's resource owner. This is OK at the + * moment since resowner.c doesn't try to free locks retail at a toplevel + * transaction commit or abort. We could alternatively zero out nLocks + * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll, + * but that probably costs more cycles. + */ +void +PostPrepare_Locks(TransactionId xid) +{ + PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + int partition; + + /* Can't prepare a lock group follower. */ + Assert(MyProc->lockGroupLeader == NULL || + MyProc->lockGroupLeader == MyProc); + + /* This is a critical section: any error means big trouble */ + START_CRIT_SECTION(); + + /* + * First we run through the locallock table and get rid of unwanted + * entries, then we scan the process's proclocks and transfer them to the + * target proc. + * + * We do this separately because we may have multiple locallock entries + * pointing to the same proclock, and we daren't end up with any dangling + * pointers. + */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + bool haveSessionLock; + bool haveXactLock; + int i; + + if (locallock->proclock == NULL || locallock->lock == NULL) + { + /* + * We must've run out of shared memory while trying to set up this + * lock. Just forget the local entry. + */ + Assert(locallock->nLocks == 0); + RemoveLocalLock(locallock); + continue; + } + + /* Ignore VXID locks */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Scan to see whether we hold it at session or transaction level */ + haveSessionLock = haveXactLock = false; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + haveSessionLock = true; + else + haveXactLock = true; + } + + /* Ignore it if we have only session lock */ + if (!haveXactLock) + continue; + + /* This can't happen, because we already checked it */ + if (haveSessionLock) + ereport(PANIC, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + + /* Mark the proclock to show we need to release this lockmode */ + if (locallock->nLocks > 0) + locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode); + + /* And remove the locallock hashtable entry */ + RemoveLocalLock(locallock); + } + + /* + * Now, scan each lock partition separately. + */ + for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) + { + LWLock *partitionLock; + SHM_QUEUE *procLocks = &(MyProc->myProcLocks[partition]); + PROCLOCK *nextplock; + + partitionLock = LockHashPartitionLockByIndex(partition); + + /* + * If the proclock list for this partition is empty, we can skip + * acquiring the partition lock. This optimization is safer than the + * situation in LockReleaseAll, because we got rid of any fast-path + * locks during AtPrepare_Locks, so there cannot be any case where + * another backend is adding something to our lists now. For safety, + * though, we code this the same way as in LockReleaseAll. + */ + if (SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)) == NULL) + continue; /* needn't examine this partition */ + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + for (proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); + proclock; + proclock = nextplock) + { + /* Get link first, since we may unlink/relink this proclock */ + nextplock = (PROCLOCK *) + SHMQueueNext(procLocks, &proclock->procLink, + offsetof(PROCLOCK, procLink)); + + Assert(proclock->tag.myProc == MyProc); + + lock = proclock->tag.myLock; + + /* Ignore VXID locks */ + if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + PROCLOCK_PRINT("PostPrepare_Locks", proclock); + LOCK_PRINT("PostPrepare_Locks", lock, 0); + Assert(lock->nRequested >= 0); + Assert(lock->nGranted >= 0); + Assert(lock->nGranted <= lock->nRequested); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + + /* Ignore it if nothing to release (must be a session lock) */ + if (proclock->releaseMask == 0) + continue; + + /* Else we should be releasing all locks */ + if (proclock->releaseMask != proclock->holdMask) + elog(PANIC, "we seem to have dropped a bit somewhere"); + + /* + * We cannot simply modify proclock->tag.myProc to reassign + * ownership of the lock, because that's part of the hash key and + * the proclock would then be in the wrong hash chain. Instead + * use hash_update_hash_key. (We used to create a new hash entry, + * but that risks out-of-memory failure if other processes are + * busy making proclocks too.) We must unlink the proclock from + * our procLink chain and put it into the new proc's chain, too. + * + * Note: the updated proclock hash key will still belong to the + * same hash partition, cf proclock_hash(). So the partition lock + * we already hold is sufficient for this. + */ + SHMQueueDelete(&proclock->procLink); + + /* + * Create the new hash key for the proclock. + */ + proclocktag.myLock = lock; + proclocktag.myProc = newproc; + + /* + * Update groupLeader pointer to point to the new proc. (We'd + * better not be a member of somebody else's lock group!) + */ + Assert(proclock->groupLeader == proclock->tag.myProc); + proclock->groupLeader = newproc; + + /* + * Update the proclock. We should not find any existing entry for + * the same hash key, since there can be only one entry for any + * given lock with my own proc. + */ + if (!hash_update_hash_key(LockMethodProcLockHash, + (void *) proclock, + (void *) &proclocktag)) + elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks"); + + /* Re-link into the new proc's proclock list */ + SHMQueueInsertBefore(&(newproc->myProcLocks[partition]), + &proclock->procLink); + + PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock); + } /* loop over PROCLOCKs within this partition */ + + LWLockRelease(partitionLock); + } /* loop over partitions */ + + END_CRIT_SECTION(); +} + + +/* + * Estimate shared-memory space used for lock tables + */ +Size +LockShmemSize(void) +{ + Size size = 0; + long max_table_size; + + /* lock hash table */ + max_table_size = NLOCKENTS(); + size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK))); + + /* proclock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK))); + + /* + * Since NLOCKENTS is only an estimate, add 10% safety margin. + */ + size = add_size(size, size / 10); + + return size; +} + +/* + * GetLockStatusData - Return a summary of the lock manager's internal + * status, for use in a user-level reporting function. + * + * The return data consists of an array of LockInstanceData objects, + * which are a lightly abstracted version of the PROCLOCK data structures, + * i.e. there is one entry for each unique lock and interested PGPROC. + * It is the caller's responsibility to match up related items (such as + * references to the same lockable object or PGPROC) if wanted. + * + * The design goal is to hold the LWLocks for as short a time as possible; + * thus, this function simply makes a copy of the necessary data and releases + * the locks, allowing the caller to contemplate and format the data for as + * long as it pleases. + */ +LockData * +GetLockStatusData(void) +{ + LockData *data; + PROCLOCK *proclock; + HASH_SEQ_STATUS seqstat; + int els; + int el; + int i; + + data = (LockData *) palloc(sizeof(LockData)); + + /* Guess how much space we'll need. */ + els = MaxBackends; + el = 0; + data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els); + + /* + * First, we iterate through the per-backend fast-path arrays, locking + * them one at a time. This might produce an inconsistent picture of the + * system state, but taking all of those LWLocks at the same time seems + * impractical (in particular, note MAX_SIMUL_LWLOCKS). It shouldn't + * matter too much, because none of these locks can be involved in lock + * conflicts anyway - anything that might must be present in the main lock + * table. (For the same reason, we don't sweat about making leaderPid + * completely valid. We cannot safely dereference another backend's + * lockGroupLeader field without holding all lock partition locks, and + * it's not worth that.) + */ + for (i = 0; i < ProcGlobal->allProcCount; ++i) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + LWLockAcquire(&proc->fpInfoLock, LW_SHARED); + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f) + { + LockInstanceData *instance; + uint32 lockbits = FAST_PATH_GET_BITS(proc, f); + + /* Skip unallocated slots. */ + if (!lockbits) + continue; + + if (el >= els) + { + els += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + instance = &data->locks[el]; + SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId, + proc->fpRelId[f]); + instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET; + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proc->pid; + instance->fastpath = true; + + /* + * Successfully taking fast path lock means there were no + * conflicting locks. + */ + instance->waitStart = 0; + + el++; + } + + if (proc->fpVXIDLock) + { + VirtualTransactionId vxid; + LockInstanceData *instance; + + if (el >= els) + { + els += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + vxid.backendId = proc->backendId; + vxid.localTransactionId = proc->fpLocalTransactionId; + + instance = &data->locks[el]; + SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid); + instance->holdMask = LOCKBIT_ON(ExclusiveLock); + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proc->pid; + instance->fastpath = true; + instance->waitStart = 0; + + el++; + } + + LWLockRelease(&proc->fpInfoLock); + } + + /* + * Next, acquire lock on the entire shared lock data structure. We do + * this so that, at least for locks in the primary lock table, the state + * will be self-consistent. + * + * Since this is a read-only operation, we take shared instead of + * exclusive lock. There's not a whole lot of point to this, because all + * the normal operations require exclusive lock, but it doesn't hurt + * anything either. It will at least allow two backends to do + * GetLockStatusData in parallel. + * + * Must grab LWLocks in partition-number order to avoid LWLock deadlock. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* Now we can safely count the number of proclocks */ + data->nelements = el + hash_get_num_entries(LockMethodProcLockHash); + if (data->nelements > els) + { + els = data->nelements; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + /* Now scan the tables to copy the data */ + hash_seq_init(&seqstat, LockMethodProcLockHash); + + while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + { + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + LockInstanceData *instance = &data->locks[el]; + + memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG)); + instance->holdMask = proclock->holdMask; + if (proc->waitLock == proclock->tag.myLock) + instance->waitLockMode = proc->waitLockMode; + else + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proclock->groupLeader->pid; + instance->fastpath = false; + instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart); + + el++; + } + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + Assert(el == data->nelements); + + return data; +} + +/* + * GetBlockerStatusData - Return a summary of the lock manager's state + * concerning locks that are blocking the specified PID or any member of + * the PID's lock group, for use in a user-level reporting function. + * + * For each PID within the lock group that is awaiting some heavyweight lock, + * the return data includes an array of LockInstanceData objects, which are + * the same data structure used by GetLockStatusData; but unlike that function, + * this one reports only the PROCLOCKs associated with the lock that that PID + * is blocked on. (Hence, all the locktags should be the same for any one + * blocked PID.) In addition, we return an array of the PIDs of those backends + * that are ahead of the blocked PID in the lock's wait queue. These can be + * compared with the PIDs in the LockInstanceData objects to determine which + * waiters are ahead of or behind the blocked PID in the queue. + * + * If blocked_pid isn't a valid backend PID or nothing in its lock group is + * waiting on any heavyweight lock, return empty arrays. + * + * The design goal is to hold the LWLocks for as short a time as possible; + * thus, this function simply makes a copy of the necessary data and releases + * the locks, allowing the caller to contemplate and format the data for as + * long as it pleases. + */ +BlockedProcsData * +GetBlockerStatusData(int blocked_pid) +{ + BlockedProcsData *data; + PGPROC *proc; + int i; + + data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData)); + + /* + * Guess how much space we'll need, and preallocate. Most of the time + * this will avoid needing to do repalloc while holding the LWLocks. (We + * assume, but check with an Assert, that MaxBackends is enough entries + * for the procs[] array; the other two could need enlargement, though.) + */ + data->nprocs = data->nlocks = data->npids = 0; + data->maxprocs = data->maxlocks = data->maxpids = MaxBackends; + data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs); + data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks); + data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); + + /* + * In order to search the ProcArray for blocked_pid and assume that that + * entry won't immediately disappear under us, we must hold ProcArrayLock. + * In addition, to examine the lock grouping fields of any other backend, + * we must hold all the hash partition locks. (Only one of those locks is + * actually relevant for any one lock group, but we can't know which one + * ahead of time.) It's fairly annoying to hold all those locks + * throughout this, but it's no worse than GetLockStatusData(), and it + * does have the advantage that we're guaranteed to return a + * self-consistent instantaneous state. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + proc = BackendPidGetProcWithLock(blocked_pid); + + /* Nothing to do if it's gone */ + if (proc != NULL) + { + /* + * Acquire lock on the entire shared lock data structure. See notes + * in GetLockStatusData(). + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + if (proc->lockGroupLeader == NULL) + { + /* Easy case, proc is not a lock group member */ + GetSingleProcBlockerStatusData(proc, data); + } + else + { + /* Examine all procs in proc's lock group */ + dlist_iter iter; + + dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers) + { + PGPROC *memberProc; + + memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur); + GetSingleProcBlockerStatusData(memberProc, data); + } + } + + /* + * And release locks. See notes in GetLockStatusData(). + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + Assert(data->nprocs <= data->maxprocs); + } + + LWLockRelease(ProcArrayLock); + + return data; +} + +/* Accumulate data about one possibly-blocked proc for GetBlockerStatusData */ +static void +GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) +{ + LOCK *theLock = blocked_proc->waitLock; + BlockedProcData *bproc; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + PROC_QUEUE *waitQueue; + PGPROC *proc; + int queue_size; + int i; + + /* Nothing to do if this proc is not blocked */ + if (theLock == NULL) + return; + + /* Set up a procs[] element */ + bproc = &data->procs[data->nprocs++]; + bproc->pid = blocked_proc->pid; + bproc->first_lock = data->nlocks; + bproc->first_waiter = data->npids; + + /* + * We may ignore the proc's fast-path arrays, since nothing in those could + * be related to a contended lock. + */ + + /* Collect all PROCLOCKs associated with theLock */ + procLocks = &(theLock->procLocks); + proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); + while (proclock) + { + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + LockInstanceData *instance; + + if (data->nlocks >= data->maxlocks) + { + data->maxlocks += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks); + } + + instance = &data->locks[data->nlocks]; + memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG)); + instance->holdMask = proclock->holdMask; + if (proc->waitLock == lock) + instance->waitLockMode = proc->waitLockMode; + else + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proclock->groupLeader->pid; + instance->fastpath = false; + data->nlocks++; + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */ + waitQueue = &(theLock->waitProcs); + queue_size = waitQueue->size; + + if (queue_size > data->maxpids - data->npids) + { + data->maxpids = Max(data->maxpids + MaxBackends, + data->npids + queue_size); + data->waiter_pids = (int *) repalloc(data->waiter_pids, + sizeof(int) * data->maxpids); + } + + /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */ + proc = (PGPROC *) waitQueue->links.next; + for (i = 0; i < queue_size; i++) + { + if (proc == blocked_proc) + break; + data->waiter_pids[data->npids++] = proc->pid; + proc = (PGPROC *) proc->links.next; + } + + bproc->num_locks = data->nlocks - bproc->first_lock; + bproc->num_waiters = data->npids - bproc->first_waiter; +} + +/* + * Returns a list of currently held AccessExclusiveLocks, for use by + * LogStandbySnapshot(). The result is a palloc'd array, + * with the number of elements returned into *nlocks. + * + * XXX This currently takes a lock on all partitions of the lock table, + * but it's possible to do better. By reference counting locks and storing + * the value in the ProcArray entry for each backend we could tell if any + * locks need recording without having to acquire the partition locks and + * scan the lock table. Whether that's worth the additional overhead + * is pretty dubious though. + */ +xl_standby_lock * +GetRunningTransactionLocks(int *nlocks) +{ + xl_standby_lock *accessExclusiveLocks; + PROCLOCK *proclock; + HASH_SEQ_STATUS seqstat; + int i; + int index; + int els; + + /* + * Acquire lock on the entire shared lock data structure. + * + * Must grab LWLocks in partition-number order to avoid LWLock deadlock. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* Now we can safely count the number of proclocks */ + els = hash_get_num_entries(LockMethodProcLockHash); + + /* + * Allocating enough space for all locks in the lock table is overkill, + * but it's more convenient and faster than having to enlarge the array. + */ + accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock)); + + /* Now scan the tables to copy the data */ + hash_seq_init(&seqstat, LockMethodProcLockHash); + + /* + * If lock is a currently granted AccessExclusiveLock then it will have + * just one proclock holder, so locks are never accessed twice in this + * particular case. Don't copy this code for use elsewhere because in the + * general case this will give you duplicate locks when looking at + * non-exclusive lock types. + */ + index = 0; + while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + { + /* make sure this definition matches the one used in LockAcquire */ + if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) && + proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION) + { + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + TransactionId xid = proc->xid; + + /* + * Don't record locks for transactions if we know they have + * already issued their WAL record for commit but not yet released + * lock. It is still possible that we see locks held by already + * complete transactions, if they haven't yet zeroed their xids. + */ + if (!TransactionIdIsValid(xid)) + continue; + + accessExclusiveLocks[index].xid = xid; + accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1; + accessExclusiveLocks[index].relOid = lock->tag.locktag_field2; + + index++; + } + } + + Assert(index <= els); + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + *nlocks = index; + return accessExclusiveLocks; +} + +/* Provide the textual name of any lock mode */ +const char * +GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode) +{ + Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods)); + Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes); + return LockMethods[lockmethodid]->lockModeNames[mode]; +} + +#ifdef LOCK_DEBUG +/* + * Dump all locks in the given proc's myProcLocks lists. + * + * Caller is responsible for having acquired appropriate LWLocks. + */ +void +DumpLocks(PGPROC *proc) +{ + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + LOCK *lock; + int i; + + if (proc == NULL) + return; + + if (proc->waitLock) + LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0); + + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + { + procLocks = &(proc->myProcLocks[i]); + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, procLink)); + + while (proclock) + { + Assert(proclock->tag.myProc == proc); + + lock = proclock->tag.myLock; + + PROCLOCK_PRINT("DumpLocks", proclock); + LOCK_PRINT("DumpLocks", lock, 0); + + proclock = (PROCLOCK *) + SHMQueueNext(procLocks, &proclock->procLink, + offsetof(PROCLOCK, procLink)); + } + } +} + +/* + * Dump all lmgr locks. + * + * Caller is responsible for having acquired appropriate LWLocks. + */ +void +DumpAllLocks(void) +{ + PGPROC *proc; + PROCLOCK *proclock; + LOCK *lock; + HASH_SEQ_STATUS status; + + proc = MyProc; + + if (proc && proc->waitLock) + LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0); + + hash_seq_init(&status, LockMethodProcLockHash); + + while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL) + { + PROCLOCK_PRINT("DumpAllLocks", proclock); + + lock = proclock->tag.myLock; + if (lock) + LOCK_PRINT("DumpAllLocks", lock, 0); + else + elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL"); + } +} +#endif /* LOCK_DEBUG */ + +/* + * LOCK 2PC resource manager's routines + */ + +/* + * Re-acquire a lock belonging to a transaction that was prepared. + * + * Because this function is run at db startup, re-acquiring the locks should + * never conflict with running transactions because there are none. We + * assume that the lock state represented by the stored 2PC files is legal. + * + * When switching from Hot Standby mode to normal operation, the locks will + * be already held by the startup process. The locks are acquired for the new + * procs without checking for conflicts, so we don't get a conflict between the + * startup process and the dummy procs, even though we will momentarily have + * a situation where two procs are holding the same AccessExclusiveLock, + * which isn't normally possible because the conflict. If we're in standby + * mode, but a recovery snapshot hasn't been established yet, it's possible + * that some but not all of the locks are already held by the startup process. + * + * This approach is simple, but also a bit dangerous, because if there isn't + * enough shared memory to acquire the locks, an error will be thrown, which + * is promoted to FATAL and recovery will abort, bringing down postmaster. + * A safer approach would be to transfer the locks like we do in + * AtPrepare_Locks, but then again, in hot standby mode it's possible for + * read-only backends to use up all the shared lock memory anyway, so that + * replaying the WAL record that needs to acquire a lock will throw an error + * and PANIC anyway. + */ +void +lock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + LOCKTAG *locktag; + LOCKMODE lockmode; + LOCKMETHODID lockmethodid; + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + bool found; + uint32 hashcode; + uint32 proclock_hashcode; + int partition; + LWLock *partitionLock; + LockMethod lockMethodTable; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmode = rec->lockmode; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + + hashcode = LockTagHashCode(locktag); + partition = LockHashPartition(hashcode); + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Find or create a lock with this tag. + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (void *) locktag, + hashcode, + HASH_ENTER_NULL, + &found); + if (!lock) + { + LWLockRelease(partitionLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + } + + /* + * if it's a new lock object, initialize it + */ + if (!found) + { + lock->grantMask = 0; + lock->waitMask = 0; + SHMQueueInit(&(lock->procLocks)); + ProcQueueInit(&(lock->waitProcs)); + lock->nRequested = 0; + lock->nGranted = 0; + MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES); + MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES); + LOCK_PRINT("lock_twophase_recover: new", lock, lockmode); + } + else + { + LOCK_PRINT("lock_twophase_recover: found", lock, lockmode); + Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0)); + Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + } + + /* + * Create the hash key for the proclock table. + */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + /* + * Find or create a proclock entry with this tag + */ + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + (void *) &proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); + if (!proclock) + { + /* Oops, not enough shmem for the proclock */ + if (lock->nRequested == 0) + { + /* + * There are no other requestors of this lock, so garbage-collect + * the lock object. We *must* do this to avoid a permanent leak + * of shared memory, because there won't be anything to cause + * anyone to release the lock object later. + */ + Assert(SHMQueueEmpty(&(lock->procLocks))); + if (!hash_search_with_hash_value(LockMethodLockHash, + (void *) &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + LWLockRelease(partitionLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + } + + /* + * If new, initialize the new entry + */ + if (!found) + { + Assert(proc->lockGroupLeader == NULL); + proclock->groupLeader = proc; + proclock->holdMask = 0; + proclock->releaseMask = 0; + /* Add proclock to appropriate lists */ + SHMQueueInsertBefore(&lock->procLocks, &proclock->lockLink); + SHMQueueInsertBefore(&(proc->myProcLocks[partition]), + &proclock->procLink); + PROCLOCK_PRINT("lock_twophase_recover: new", proclock); + } + else + { + PROCLOCK_PRINT("lock_twophase_recover: found", proclock); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + } + + /* + * lock->nRequested and lock->requested[] count the total number of + * requests, whether granted or waiting, so increment those immediately. + */ + lock->nRequested++; + lock->requested[lockmode]++; + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + + /* + * We shouldn't already hold the desired lock. + */ + if (proclock->holdMask & LOCKBIT_ON(lockmode)) + elog(ERROR, "lock %s on object %u/%u/%u is already held", + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + + /* + * We ignore any possible conflicts and just grant ourselves the lock. Not + * only because we don't bother, but also to avoid deadlocks when + * switching from standby to normal mode. See function comment. + */ + GrantLock(lock, proclock, lockmode); + + /* + * Bump strong lock count, to make sure any fast-path lock requests won't + * be granted without consulting the primary lock table. + */ + if (ConflictsWithRelationFastPath(&lock->tag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + FastPathStrongRelationLocks->count[fasthashcode]++; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } + + LWLockRelease(partitionLock); +} + +/* + * Re-acquire a lock belonging to a transaction that was prepared, when + * starting up into hot standby mode. + */ +void +lock_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + LOCKTAG *locktag; + LOCKMODE lockmode; + LOCKMETHODID lockmethodid; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmode = rec->lockmode; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + if (lockmode == AccessExclusiveLock && + locktag->locktag_type == LOCKTAG_RELATION) + { + StandbyAcquireAccessExclusiveLock(xid, + locktag->locktag_field1 /* dboid */ , + locktag->locktag_field2 /* reloid */ ); + } +} + + +/* + * 2PC processing routine for COMMIT PREPARED case. + * + * Find and release the lock indicated by the 2PC record. + */ +void +lock_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + LOCKTAG *locktag; + LOCKMETHODID lockmethodid; + LockMethod lockMethodTable; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + + LockRefindAndRelease(lockMethodTable, proc, locktag, rec->lockmode, true); +} + +/* + * 2PC processing routine for ROLLBACK PREPARED case. + * + * This is actually just the same as the COMMIT case. + */ +void +lock_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + lock_twophase_postcommit(xid, info, recdata, len); +} + +/* + * VirtualXactLockTableInsert + * + * Take vxid lock via the fast-path. There can't be any pre-existing + * lockers, as we haven't advertised this vxid via the ProcArray yet. + * + * Since MyProc->fpLocalTransactionId will normally contain the same data + * as MyProc->lxid, you might wonder if we really need both. The + * difference is that MyProc->lxid is set and cleared unlocked, and + * examined by procarray.c, while fpLocalTransactionId is protected by + * fpInfoLock and is used only by the locking subsystem. Doing it this + * way makes it easier to verify that there are no funny race conditions. + * + * We don't bother recording this lock in the local lock table, since it's + * only ever released at the end of a transaction. Instead, + * LockReleaseAll() calls VirtualXactLockTableCleanup(). + */ +void +VirtualXactLockTableInsert(VirtualTransactionId vxid) +{ + Assert(VirtualTransactionIdIsValid(vxid)); + + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + Assert(MyProc->backendId == vxid.backendId); + Assert(MyProc->fpLocalTransactionId == InvalidLocalTransactionId); + Assert(MyProc->fpVXIDLock == false); + + MyProc->fpVXIDLock = true; + MyProc->fpLocalTransactionId = vxid.localTransactionId; + + LWLockRelease(&MyProc->fpInfoLock); +} + +/* + * VirtualXactLockTableCleanup + * + * Check whether a VXID lock has been materialized; if so, release it, + * unblocking waiters. + */ +void +VirtualXactLockTableCleanup(void) +{ + bool fastpath; + LocalTransactionId lxid; + + Assert(MyProc->backendId != InvalidBackendId); + + /* + * Clean up shared memory state. + */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + fastpath = MyProc->fpVXIDLock; + lxid = MyProc->fpLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + + LWLockRelease(&MyProc->fpInfoLock); + + /* + * If fpVXIDLock has been cleared without touching fpLocalTransactionId, + * that means someone transferred the lock to the main lock table. + */ + if (!fastpath && LocalTransactionIdIsValid(lxid)) + { + VirtualTransactionId vxid; + LOCKTAG locktag; + + vxid.backendId = MyBackendId; + vxid.localTransactionId = lxid; + SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid); + + LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc, + &locktag, ExclusiveLock, false); + } +} + +/* + * XactLockForVirtualXact + * + * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid, + * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid). Unlike those + * functions, it assumes "xid" is never a subtransaction and that "xid" is + * prepared, committed, or aborted. + * + * If !TransactionIdIsValid(xid), this locks every prepared XID having been + * known as "vxid" before its PREPARE TRANSACTION. + */ +static bool +XactLockForVirtualXact(VirtualTransactionId vxid, + TransactionId xid, bool wait) +{ + bool more = false; + + /* There is no point to wait for 2PCs if you have no 2PCs. */ + if (max_prepared_xacts == 0) + return true; + + do + { + LockAcquireResult lar; + LOCKTAG tag; + + /* Clear state from previous iterations. */ + if (more) + { + xid = InvalidTransactionId; + more = false; + } + + /* If we have no xid, try to find one. */ + if (!TransactionIdIsValid(xid)) + xid = TwoPhaseGetXidByVirtualXID(vxid, &more); + if (!TransactionIdIsValid(xid)) + { + Assert(!more); + return true; + } + + /* Check or wait for XID completion. */ + SET_LOCKTAG_TRANSACTION(tag, xid); + lar = LockAcquire(&tag, ShareLock, false, !wait); + if (lar == LOCKACQUIRE_NOT_AVAIL) + return false; + LockRelease(&tag, ShareLock, false); + } while (more); + + return true; +} + +/* + * VirtualXactLock + * + * If wait = true, wait as long as the given VXID or any XID acquired by the + * same transaction is still running. Then, return true. + * + * If wait = false, just check whether that VXID or one of those XIDs is still + * running, and return true or false. + */ +bool +VirtualXactLock(VirtualTransactionId vxid, bool wait) +{ + LOCKTAG tag; + PGPROC *proc; + TransactionId xid = InvalidTransactionId; + + Assert(VirtualTransactionIdIsValid(vxid)); + + if (VirtualTransactionIdIsRecoveredPreparedXact(vxid)) + /* no vxid lock; localTransactionId is a normal, locked XID */ + return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait); + + SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid); + + /* + * If a lock table entry must be made, this is the PGPROC on whose behalf + * it must be done. Note that the transaction might end or the PGPROC + * might be reassigned to a new backend before we get around to examining + * it, but it doesn't matter. If we find upon examination that the + * relevant lxid is no longer running here, that's enough to prove that + * it's no longer running anywhere. + */ + proc = BackendIdGetProc(vxid.backendId); + if (proc == NULL) + return XactLockForVirtualXact(vxid, InvalidTransactionId, wait); + + /* + * We must acquire this lock before checking the backendId and lxid + * against the ones we're waiting for. The target backend will only set + * or clear lxid while holding this lock. + */ + LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); + + if (proc->backendId != vxid.backendId + || proc->fpLocalTransactionId != vxid.localTransactionId) + { + /* VXID ended */ + LWLockRelease(&proc->fpInfoLock); + return XactLockForVirtualXact(vxid, InvalidTransactionId, wait); + } + + /* + * If we aren't asked to wait, there's no need to set up a lock table + * entry. The transaction is still in progress, so just return false. + */ + if (!wait) + { + LWLockRelease(&proc->fpInfoLock); + return false; + } + + /* + * OK, we're going to need to sleep on the VXID. But first, we must set + * up the primary lock table entry, if needed (ie, convert the proc's + * fast-path lock on its VXID to a regular lock). + */ + if (proc->fpVXIDLock) + { + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; + + hashcode = LockTagHashCode(&tag); + + partitionLock = LockHashPartitionLock(hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc, + &tag, hashcode, ExclusiveLock); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&proc->fpInfoLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_locks_per_transaction."))); + } + GrantLock(proclock->tag.myLock, proclock, ExclusiveLock); + + LWLockRelease(partitionLock); + + proc->fpVXIDLock = false; + } + + /* + * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID() + * search. The proc might have assigned this XID but not yet locked it, + * in which case the proc will lock this XID before releasing the VXID. + * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(), + * so we won't save an XID of a different VXID. It doesn't matter whether + * we save this before or after setting up the primary lock table entry. + */ + xid = proc->xid; + + /* Done with proc->fpLockBits */ + LWLockRelease(&proc->fpInfoLock); + + /* Time to wait. */ + (void) LockAcquire(&tag, ShareLock, false, false); + + LockRelease(&tag, ShareLock, false); + return XactLockForVirtualXact(vxid, xid, wait); +} + +/* + * LockWaiterCount + * + * Find the number of lock requester on this locktag + */ +int +LockWaiterCount(const LOCKTAG *locktag) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCK *lock; + bool found; + uint32 hashcode; + LWLock *partitionLock; + int waiters = 0; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + (const void *) locktag, + hashcode, + HASH_FIND, + &found); + if (found) + { + Assert(lock != NULL); + waiters = lock->nRequested; + } + LWLockRelease(partitionLock); + + return waiters; +} diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c new file mode 100644 index 0000000..07eb6f6 --- /dev/null +++ b/src/backend/storage/lmgr/lwlock.c @@ -0,0 +1,1977 @@ +/*------------------------------------------------------------------------- + * + * lwlock.c + * Lightweight lock manager + * + * Lightweight locks are intended primarily to provide mutual exclusion of + * access to shared-memory data structures. Therefore, they offer both + * exclusive and shared lock modes (to support read/write and read-only + * access to a shared object). There are few other frammishes. User-level + * locking should be done with the full lock manager --- which depends on + * LWLocks to protect its shared state. + * + * In addition to exclusive and shared modes, lightweight locks can be used to + * wait until a variable changes value. The variable is initially not set + * when the lock is acquired with LWLockAcquire, i.e. it remains set to the + * value it was set to when the lock was released last, and can be updated + * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar + * waits for the variable to be updated, or until the lock is free. When + * releasing the lock with LWLockReleaseClearVar() the value can be set to an + * appropriate value for a free lock. The meaning of the variable is up to + * the caller, the lightweight lock code just assigns and compares it. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/lmgr/lwlock.c + * + * NOTES: + * + * This used to be a pretty straight forward reader-writer lock + * implementation, in which the internal state was protected by a + * spinlock. Unfortunately the overhead of taking the spinlock proved to be + * too high for workloads/locks that were taken in shared mode very + * frequently. Often we were spinning in the (obviously exclusive) spinlock, + * while trying to acquire a shared lock that was actually free. + * + * Thus a new implementation was devised that provides wait-free shared lock + * acquisition for locks that aren't exclusively locked. + * + * The basic idea is to have a single atomic variable 'lockcount' instead of + * the formerly separate shared and exclusive counters and to use atomic + * operations to acquire the lock. That's fairly easy to do for plain + * rw-spinlocks, but a lot harder for something like LWLocks that want to wait + * in the OS. + * + * For lock acquisition we use an atomic compare-and-exchange on the lockcount + * variable. For exclusive lock we swap in a sentinel value + * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders. + * + * To release the lock we use an atomic decrement to release the lock. If the + * new value is zero (we get that atomically), we know we can/have to release + * waiters. + * + * Obviously it is important that the sentinel value for exclusive locks + * doesn't conflict with the maximum number of possible share lockers - + * luckily MAX_BACKENDS makes that easily possible. + * + * + * The attentive reader might have noticed that naively doing the above has a + * glaring race condition: We try to lock using the atomic operations and + * notice that we have to wait. Unfortunately by the time we have finished + * queuing, the former locker very well might have already finished it's + * work. That's problematic because we're now stuck waiting inside the OS. + + * To mitigate those races we use a two phased attempt at locking: + * Phase 1: Try to do it atomically, if we succeed, nice + * Phase 2: Add ourselves to the waitqueue of the lock + * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from + * the queue + * Phase 4: Sleep till wake-up, goto Phase 1 + * + * This protects us against the problem from above as nobody can release too + * quick, before we're queued, since after Phase 2 we're already queued. + * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "postmaster/postmaster.h" +#include "replication/slot.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +#ifdef LWLOCK_STATS +#include "utils/hsearch.h" +#endif + + +/* We use the ShmemLock spinlock to protect LWLockCounter */ +extern slock_t *ShmemLock; + +#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30) +#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29) +#define LW_FLAG_LOCKED ((uint32) 1 << 28) + +#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24) +#define LW_VAL_SHARED 1 + +#define LW_LOCK_MASK ((uint32) ((1 << 25)-1)) +/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */ +#define LW_SHARED_MASK ((uint32) ((1 << 24)-1)) + +/* + * There are three sorts of LWLock "tranches": + * + * 1. The individually-named locks defined in lwlocknames.h each have their + * own tranche. The names of these tranches appear in IndividualLWLockNames[] + * in lwlocknames.c. + * + * 2. There are some predefined tranches for built-in groups of locks. + * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names + * appear in BuiltinTrancheNames[] below. + * + * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche + * or LWLockRegisterTranche. The names of these that are known in the current + * process appear in LWLockTrancheNames[]. + * + * All these names are user-visible as wait event names, so choose with care + * ... and do not forget to update the documentation's list of wait events. + */ +extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */ + +static const char *const BuiltinTrancheNames[] = { + /* LWTRANCHE_XACT_BUFFER: */ + "XactBuffer", + /* LWTRANCHE_COMMITTS_BUFFER: */ + "CommitTSBuffer", + /* LWTRANCHE_SUBTRANS_BUFFER: */ + "SubtransBuffer", + /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */ + "MultiXactOffsetBuffer", + /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */ + "MultiXactMemberBuffer", + /* LWTRANCHE_NOTIFY_BUFFER: */ + "NotifyBuffer", + /* LWTRANCHE_SERIAL_BUFFER: */ + "SerialBuffer", + /* LWTRANCHE_WAL_INSERT: */ + "WALInsert", + /* LWTRANCHE_BUFFER_CONTENT: */ + "BufferContent", + /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */ + "ReplicationOriginState", + /* LWTRANCHE_REPLICATION_SLOT_IO: */ + "ReplicationSlotIO", + /* LWTRANCHE_LOCK_FASTPATH: */ + "LockFastPath", + /* LWTRANCHE_BUFFER_MAPPING: */ + "BufferMapping", + /* LWTRANCHE_LOCK_MANAGER: */ + "LockManager", + /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */ + "PredicateLockManager", + /* LWTRANCHE_PARALLEL_HASH_JOIN: */ + "ParallelHashJoin", + /* LWTRANCHE_PARALLEL_QUERY_DSA: */ + "ParallelQueryDSA", + /* LWTRANCHE_PER_SESSION_DSA: */ + "PerSessionDSA", + /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */ + "PerSessionRecordType", + /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */ + "PerSessionRecordTypmod", + /* LWTRANCHE_SHARED_TUPLESTORE: */ + "SharedTupleStore", + /* LWTRANCHE_SHARED_TIDBITMAP: */ + "SharedTidBitmap", + /* LWTRANCHE_PARALLEL_APPEND: */ + "ParallelAppend", + /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ + "PerXactPredicateList" +}; + +StaticAssertDecl(lengthof(BuiltinTrancheNames) == + LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS, + "missing entries in BuiltinTrancheNames[]"); + +/* + * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and + * stores the names of all dynamically-created tranches known to the current + * process. Any unused entries in the array will contain NULL. + */ +static const char **LWLockTrancheNames = NULL; +static int LWLockTrancheNamesAllocated = 0; + +/* + * This points to the main array of LWLocks in shared memory. Backends inherit + * the pointer by fork from the postmaster (except in the EXEC_BACKEND case, + * where we have special measures to pass it down). + */ +LWLockPadded *MainLWLockArray = NULL; + +/* + * We use this structure to keep track of locked LWLocks for release + * during error recovery. Normally, only a few will be held at once, but + * occasionally the number can be much higher; for example, the pg_buffercache + * extension locks all buffer partitions simultaneously. + */ +#define MAX_SIMUL_LWLOCKS 200 + +/* struct representing the LWLocks we're holding */ +typedef struct LWLockHandle +{ + LWLock *lock; + LWLockMode mode; +} LWLockHandle; + +static int num_held_lwlocks = 0; +static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS]; + +/* struct representing the LWLock tranche request for named tranche */ +typedef struct NamedLWLockTrancheRequest +{ + char tranche_name[NAMEDATALEN]; + int num_lwlocks; +} NamedLWLockTrancheRequest; + +static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; +static int NamedLWLockTrancheRequestsAllocated = 0; + +/* + * NamedLWLockTrancheRequests is both the valid length of the request array, + * and the length of the shared-memory NamedLWLockTrancheArray later on. + * This variable and NamedLWLockTrancheArray are non-static so that + * postmaster.c can copy them to child processes in EXEC_BACKEND builds. + */ +int NamedLWLockTrancheRequests = 0; + +/* points to data in shared memory: */ +NamedLWLockTranche *NamedLWLockTrancheArray = NULL; + +static bool lock_named_request_allowed = true; + +static void InitializeLWLocks(void); +static inline void LWLockReportWaitStart(LWLock *lock); +static inline void LWLockReportWaitEnd(void); +static const char *GetLWTrancheName(uint16 trancheId); + +#define T_NAME(lock) \ + GetLWTrancheName((lock)->tranche) + +#ifdef LWLOCK_STATS +typedef struct lwlock_stats_key +{ + int tranche; + void *instance; +} lwlock_stats_key; + +typedef struct lwlock_stats +{ + lwlock_stats_key key; + int sh_acquire_count; + int ex_acquire_count; + int block_count; + int dequeue_self_count; + int spin_delay_count; +} lwlock_stats; + +static HTAB *lwlock_stats_htab; +static lwlock_stats lwlock_stats_dummy; +#endif + +#ifdef LOCK_DEBUG +bool Trace_lwlocks = false; + +inline static void +PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + uint32 state = pg_atomic_read_u32(&lock->state); + + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d", + MyProcPid, + where, T_NAME(lock), lock, + (state & LW_VAL_EXCLUSIVE) != 0, + state & LW_SHARED_MASK, + (state & LW_FLAG_HAS_WAITERS) != 0, + pg_atomic_read_u32(&lock->nwaiters), + (state & LW_FLAG_RELEASE_OK) != 0))); + } +} + +inline static void +LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%s(%s %p): %s", where, + T_NAME(lock), lock, msg))); + } +} + +#else /* not LOCK_DEBUG */ +#define PRINT_LWDEBUG(a,b,c) ((void)0) +#define LOG_LWDEBUG(a,b,c) ((void)0) +#endif /* LOCK_DEBUG */ + +#ifdef LWLOCK_STATS + +static void init_lwlock_stats(void); +static void print_lwlock_stats(int code, Datum arg); +static lwlock_stats * get_lwlock_stats_entry(LWLock *lock); + +static void +init_lwlock_stats(void) +{ + HASHCTL ctl; + static MemoryContext lwlock_stats_cxt = NULL; + static bool exit_registered = false; + + if (lwlock_stats_cxt != NULL) + MemoryContextDelete(lwlock_stats_cxt); + + /* + * The LWLock stats will be updated within a critical section, which + * requires allocating new hash entries. Allocations within a critical + * section are normally not allowed because running out of memory would + * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally + * turned on in production, so that's an acceptable risk. The hash entries + * are small, so the risk of running out of memory is minimal in practice. + */ + lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, + "LWLock stats", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); + + ctl.keysize = sizeof(lwlock_stats_key); + ctl.entrysize = sizeof(lwlock_stats); + ctl.hcxt = lwlock_stats_cxt; + lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + if (!exit_registered) + { + on_shmem_exit(print_lwlock_stats, 0); + exit_registered = true; + } +} + +static void +print_lwlock_stats(int code, Datum arg) +{ + HASH_SEQ_STATUS scan; + lwlock_stats *lwstats; + + hash_seq_init(&scan, lwlock_stats_htab); + + /* Grab an LWLock to keep different backends from mixing reports */ + LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE); + + while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL) + { + fprintf(stderr, + "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n", + MyProcPid, GetLWTrancheName(lwstats->key.tranche), + lwstats->key.instance, lwstats->sh_acquire_count, + lwstats->ex_acquire_count, lwstats->block_count, + lwstats->spin_delay_count, lwstats->dequeue_self_count); + } + + LWLockRelease(&MainLWLockArray[0].lock); +} + +static lwlock_stats * +get_lwlock_stats_entry(LWLock *lock) +{ + lwlock_stats_key key; + lwlock_stats *lwstats; + bool found; + + /* + * During shared memory initialization, the hash table doesn't exist yet. + * Stats of that phase aren't very interesting, so just collect operations + * on all locks in a single dummy entry. + */ + if (lwlock_stats_htab == NULL) + return &lwlock_stats_dummy; + + /* Fetch or create the entry. */ + MemSet(&key, 0, sizeof(key)); + key.tranche = lock->tranche; + key.instance = lock; + lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found); + if (!found) + { + lwstats->sh_acquire_count = 0; + lwstats->ex_acquire_count = 0; + lwstats->block_count = 0; + lwstats->dequeue_self_count = 0; + lwstats->spin_delay_count = 0; + } + return lwstats; +} +#endif /* LWLOCK_STATS */ + + +/* + * Compute number of LWLocks required by named tranches. These will be + * allocated in the main array. + */ +static int +NumLWLocksForNamedTranches(void) +{ + int numLocks = 0; + int i; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks; + + return numLocks; +} + +/* + * Compute shmem space needed for LWLocks and named tranches. + */ +Size +LWLockShmemSize(void) +{ + Size size; + int i; + int numLocks = NUM_FIXED_LWLOCKS; + + /* Calculate total number of locks needed in the main array. */ + numLocks += NumLWLocksForNamedTranches(); + + /* Space for the LWLock array. */ + size = mul_size(numLocks, sizeof(LWLockPadded)); + + /* Space for dynamic allocation counter, plus room for alignment. */ + size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); + + /* space for named tranches. */ + size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); + + /* space for name of each tranche. */ + for (i = 0; i < NamedLWLockTrancheRequests; i++) + size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); + + /* Disallow adding any more named tranches. */ + lock_named_request_allowed = false; + + return size; +} + +/* + * Allocate shmem space for the main LWLock array and all tranches and + * initialize it. We also register extension LWLock tranches here. + */ +void +CreateLWLocks(void) +{ + StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS, + "MAX_BACKENDS too big for lwlock.c"); + + StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE, + "Miscalculated LWLock padding"); + + if (!IsUnderPostmaster) + { + Size spaceLocks = LWLockShmemSize(); + int *LWLockCounter; + char *ptr; + + /* Allocate space */ + ptr = (char *) ShmemAlloc(spaceLocks); + + /* Leave room for dynamic allocation of tranches */ + ptr += sizeof(int); + + /* Ensure desired alignment of LWLock array */ + ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; + + MainLWLockArray = (LWLockPadded *) ptr; + + /* + * Initialize the dynamic-allocation counter for tranches, which is + * stored just before the first LWLock. + */ + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + + /* Initialize all LWLocks */ + InitializeLWLocks(); + } + + /* Register named extension LWLock tranches in the current process. */ + for (int i = 0; i < NamedLWLockTrancheRequests; i++) + LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, + NamedLWLockTrancheArray[i].trancheName); +} + +/* + * Initialize LWLocks that are fixed and those belonging to named tranches. + */ +static void +InitializeLWLocks(void) +{ + int numNamedLocks = NumLWLocksForNamedTranches(); + int id; + int i; + int j; + LWLockPadded *lock; + + /* Initialize all individual LWLocks in main array */ + for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++) + LWLockInitialize(&lock->lock, id); + + /* Initialize buffer mapping LWLocks in main array */ + lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET; + for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING); + + /* Initialize lmgrs' LWLocks in main array */ + lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER); + + /* Initialize predicate lmgrs' LWLocks in main array */ + lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + + /* + * Copy the info about any named tranches into shared memory (so that + * other processes can see it), and initialize the requested LWLocks. + */ + if (NamedLWLockTrancheRequests > 0) + { + char *trancheNames; + + NamedLWLockTrancheArray = (NamedLWLockTranche *) + &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; + + trancheNames = (char *) NamedLWLockTrancheArray + + (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); + lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + NamedLWLockTrancheRequest *request; + NamedLWLockTranche *tranche; + char *name; + + request = &NamedLWLockTrancheRequestArray[i]; + tranche = &NamedLWLockTrancheArray[i]; + + name = trancheNames; + trancheNames += strlen(request->tranche_name) + 1; + strcpy(name, request->tranche_name); + tranche->trancheId = LWLockNewTrancheId(); + tranche->trancheName = name; + + for (j = 0; j < request->num_lwlocks; j++, lock++) + LWLockInitialize(&lock->lock, tranche->trancheId); + } + } +} + +/* + * InitLWLockAccess - initialize backend-local state needed to hold LWLocks + */ +void +InitLWLockAccess(void) +{ +#ifdef LWLOCK_STATS + init_lwlock_stats(); +#endif +} + +/* + * GetNamedLWLockTranche - returns the base address of LWLock from the + * specified tranche. + * + * Caller needs to retrieve the requested number of LWLocks starting from + * the base lock address returned by this API. This can be used for + * tranches that are requested by using RequestNamedLWLockTranche() API. + */ +LWLockPadded * +GetNamedLWLockTranche(const char *tranche_name) +{ + int lock_pos; + int i; + + /* + * Obtain the position of base address of LWLock belonging to requested + * tranche_name in MainLWLockArray. LWLocks for named tranches are placed + * in MainLWLockArray after fixed locks. + */ + lock_pos = NUM_FIXED_LWLOCKS; + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name, + tranche_name) == 0) + return &MainLWLockArray[lock_pos]; + + lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks; + } + + elog(ERROR, "requested tranche is not registered"); + + /* just to keep compiler quiet */ + return NULL; +} + +/* + * Allocate a new tranche ID. + */ +int +LWLockNewTrancheId(void) +{ + int result; + int *LWLockCounter; + + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + SpinLockAcquire(ShmemLock); + result = (*LWLockCounter)++; + SpinLockRelease(ShmemLock); + + return result; +} + +/* + * Register a dynamic tranche name in the lookup table of the current process. + * + * This routine will save a pointer to the tranche name passed as an argument, + * so the name should be allocated in a backend-lifetime context + * (shared memory, TopMemoryContext, static constant, or similar). + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +LWLockRegisterTranche(int tranche_id, const char *tranche_name) +{ + /* This should only be called for user-defined tranches. */ + if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) + return; + + /* Convert to array index. */ + tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; + + /* If necessary, create or enlarge array. */ + if (tranche_id >= LWLockTrancheNamesAllocated) + { + int newalloc; + + newalloc = Max(LWLockTrancheNamesAllocated, 8); + while (newalloc <= tranche_id) + newalloc *= 2; + + if (LWLockTrancheNames == NULL) + LWLockTrancheNames = (const char **) + MemoryContextAllocZero(TopMemoryContext, + newalloc * sizeof(char *)); + else + { + LWLockTrancheNames = (const char **) + repalloc(LWLockTrancheNames, newalloc * sizeof(char *)); + memset(LWLockTrancheNames + LWLockTrancheNamesAllocated, + 0, + (newalloc - LWLockTrancheNamesAllocated) * sizeof(char *)); + } + LWLockTrancheNamesAllocated = newalloc; + } + + LWLockTrancheNames[tranche_id] = tranche_name; +} + +/* + * RequestNamedLWLockTranche + * Request that extra LWLocks be allocated during postmaster + * startup. + * + * This is only useful for extensions if called from the _PG_init hook + * of a library that is loaded into the postmaster via + * shared_preload_libraries. Once shared memory has been allocated, calls + * will be ignored. (We could raise an error, but it seems better to make + * it a no-op, so that libraries containing such calls can be reloaded if + * needed.) + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) +{ + NamedLWLockTrancheRequest *request; + + if (IsUnderPostmaster || !lock_named_request_allowed) + return; /* too late */ + + if (NamedLWLockTrancheRequestArray == NULL) + { + NamedLWLockTrancheRequestsAllocated = 16; + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + MemoryContextAlloc(TopMemoryContext, + NamedLWLockTrancheRequestsAllocated + * sizeof(NamedLWLockTrancheRequest)); + } + + if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) + { + int i = NamedLWLockTrancheRequestsAllocated; + + while (i <= NamedLWLockTrancheRequests) + i *= 2; + + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + repalloc(NamedLWLockTrancheRequestArray, + i * sizeof(NamedLWLockTrancheRequest)); + NamedLWLockTrancheRequestsAllocated = i; + } + + request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; + Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); + strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); + request->num_lwlocks = num_lwlocks; + NamedLWLockTrancheRequests++; +} + +/* + * LWLockInitialize - initialize a new lwlock; it's initially unlocked + */ +void +LWLockInitialize(LWLock *lock, int tranche_id) +{ + pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); +#ifdef LOCK_DEBUG + pg_atomic_init_u32(&lock->nwaiters, 0); +#endif + lock->tranche = tranche_id; + proclist_init(&lock->waiters); +} + +/* + * Report start of wait event for light-weight locks. + * + * This function will be used by all the light-weight lock calls which + * needs to wait to acquire the lock. This function distinguishes wait + * event based on tranche and lock id. + */ +static inline void +LWLockReportWaitStart(LWLock *lock) +{ + pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche); +} + +/* + * Report end of wait event for light-weight locks. + */ +static inline void +LWLockReportWaitEnd(void) +{ + pgstat_report_wait_end(); +} + +/* + * Return the name of an LWLock tranche. + */ +static const char * +GetLWTrancheName(uint16 trancheId) +{ + /* Individual LWLock? */ + if (trancheId < NUM_INDIVIDUAL_LWLOCKS) + return IndividualLWLockNames[trancheId]; + + /* Built-in tranche? */ + if (trancheId < LWTRANCHE_FIRST_USER_DEFINED) + return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS]; + + /* + * It's an extension tranche, so look in LWLockTrancheNames[]. However, + * it's possible that the tranche has never been registered in the current + * process, in which case give up and return "extension". + */ + trancheId -= LWTRANCHE_FIRST_USER_DEFINED; + + if (trancheId >= LWLockTrancheNamesAllocated || + LWLockTrancheNames[trancheId] == NULL) + return "extension"; + + return LWLockTrancheNames[trancheId]; +} + +/* + * Return an identifier for an LWLock based on the wait class and event. + */ +const char * +GetLWLockIdentifier(uint32 classId, uint16 eventId) +{ + Assert(classId == PG_WAIT_LWLOCK); + /* The event IDs are just tranche numbers. */ + return GetLWTrancheName(eventId); +} + +/* + * Internal function that tries to atomically acquire the lwlock in the passed + * in mode. + * + * This function will not block waiting for a lock to become free - that's the + * callers job. + * + * Returns true if the lock isn't free and we need to wait. + */ +static bool +LWLockAttemptLock(LWLock *lock, LWLockMode mode) +{ + uint32 old_state; + + AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED); + + /* + * Read once outside the loop, later iterations will get the newer value + * via compare & exchange. + */ + old_state = pg_atomic_read_u32(&lock->state); + + /* loop until we've determined whether we could acquire the lock or not */ + while (true) + { + uint32 desired_state; + bool lock_free; + + desired_state = old_state; + + if (mode == LW_EXCLUSIVE) + { + lock_free = (old_state & LW_LOCK_MASK) == 0; + if (lock_free) + desired_state += LW_VAL_EXCLUSIVE; + } + else + { + lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0; + if (lock_free) + desired_state += LW_VAL_SHARED; + } + + /* + * Attempt to swap in the state we are expecting. If we didn't see + * lock to be free, that's just the old value. If we saw it as free, + * we'll attempt to mark it acquired. The reason that we always swap + * in the value is that this doubles as a memory barrier. We could try + * to be smarter and only swap in values if we saw the lock as free, + * but benchmark haven't shown it as beneficial so far. + * + * Retry if the value changed since we last looked at it. + */ + if (pg_atomic_compare_exchange_u32(&lock->state, + &old_state, desired_state)) + { + if (lock_free) + { + /* Great! Got the lock. */ +#ifdef LOCK_DEBUG + if (mode == LW_EXCLUSIVE) + lock->owner = MyProc; +#endif + return false; + } + else + return true; /* somebody else has the lock */ + } + } + pg_unreachable(); +} + +/* + * Lock the LWLock's wait list against concurrent activity. + * + * NB: even though the wait list is locked, non-conflicting lock operations + * may still happen concurrently. + * + * Time spent holding mutex should be short! + */ +static void +LWLockWaitListLock(LWLock *lock) +{ + uint32 old_state; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + uint32 delays = 0; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + while (true) + { + /* always try once to acquire lock directly */ + old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); + if (!(old_state & LW_FLAG_LOCKED)) + break; /* got lock */ + + /* and then spin without atomic operations until lock is released */ + { + SpinDelayStatus delayStatus; + + init_local_spin_delay(&delayStatus); + + while (old_state & LW_FLAG_LOCKED) + { + perform_spin_delay(&delayStatus); + old_state = pg_atomic_read_u32(&lock->state); + } +#ifdef LWLOCK_STATS + delays += delayStatus.delays; +#endif + finish_spin_delay(&delayStatus); + } + + /* + * Retry. The lock might obviously already be re-acquired by the time + * we're attempting to get it again. + */ + } + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += delays; +#endif +} + +/* + * Unlock the LWLock's wait list. + * + * Note that it can be more efficient to manipulate flags and release the + * locks in a single atomic operation. + */ +static void +LWLockWaitListUnlock(LWLock *lock) +{ + uint32 old_state PG_USED_FOR_ASSERTS_ONLY; + + old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED); + + Assert(old_state & LW_FLAG_LOCKED); +} + +/* + * Wakeup all the lockers that currently have a chance to acquire the lock. + */ +static void +LWLockWakeup(LWLock *lock) +{ + bool new_release_ok; + bool wokeup_somebody = false; + proclist_head wakeup; + proclist_mutable_iter iter; + + proclist_init(&wakeup); + + new_release_ok = true; + + /* lock wait list while collecting backends to wake up */ + LWLockWaitListLock(lock); + + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) + continue; + + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + proclist_push_tail(&wakeup, iter.cur, lwWaitLink); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + { + /* + * Prevent additional wakeups until retryer gets to run. Backends + * that are just waiting for the lock to become free don't retry + * automatically. + */ + new_release_ok = false; + + /* + * Don't wakeup (further) exclusive locks. + */ + wokeup_somebody = true; + } + + /* + * Once we've woken up an exclusive lock, there's no point in waking + * up anybody else. + */ + if (waiter->lwWaitMode == LW_EXCLUSIVE) + break; + } + + Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS); + + /* unset required flags, and release lock, in one fell swoop */ + { + uint32 old_state; + uint32 desired_state; + + old_state = pg_atomic_read_u32(&lock->state); + while (true) + { + desired_state = old_state; + + /* compute desired flags */ + + if (new_release_ok) + desired_state |= LW_FLAG_RELEASE_OK; + else + desired_state &= ~LW_FLAG_RELEASE_OK; + + if (proclist_is_empty(&wakeup)) + desired_state &= ~LW_FLAG_HAS_WAITERS; + + desired_state &= ~LW_FLAG_LOCKED; /* release lock */ + + if (pg_atomic_compare_exchange_u32(&lock->state, &old_state, + desired_state)) + break; + } + } + + /* Awaken any waiters I removed from the queue. */ + proclist_foreach_modify(iter, &wakeup, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + LOG_LWDEBUG("LWLockRelease", lock, "release waiter"); + proclist_delete(&wakeup, iter.cur, lwWaitLink); + + /* + * Guarantee that lwWaiting being unset only becomes visible once the + * unlink from the link has completed. Otherwise the target backend + * could be woken up for other reason and enqueue for a new lock - if + * that happens before the list unlink happens, the list would end up + * being corrupted. + * + * The barrier pairs with the LWLockWaitListLock() when enqueuing for + * another lock. + */ + pg_write_barrier(); + waiter->lwWaiting = false; + PGSemaphoreUnlock(waiter->sem); + } +} + +/* + * Add ourselves to the end of the queue. + * + * NB: Mode can be LW_WAIT_UNTIL_FREE here! + */ +static void +LWLockQueueSelf(LWLock *lock, LWLockMode mode) +{ + /* + * If we don't have a PGPROC structure, there's no way to wait. This + * should never occur, since MyProc should only be null during shared + * memory initialization. + */ + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + if (MyProc->lwWaiting) + elog(PANIC, "queueing for lock while waiting on another one"); + + LWLockWaitListLock(lock); + + /* setting the flag is protected by the spinlock */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS); + + MyProc->lwWaiting = true; + MyProc->lwWaitMode = mode; + + /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ + if (mode == LW_WAIT_UNTIL_FREE) + proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink); + else + proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink); + + /* Can release the mutex now */ + LWLockWaitListUnlock(lock); + +#ifdef LOCK_DEBUG + pg_atomic_fetch_add_u32(&lock->nwaiters, 1); +#endif + +} + +/* + * Remove ourselves from the waitlist. + * + * This is used if we queued ourselves because we thought we needed to sleep + * but, after further checking, we discovered that we don't actually need to + * do so. + */ +static void +LWLockDequeueSelf(LWLock *lock) +{ + bool found = false; + proclist_mutable_iter iter; + +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); + + lwstats->dequeue_self_count++; +#endif + + LWLockWaitListLock(lock); + + /* + * Can't just remove ourselves from the list, but we need to iterate over + * all entries as somebody else could have dequeued us. + */ + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + if (iter.cur == MyProc->pgprocno) + { + found = true; + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + break; + } + } + + if (proclist_is_empty(&lock->waiters) && + (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0) + { + pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS); + } + + /* XXX: combine with fetch_and above? */ + LWLockWaitListUnlock(lock); + + /* clear waiting state again, nice for debugging */ + if (found) + MyProc->lwWaiting = false; + else + { + int extraWaits = 0; + + /* + * Somebody else dequeued us and has or will wake us up. Deal with the + * superfluous absorption of a wakeup. + */ + + /* + * Reset RELEASE_OK flag if somebody woke us before we removed + * ourselves - they'll have set it to false. + */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + + /* + * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would + * get reset at some inconvenient point later. Most of the time this + * will immediately return. + */ + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (!MyProc->lwWaiting) + break; + extraWaits++; + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif +} + +/* + * LWLockAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, sleep until it is. Returns true if the lock + * was available immediately, false if we had to sleep. + * + * Side effect: cancel/die interrupts are held off until lock release. + */ +bool +LWLockAcquire(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool result = true; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquire", lock, mode); + +#ifdef LWLOCK_STATS + /* Count lock acquisition attempts */ + if (mode == LW_EXCLUSIVE) + lwstats->ex_acquire_count++; + else + lwstats->sh_acquire_count++; +#endif /* LWLOCK_STATS */ + + /* + * We can't wait if we haven't got a PGPROC. This should only occur + * during bootstrap or shared memory initialization. Put an Assert here + * to catch unsafe coding practices. + */ + Assert(!(proc == NULL && IsUnderPostmaster)); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * Loop here to try to acquire lock after each time we are signaled by + * LWLockRelease. + * + * NOTE: it might seem better to have LWLockRelease actually grant us the + * lock, rather than retrying and possibly having to go back to sleep. But + * in practice that is no good because it means a process swap for every + * lock acquisition when two or more processes are contending for the same + * lock. Since LWLocks are normally used to protect not-very-long + * sections of computation, a process needs to be able to acquire and + * release the same lock many times during a single CPU time slice, even + * in the presence of contention. The efficiency of being able to do that + * outweighs the inefficiency of sometimes wasting a process dispatch + * cycle because the lock is not free when a released waiter finally gets + * to run. See pgsql-hackers archives for 29-Dec-01. + */ + for (;;) + { + bool mustwait; + + /* + * Try to grab the lock the first time, we're not in the waitqueue + * yet/anymore. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock"); + break; /* got the lock */ + } + + /* + * Ok, at this point we couldn't grab the lock on the first try. We + * cannot simply queue ourselves to the end of the list and wait to be + * woken up because by now the lock could long have been released. + * Instead add us to the queue and try to grab the lock again. If we + * succeed we need to revert the queuing and be happy, otherwise we + * recheck the lock. If we still couldn't grab it, we know that the + * other locker will see our queue entries when releasing since they + * existed before we checked for the lock. + */ + + /* add to the queue */ + LWLockQueueSelf(lock, mode); + + /* we're now guaranteed to be woken up if necessary */ + mustwait = LWLockAttemptLock(lock, mode); + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. + */ + LOG_LWDEBUG("LWLockAcquire", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + + /* Retrying, allow LWLockRelease to release waiters again. */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquire", lock, "awakened"); + + /* Now loop back and try to acquire lock again. */ + result = false; + } + + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode); + + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + return result; +} + +/* + * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, return false with no side-effects. + * + * If successful, cancel/die interrupts are held off until lock release. + */ +bool +LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) +{ + bool mustwait; + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* Check for the lock */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* Failed to get lock, so release interrupt holdoff */ + RESUME_INTERRUPTS(); + + LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed"); + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode); + } + else + { + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode); + } + return !mustwait; +} + +/* + * LWLockAcquireOrWait - Acquire lock, or wait until it's free + * + * The semantics of this function are a bit funky. If the lock is currently + * free, it is acquired in the given mode, and the function returns true. If + * the lock isn't immediately free, the function waits until it is released + * and returns false, but does not acquire the lock. + * + * This is currently used for WALWriteLock: when a backend flushes the WAL, + * holding WALWriteLock, it can flush the commit records of many other + * backends as a side-effect. Those other backends need to wait until the + * flush finishes, but don't need to acquire the lock anymore. They can just + * wake up, observe that their records have already been flushed, and return. + */ +bool +LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool mustwait; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* + * Wait until awakened. Like in LWLockAcquire, be prepared for + * bogus wakeups. + */ + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened"); + } + else + { + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue"); + + /* + * Got lock in the second attempt, undo queueing. We need to treat + * this as having successfully acquired the lock, otherwise we'd + * not necessarily wake up people we've prevented from acquiring + * the lock. + */ + LWLockDequeueSelf(lock); + } + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + if (mustwait) + { + /* Failed to get lock, so release interrupt holdoff */ + RESUME_INTERRUPTS(); + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed"); + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode); + } + else + { + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded"); + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode); + } + + return !mustwait; +} + +/* + * Does the lwlock in its current state need to wait for the variable value to + * change? + * + * If we don't need to wait, and it's because the value of the variable has + * changed, store the current value in newval. + * + * *result is set to true if the lock was free, and false otherwise. + */ +static bool +LWLockConflictsWithVar(LWLock *lock, + uint64 *valptr, uint64 oldval, uint64 *newval, + bool *result) +{ + bool mustwait; + uint64 value; + + /* + * Test first to see if it the slot is free right now. + * + * XXX: the caller uses a spinlock before this, so we don't need a memory + * barrier here as far as the current usage is concerned. But that might + * not be safe in general. + */ + mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0; + + if (!mustwait) + { + *result = true; + return false; + } + + *result = false; + + /* + * Read value using the lwlock's wait list lock, as we can't generally + * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to + * do atomic 64 bit reads/writes the spinlock should be optimized away. + */ + LWLockWaitListLock(lock); + value = *valptr; + LWLockWaitListUnlock(lock); + + if (value != oldval) + { + mustwait = false; + *newval = value; + } + else + { + mustwait = true; + } + + return mustwait; +} + +/* + * LWLockWaitForVar - Wait until lock is free, or a variable is updated. + * + * If the lock is held and *valptr equals oldval, waits until the lock is + * either freed, or the lock holder updates *valptr by calling + * LWLockUpdateVar. If the lock is free on exit (immediately or after + * waiting), returns true. If the lock is still held, but *valptr no longer + * matches oldval, returns false and sets *newval to the current value in + * *valptr. + * + * Note: this function ignores shared lock holders; if the lock is held + * in shared mode, returns 'true'. + */ +bool +LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) +{ + PGPROC *proc = MyProc; + int extraWaits = 0; + bool result = false; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE); + + /* + * Lock out cancel/die interrupts while we sleep on the lock. There is no + * cleanup mechanism to remove us from the wait queue if we got + * interrupted. + */ + HOLD_INTERRUPTS(); + + /* + * Loop here to check the lock's status after each time we are signaled. + */ + for (;;) + { + bool mustwait; + + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + if (!mustwait) + break; /* the lock was free or value didn't match */ + + /* + * Add myself to wait queue. Note that this is racy, somebody else + * could wakeup before we're finished queuing. NB: We're using nearly + * the same twice-in-a-row lock acquisition protocol as + * LWLockAcquire(). Check its comments for details. The only + * difference is that we also have to check the variable's values when + * checking the state of the lock. + */ + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + /* + * Set RELEASE_OK flag, to make sure we get woken up as soon as the + * lock is released. + */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + + /* + * We're now guaranteed to be woken up if necessary. Recheck the lock + * and variables state. + */ + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + /* Ok, no conflict after we queued ourselves. Undo queueing. */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. + */ + LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened"); + + /* Now loop back and check the status of the lock again. */ + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + /* + * Now okay to allow cancel/die interrupts. + */ + RESUME_INTERRUPTS(); + + return result; +} + + +/* + * LWLockUpdateVar - Update a variable and wake up waiters atomically + * + * Sets *valptr to 'val', and wakes up all processes waiting for us with + * LWLockWaitForVar(). Setting the value and waking up the processes happen + * atomically so that any process calling LWLockWaitForVar() on the same lock + * is guaranteed to see the new value, and act accordingly. + * + * The caller must be holding the lock in exclusive mode. + */ +void +LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) +{ + proclist_head wakeup; + proclist_mutable_iter iter; + + PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE); + + proclist_init(&wakeup); + + LWLockWaitListLock(lock); + + Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE); + + /* Update the lock's value */ + *valptr = val; + + /* + * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken + * up. They are always in the front of the queue. + */ + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + break; + + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + proclist_push_tail(&wakeup, iter.cur, lwWaitLink); + } + + /* We are done updating shared state of the lock itself. */ + LWLockWaitListUnlock(lock); + + /* + * Awaken any waiters I removed from the queue. + */ + proclist_foreach_modify(iter, &wakeup, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + proclist_delete(&wakeup, iter.cur, lwWaitLink); + /* check comment in LWLockWakeup() about this barrier */ + pg_write_barrier(); + waiter->lwWaiting = false; + PGSemaphoreUnlock(waiter->sem); + } +} + + +/* + * LWLockRelease - release a previously acquired lock + */ +void +LWLockRelease(LWLock *lock) +{ + LWLockMode mode; + uint32 oldstate; + bool check_waiters; + int i; + + /* + * Remove lock from list of locks held. Usually, but not always, it will + * be the latest-acquired lock; so search array backwards. + */ + for (i = num_held_lwlocks; --i >= 0;) + if (lock == held_lwlocks[i].lock) + break; + + if (i < 0) + elog(ERROR, "lock %s is not held", T_NAME(lock)); + + mode = held_lwlocks[i].mode; + + num_held_lwlocks--; + for (; i < num_held_lwlocks; i++) + held_lwlocks[i] = held_lwlocks[i + 1]; + + PRINT_LWDEBUG("LWLockRelease", lock, mode); + + /* + * Release my hold on lock, after that it can immediately be acquired by + * others, even if we still have to wakeup other waiters. + */ + if (mode == LW_EXCLUSIVE) + oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE); + else + oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED); + + /* nobody else can have that kind of lock */ + Assert(!(oldstate & LW_VAL_EXCLUSIVE)); + + if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock)); + + /* + * We're still waiting for backends to get scheduled, don't wake them up + * again. + */ + if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) == + (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) && + (oldstate & LW_LOCK_MASK) == 0) + check_waiters = true; + else + check_waiters = false; + + /* + * As waking up waiters requires the spinlock to be acquired, only do so + * if necessary. + */ + if (check_waiters) + { + /* XXX: remove before commit? */ + LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters"); + LWLockWakeup(lock); + } + + /* + * Now okay to allow cancel/die interrupts. + */ + RESUME_INTERRUPTS(); +} + +/* + * LWLockReleaseClearVar - release a previously acquired lock, reset variable + */ +void +LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val) +{ + LWLockWaitListLock(lock); + + /* + * Set the variable's value before releasing the lock, that prevents race + * a race condition wherein a new locker acquires the lock, but hasn't yet + * set the variables value. + */ + *valptr = val; + LWLockWaitListUnlock(lock); + + LWLockRelease(lock); +} + + +/* + * LWLockReleaseAll - release all currently-held locks + * + * Used to clean up after ereport(ERROR). An important difference between this + * function and retail LWLockRelease calls is that InterruptHoldoffCount is + * unchanged by this operation. This is necessary since InterruptHoldoffCount + * has been set to an appropriate level earlier in error recovery. We could + * decrement it below zero if we allow it to drop for each released lock! + */ +void +LWLockReleaseAll(void) +{ + while (num_held_lwlocks > 0) + { + HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ + + LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); + } +} + + +/* + * LWLockHeldByMe - test whether my process holds a lock in any mode + * + * This is meant as debug support only. + */ +bool +LWLockHeldByMe(LWLock *l) +{ + int i; + + for (i = 0; i < num_held_lwlocks; i++) + { + if (held_lwlocks[i].lock == l) + return true; + } + return false; +} + +/* + * LWLockHeldByMe - test whether my process holds any of an array of locks + * + * This is meant as debug support only. + */ +bool +LWLockAnyHeldByMe(LWLock *l, int nlocks, size_t stride) +{ + char *held_lock_addr; + char *begin; + char *end; + int i; + + begin = (char *) l; + end = begin + nlocks * stride; + for (i = 0; i < num_held_lwlocks; i++) + { + held_lock_addr = (char *) held_lwlocks[i].lock; + if (held_lock_addr >= begin && + held_lock_addr < end && + (held_lock_addr - begin) % stride == 0) + return true; + } + return false; +} + +/* + * LWLockHeldByMeInMode - test whether my process holds a lock in given mode + * + * This is meant as debug support only. + */ +bool +LWLockHeldByMeInMode(LWLock *l, LWLockMode mode) +{ + int i; + + for (i = 0; i < num_held_lwlocks; i++) + { + if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode) + return true; + } + return false; +} diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c new file mode 100644 index 0000000..65f7c5b --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.c @@ -0,0 +1,52 @@ +/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */ + +const char *const IndividualLWLockNames[] = { + "<unassigned:0>", + "ShmemIndex", + "OidGen", + "XidGen", + "ProcArray", + "SInvalRead", + "SInvalWrite", + "WALBufMapping", + "WALWrite", + "ControlFile", + "<unassigned:10>", + "XactSLRU", + "SubtransSLRU", + "MultiXactGen", + "MultiXactOffsetSLRU", + "MultiXactMemberSLRU", + "RelCacheInit", + "CheckpointerComm", + "TwoPhaseState", + "TablespaceCreate", + "BtreeVacuum", + "AddinShmemInit", + "Autovacuum", + "AutovacuumSchedule", + "SyncScan", + "RelationMapping", + "NotifySLRU", + "NotifyQueue", + "SerializableXactHash", + "SerializableFinishedList", + "SerializablePredicateList", + "SerialSLRU", + "SyncRep", + "BackgroundWorker", + "DynamicSharedMemoryControl", + "AutoFile", + "ReplicationSlotAllocation", + "ReplicationSlotControl", + "CommitTsSLRU", + "CommitTs", + "ReplicationOrigin", + "MultiXactTruncation", + "OldSnapshotTimeMap", + "LogicalRepWorker", + "XactTruncation", + "<unassigned:45>", + "WrapLimitsVacuum", + "NotifyQueueTail" +}; diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h new file mode 100644 index 0000000..e279f72 --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.h @@ -0,0 +1,50 @@ +/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */ +/* there is deliberately not an #ifndef LWLOCKNAMES_H here */ + +#define ShmemIndexLock (&MainLWLockArray[1].lock) +#define OidGenLock (&MainLWLockArray[2].lock) +#define XidGenLock (&MainLWLockArray[3].lock) +#define ProcArrayLock (&MainLWLockArray[4].lock) +#define SInvalReadLock (&MainLWLockArray[5].lock) +#define SInvalWriteLock (&MainLWLockArray[6].lock) +#define WALBufMappingLock (&MainLWLockArray[7].lock) +#define WALWriteLock (&MainLWLockArray[8].lock) +#define ControlFileLock (&MainLWLockArray[9].lock) +#define XactSLRULock (&MainLWLockArray[11].lock) +#define SubtransSLRULock (&MainLWLockArray[12].lock) +#define MultiXactGenLock (&MainLWLockArray[13].lock) +#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock) +#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock) +#define RelCacheInitLock (&MainLWLockArray[16].lock) +#define CheckpointerCommLock (&MainLWLockArray[17].lock) +#define TwoPhaseStateLock (&MainLWLockArray[18].lock) +#define TablespaceCreateLock (&MainLWLockArray[19].lock) +#define BtreeVacuumLock (&MainLWLockArray[20].lock) +#define AddinShmemInitLock (&MainLWLockArray[21].lock) +#define AutovacuumLock (&MainLWLockArray[22].lock) +#define AutovacuumScheduleLock (&MainLWLockArray[23].lock) +#define SyncScanLock (&MainLWLockArray[24].lock) +#define RelationMappingLock (&MainLWLockArray[25].lock) +#define NotifySLRULock (&MainLWLockArray[26].lock) +#define NotifyQueueLock (&MainLWLockArray[27].lock) +#define SerializableXactHashLock (&MainLWLockArray[28].lock) +#define SerializableFinishedListLock (&MainLWLockArray[29].lock) +#define SerializablePredicateListLock (&MainLWLockArray[30].lock) +#define SerialSLRULock (&MainLWLockArray[31].lock) +#define SyncRepLock (&MainLWLockArray[32].lock) +#define BackgroundWorkerLock (&MainLWLockArray[33].lock) +#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock) +#define AutoFileLock (&MainLWLockArray[35].lock) +#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock) +#define ReplicationSlotControlLock (&MainLWLockArray[37].lock) +#define CommitTsSLRULock (&MainLWLockArray[38].lock) +#define CommitTsLock (&MainLWLockArray[39].lock) +#define ReplicationOriginLock (&MainLWLockArray[40].lock) +#define MultiXactTruncationLock (&MainLWLockArray[41].lock) +#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock) +#define LogicalRepWorkerLock (&MainLWLockArray[43].lock) +#define XactTruncationLock (&MainLWLockArray[44].lock) +#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock) +#define NotifyQueueTailLock (&MainLWLockArray[47].lock) + +#define NUM_INDIVIDUAL_LWLOCKS 48 diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt new file mode 100644 index 0000000..6c7cf6c --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -0,0 +1,55 @@ +# Some commonly-used locks have predefined positions within MainLWLockArray; +# these are defined here. If you add a lock, add it to the end to avoid +# renumbering the existing locks; if you remove a lock, consider leaving a gap +# in the numbering sequence for the benefit of DTrace and other external +# debugging scripts. Also, do not forget to update the list of wait events +# in the user documentation. + +# 0 is available; was formerly BufFreelistLock +ShmemIndexLock 1 +OidGenLock 2 +XidGenLock 3 +ProcArrayLock 4 +SInvalReadLock 5 +SInvalWriteLock 6 +WALBufMappingLock 7 +WALWriteLock 8 +ControlFileLock 9 +# 10 was CheckpointLock +XactSLRULock 11 +SubtransSLRULock 12 +MultiXactGenLock 13 +MultiXactOffsetSLRULock 14 +MultiXactMemberSLRULock 15 +RelCacheInitLock 16 +CheckpointerCommLock 17 +TwoPhaseStateLock 18 +TablespaceCreateLock 19 +BtreeVacuumLock 20 +AddinShmemInitLock 21 +AutovacuumLock 22 +AutovacuumScheduleLock 23 +SyncScanLock 24 +RelationMappingLock 25 +NotifySLRULock 26 +NotifyQueueLock 27 +SerializableXactHashLock 28 +SerializableFinishedListLock 29 +SerializablePredicateListLock 30 +SerialSLRULock 31 +SyncRepLock 32 +BackgroundWorkerLock 33 +DynamicSharedMemoryControlLock 34 +AutoFileLock 35 +ReplicationSlotAllocationLock 36 +ReplicationSlotControlLock 37 +CommitTsSLRULock 38 +CommitTsLock 39 +ReplicationOriginLock 40 +MultiXactTruncationLock 41 +OldSnapshotTimeMapLock 42 +LogicalRepWorkerLock 43 +XactTruncationLock 44 +# 45 was XactTruncationLock until removal of BackendRandomLock +WrapLimitsVacuumLock 46 +NotifyQueueTailLock 47 diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c new file mode 100644 index 0000000..d493aee --- /dev/null +++ b/src/backend/storage/lmgr/predicate.c @@ -0,0 +1,5203 @@ +/*------------------------------------------------------------------------- + * + * predicate.c + * POSTGRES predicate locking + * to support full serializable transaction isolation + * + * + * The approach taken is to implement Serializable Snapshot Isolation (SSI) + * as initially described in this paper: + * + * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008. + * Serializable isolation for snapshot databases. + * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD + * international conference on Management of data, + * pages 729-738, New York, NY, USA. ACM. + * http://doi.acm.org/10.1145/1376616.1376690 + * + * and further elaborated in Cahill's doctoral thesis: + * + * Michael James Cahill. 2009. + * Serializable Isolation for Snapshot Databases. + * Sydney Digital Theses. + * University of Sydney, School of Information Technologies. + * http://hdl.handle.net/2123/5353 + * + * + * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD + * locks, which are so different from normal locks that a distinct set of + * structures is required to handle them. They are needed to detect + * rw-conflicts when the read happens before the write. (When the write + * occurs first, the reading transaction can check for a conflict by + * examining the MVCC data.) + * + * (1) Besides tuples actually read, they must cover ranges of tuples + * which would have been read based on the predicate. This will + * require modelling the predicates through locks against database + * objects such as pages, index ranges, or entire tables. + * + * (2) They must be kept in RAM for quick access. Because of this, it + * isn't possible to always maintain tuple-level granularity -- when + * the space allocated to store these approaches exhaustion, a + * request for a lock may need to scan for situations where a single + * transaction holds many fine-grained locks which can be coalesced + * into a single coarser-grained lock. + * + * (3) They never block anything; they are more like flags than locks + * in that regard; although they refer to database objects and are + * used to identify rw-conflicts with normal write locks. + * + * (4) While they are associated with a transaction, they must survive + * a successful COMMIT of that transaction, and remain until all + * overlapping transactions complete. This even means that they + * must survive termination of the transaction's process. If a + * top level transaction is rolled back, however, it is immediately + * flagged so that it can be ignored, and its SIREAD locks can be + * released any time after that. + * + * (5) The only transactions which create SIREAD locks or check for + * conflicts with them are serializable transactions. + * + * (6) When a write lock for a top level transaction is found to cover + * an existing SIREAD lock for the same transaction, the SIREAD lock + * can be deleted. + * + * (7) A write from a serializable transaction must ensure that an xact + * record exists for the transaction, with the same lifespan (until + * all concurrent transaction complete or the transaction is rolled + * back) so that rw-dependencies to that transaction can be + * detected. + * + * We use an optimization for read-only transactions. Under certain + * circumstances, a read-only transaction's snapshot can be shown to + * never have conflicts with other transactions. This is referred to + * as a "safe" snapshot (and one known not to be is "unsafe"). + * However, it can't be determined whether a snapshot is safe until + * all concurrent read/write transactions complete. + * + * Once a read-only transaction is known to have a safe snapshot, it + * can release its predicate locks and exempt itself from further + * predicate lock tracking. READ ONLY DEFERRABLE transactions run only + * on safe snapshots, waiting as necessary for one to be available. + * + * + * Lightweight locks to manage access to the predicate locking shared + * memory objects must be taken in this order, and should be released in + * reverse order: + * + * SerializableFinishedListLock + * - Protects the list of transactions which have completed but which + * may yet matter because they overlap still-active transactions. + * + * SerializablePredicateListLock + * - Protects the linked list of locks held by a transaction. Note + * that the locks themselves are also covered by the partition + * locks of their respective lock targets; this lock only affects + * the linked list connecting the locks related to a transaction. + * - All transactions share this single lock (with no partitioning). + * - There is never a need for a process other than the one running + * an active transaction to walk the list of locks held by that + * transaction, except parallel query workers sharing the leader's + * transaction. In the parallel case, an extra per-sxact lock is + * taken; see below. + * - It is relatively infrequent that another process needs to + * modify the list for a transaction, but it does happen for such + * things as index page splits for pages with predicate locks and + * freeing of predicate locked pages by a vacuum process. When + * removing a lock in such cases, the lock itself contains the + * pointers needed to remove it from the list. When adding a + * lock in such cases, the lock can be added using the anchor in + * the transaction structure. Neither requires walking the list. + * - Cleaning up the list for a terminated transaction is sometimes + * not done on a retail basis, in which case no lock is required. + * - Due to the above, a process accessing its active transaction's + * list always uses a shared lock, regardless of whether it is + * walking or maintaining the list. This improves concurrency + * for the common access patterns. + * - A process which needs to alter the list of a transaction other + * than its own active transaction must acquire an exclusive + * lock. + * + * SERIALIZABLEXACT's member 'perXactPredicateListLock' + * - Protects the linked list of predicate locks held by a transaction. + * Only needed for parallel mode, where multiple backends share the + * same SERIALIZABLEXACT object. Not needed if + * SerializablePredicateListLock is held exclusively. + * + * PredicateLockHashPartitionLock(hashcode) + * - The same lock protects a target, all locks on that target, and + * the linked list of locks on the target. + * - When more than one is needed, acquire in ascending address order. + * - When all are needed (rare), acquire in ascending index order with + * PredicateLockHashPartitionLockByIndex(index). + * + * SerializableXactHashLock + * - Protects both PredXact and SerializableXidHash. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/predicate.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * housekeeping for setting up shared memory predicate lock structures + * InitPredicateLocks(void) + * PredicateLockShmemSize(void) + * + * predicate lock reporting + * GetPredicateLockStatusData(void) + * PageIsPredicateLocked(Relation relation, BlockNumber blkno) + * + * predicate lock maintenance + * GetSerializableTransactionSnapshot(Snapshot snapshot) + * SetSerializableTransactionSnapshot(Snapshot snapshot, + * VirtualTransactionId *sourcevxid) + * RegisterPredicateLockingXid(void) + * PredicateLockRelation(Relation relation, Snapshot snapshot) + * PredicateLockPage(Relation relation, BlockNumber blkno, + * Snapshot snapshot) + * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + * TransactionId insert_xid) + * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno) + * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno) + * TransferPredicateLocksToHeapRelation(Relation relation) + * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) + * + * conflict detection (may also trigger rollback) + * CheckForSerializableConflictOut(Relation relation, TransactionId xid, + * Snapshot snapshot) + * CheckForSerializableConflictIn(Relation relation, ItemPointer tid, + * BlockNumber blkno) + * CheckTableForSerializableConflictIn(Relation relation) + * + * final rollback checking + * PreCommit_CheckForSerializationFailure(void) + * + * two-phase commit support + * AtPrepare_PredicateLocks(void); + * PostPrepare_PredicateLocks(TransactionId xid); + * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); + * predicatelock_twophase_recover(TransactionId xid, uint16 info, + * void *recdata, uint32 len); + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Uncomment the next line to test the graceful degradation code. */ +/* #define TEST_SUMMARIZE_SERIAL */ + +/* + * Test the most selective fields first, for performance. + * + * a is covered by b if all of the following hold: + * 1) a.database = b.database + * 2) a.relation = b.relation + * 3) b.offset is invalid (b is page-granularity or higher) + * 4) either of the following: + * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page + * or 4b) a.offset is invalid and b.page is invalid (a is + * page-granularity and b is relation-granularity + */ +#define TargetTagIsCoveredBy(covered_target, covering_target) \ + ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \ + GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \ + && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \ + InvalidOffsetNumber) /* (3) */ \ + && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \ + InvalidOffsetNumber) /* (4a) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \ + || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + InvalidBlockNumber) /* (4b) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \ + != InvalidBlockNumber))) \ + && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \ + GET_PREDICATELOCKTARGETTAG_DB(covering_target))) + +/* + * The predicate locking target and lock shared hash tables are partitioned to + * reduce contention. To determine which partition a given target belongs to, + * compute the tag's hash code with PredicateLockTargetTagHashCode(), then + * apply one of these macros. + * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2! + */ +#define PredicateLockHashPartition(hashcode) \ + ((hashcode) % NUM_PREDICATELOCK_PARTITIONS) +#define PredicateLockHashPartitionLock(hashcode) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \ + PredicateLockHashPartition(hashcode)].lock) +#define PredicateLockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) + +#define NPREDICATELOCKTARGETENTS() \ + mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + +#define SxactIsOnFinishedList(sxact) (!SHMQueueIsDetached(&((sxact)->finishedLink))) + +/* + * Note that a sxact is marked "prepared" once it has passed + * PreCommit_CheckForSerializationFailure, even if it isn't using + * 2PC. This is the point at which it can no longer be aborted. + * + * The PREPARED flag remains set after commit, so SxactIsCommitted + * implies SxactIsPrepared. + */ +#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0) +#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0) +#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0) +#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0) +#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0) +#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0) +#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0) +/* + * The following macro actually means that the specified transaction has a + * conflict out *to a transaction which committed ahead of it*. It's hard + * to get that into a name of a reasonable length. + */ +#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0) +#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0) +#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0) +#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0) +#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0) + +/* + * Compute the hash code associated with a PREDICATELOCKTARGETTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ +#define PredicateLockTargetTagHashCode(predicatelocktargettag) \ + get_hash_value(PredicateLockTargetHash, predicatelocktargettag) + +/* + * Given a predicate lock tag, and the hash for its target, + * compute the lock hash. + * + * To make the hash code also depend on the transaction, we xor the sxid + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ +#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \ + ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \ + << LOG2_NUM_PREDICATELOCK_PARTITIONS) + + +/* + * The SLRU buffer area through which we access the old xids. + */ +static SlruCtlData SerialSlruCtlData; + +#define SerialSlruCtl (&SerialSlruCtlData) + +#define SERIAL_PAGESIZE BLCKSZ +#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo) +#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE) + +/* + * Set maximum pages based on the number needed to track all transactions. + */ +#define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE) + +#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1) + +#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \ + (SerialSlruCtl->shared->page_buffer[slotno] + \ + ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE)))) + +#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE) + +typedef struct SerialControlData +{ + int headPage; /* newest initialized page */ + TransactionId headXid; /* newest valid Xid in the SLRU */ + TransactionId tailXid; /* oldest xmin we might be interested in */ +} SerialControlData; + +typedef struct SerialControlData *SerialControl; + +static SerialControl serialControl; + +/* + * When the oldest committed transaction on the "finished" list is moved to + * SLRU, its predicate locks will be moved to this "dummy" transaction, + * collapsing duplicate targets. When a duplicate is found, the later + * commitSeqNo is used. + */ +static SERIALIZABLEXACT *OldCommittedSxact; + + +/* + * These configuration variables are used to set the predicate lock table size + * and to control promotion of predicate locks to coarser granularity in an + * attempt to degrade performance (mostly as false positive serialization + * failure) gracefully in the face of memory pressure. + */ +int max_predicate_locks_per_xact; /* set by guc.c */ +int max_predicate_locks_per_relation; /* set by guc.c */ +int max_predicate_locks_per_page; /* set by guc.c */ + +/* + * This provides a list of objects in order to track transactions + * participating in predicate locking. Entries in the list are fixed size, + * and reside in shared memory. The memory address of an entry must remain + * fixed during its lifetime. The list will be protected from concurrent + * update externally; no provision is made in this code to manage that. The + * number of entries in the list, and the size allowed for each entry is + * fixed upon creation. + */ +static PredXactList PredXact; + +/* + * This provides a pool of RWConflict data elements to use in conflict lists + * between transactions. + */ +static RWConflictPoolHeader RWConflictPool; + +/* + * The predicate locking hash tables are in shared memory. + * Each backend keeps pointers to them. + */ +static HTAB *SerializableXidHash; +static HTAB *PredicateLockTargetHash; +static HTAB *PredicateLockHash; +static SHM_QUEUE *FinishedSerializableTransactions; + +/* + * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing + * this entry, you can ensure that there's enough scratch space available for + * inserting one entry in the hash table. This is an otherwise-invalid tag. + */ +static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0}; +static uint32 ScratchTargetTagHash; +static LWLock *ScratchPartitionLock; + +/* + * The local hash table used to determine when to combine multiple fine- + * grained locks into a single courser-grained lock. + */ +static HTAB *LocalPredicateLockHash = NULL; + +/* + * Keep a pointer to the currently-running serializable transaction (if any) + * for quick reference. Also, remember if we have written anything that could + * cause a rw-conflict. + */ +static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact; +static bool MyXactDidWrite = false; + +/* + * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release + * MySerializableXact early. If that happens in a parallel query, the leader + * needs to defer the destruction of the SERIALIZABLEXACT until end of + * transaction, because the workers still have a reference to it. In that + * case, the leader stores it here. + */ +static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact; + +/* local functions */ + +static SERIALIZABLEXACT *CreatePredXact(void); +static void ReleasePredXact(SERIALIZABLEXACT *sxact); +static SERIALIZABLEXACT *FirstPredXact(void); +static SERIALIZABLEXACT *NextPredXact(SERIALIZABLEXACT *sxact); + +static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer); +static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); +static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact); +static void ReleaseRWConflict(RWConflict conflict); +static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact); + +static bool SerialPagePrecedesLogically(int page1, int page2); +static void SerialInit(void); +static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo); +static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid); +static void SerialSetActiveSerXmin(TransactionId xid); + +static uint32 predicatelock_hash(const void *key, Size keysize); +static void SummarizeOldestCommittedSxact(void); +static Snapshot GetSafeSnapshot(Snapshot snapshot); +static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid); +static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag); +static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent); +static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag); +static void RemoveScratchTarget(bool lockheld); +static void RestoreScratchTarget(bool lockheld); +static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, + uint32 targettaghash); +static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag); +static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag); +static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag); +static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag); +static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, + uint32 targettaghash, + SERIALIZABLEXACT *sxact); +static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash); +static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, + PREDICATELOCKTARGETTAG newtargettag, + bool removeOld); +static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag); +static void DropAllPredicateLocksFromTable(Relation relation, + bool transfer); +static void SetNewSxactGlobalXmin(void); +static void ClearOldPredicateLocks(void); +static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, + bool summarize); +static bool XidIsConcurrent(TransactionId xid); +static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag); +static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); +static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + SERIALIZABLEXACT *writer); +static void CreateLocalPredicateLockHash(void); +static void ReleasePredicateLocksLocal(void); + + +/*------------------------------------------------------------------------*/ + +/* + * Does this relation participate in predicate locking? Temporary and system + * relations are exempt, as are materialized views. + */ +static inline bool +PredicateLockingNeededForRelation(Relation relation) +{ + return !(relation->rd_id < FirstBootstrapObjectId || + RelationUsesLocalBuffers(relation) || + relation->rd_rel->relkind == RELKIND_MATVIEW); +} + +/* + * When a public interface method is called for a read, this is the test to + * see if we should do a quick return. + * + * Note: this function has side-effects! If this transaction has been flagged + * as RO-safe since the last call, we release all predicate locks and reset + * MySerializableXact. That makes subsequent calls to return quickly. + * + * This is marked as 'inline' to eliminate the function call overhead in the + * common case that serialization is not needed. + */ +static inline bool +SerializationNeededForRead(Relation relation, Snapshot snapshot) +{ + /* Nothing to do if this is not a serializable transaction */ + if (MySerializableXact == InvalidSerializableXact) + return false; + + /* + * Don't acquire locks or conflict when scanning with a special snapshot. + * This excludes things like CLUSTER and REINDEX. They use the wholesale + * functions TransferPredicateLocksToHeapRelation() and + * CheckTableForSerializableConflictIn() to participate in serialization, + * but the scans involved don't need serialization. + */ + if (!IsMVCCSnapshot(snapshot)) + return false; + + /* + * Check if we have just become "RO-safe". If we have, immediately release + * all locks as they're not needed anymore. This also resets + * MySerializableXact, so that subsequent calls to this function can exit + * quickly. + * + * A transaction is flagged as RO_SAFE if all concurrent R/W transactions + * commit without having conflicts out to an earlier snapshot, thus + * ensuring that no conflicts are possible for this transaction. + */ + if (SxactIsROSafe(MySerializableXact)) + { + ReleasePredicateLocks(false, true); + return false; + } + + /* Check if the relation doesn't participate in predicate locking */ + if (!PredicateLockingNeededForRelation(relation)) + return false; + + return true; /* no excuse to skip predicate locking */ +} + +/* + * Like SerializationNeededForRead(), but called on writes. + * The logic is the same, but there is no snapshot and we can't be RO-safe. + */ +static inline bool +SerializationNeededForWrite(Relation relation) +{ + /* Nothing to do if this is not a serializable transaction */ + if (MySerializableXact == InvalidSerializableXact) + return false; + + /* Check if the relation doesn't participate in predicate locking */ + if (!PredicateLockingNeededForRelation(relation)) + return false; + + return true; /* no excuse to skip predicate locking */ +} + + +/*------------------------------------------------------------------------*/ + +/* + * These functions are a simple implementation of a list for this specific + * type of struct. If there is ever a generalized shared memory list, we + * should probably switch to that. + */ +static SERIALIZABLEXACT * +CreatePredXact(void) +{ + PredXactListElement ptle; + + ptle = (PredXactListElement) + SHMQueueNext(&PredXact->availableList, + &PredXact->availableList, + offsetof(PredXactListElementData, link)); + if (!ptle) + return NULL; + + SHMQueueDelete(&ptle->link); + SHMQueueInsertBefore(&PredXact->activeList, &ptle->link); + return &ptle->sxact; +} + +static void +ReleasePredXact(SERIALIZABLEXACT *sxact) +{ + PredXactListElement ptle; + + Assert(ShmemAddrIsValid(sxact)); + + ptle = (PredXactListElement) + (((char *) sxact) + - offsetof(PredXactListElementData, sxact) + + offsetof(PredXactListElementData, link)); + SHMQueueDelete(&ptle->link); + SHMQueueInsertBefore(&PredXact->availableList, &ptle->link); +} + +static SERIALIZABLEXACT * +FirstPredXact(void) +{ + PredXactListElement ptle; + + ptle = (PredXactListElement) + SHMQueueNext(&PredXact->activeList, + &PredXact->activeList, + offsetof(PredXactListElementData, link)); + if (!ptle) + return NULL; + + return &ptle->sxact; +} + +static SERIALIZABLEXACT * +NextPredXact(SERIALIZABLEXACT *sxact) +{ + PredXactListElement ptle; + + Assert(ShmemAddrIsValid(sxact)); + + ptle = (PredXactListElement) + (((char *) sxact) + - offsetof(PredXactListElementData, sxact) + + offsetof(PredXactListElementData, link)); + ptle = (PredXactListElement) + SHMQueueNext(&PredXact->activeList, + &ptle->link, + offsetof(PredXactListElementData, link)); + if (!ptle) + return NULL; + + return &ptle->sxact; +} + +/*------------------------------------------------------------------------*/ + +/* + * These functions manage primitive access to the RWConflict pool and lists. + */ +static bool +RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer) +{ + RWConflict conflict; + + Assert(reader != writer); + + /* Check the ends of the purported conflict first. */ + if (SxactIsDoomed(reader) + || SxactIsDoomed(writer) + || SHMQueueEmpty(&reader->outConflicts) + || SHMQueueEmpty(&writer->inConflicts)) + return false; + + /* A conflict is possible; walk the list to find out. */ + conflict = (RWConflict) + SHMQueueNext(&reader->outConflicts, + &reader->outConflicts, + offsetof(RWConflictData, outLink)); + while (conflict) + { + if (conflict->sxactIn == writer) + return true; + conflict = (RWConflict) + SHMQueueNext(&reader->outConflicts, + &conflict->outLink, + offsetof(RWConflictData, outLink)); + } + + /* No conflict found. */ + return false; +} + +static void +SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) +{ + RWConflict conflict; + + Assert(reader != writer); + Assert(!RWConflictExists(reader, writer)); + + conflict = (RWConflict) + SHMQueueNext(&RWConflictPool->availableList, + &RWConflictPool->availableList, + offsetof(RWConflictData, outLink)); + if (!conflict) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough elements in RWConflictPool to record a read/write conflict"), + errhint("You might need to run fewer transactions at a time or increase max_connections."))); + + SHMQueueDelete(&conflict->outLink); + + conflict->sxactOut = reader; + conflict->sxactIn = writer; + SHMQueueInsertBefore(&reader->outConflicts, &conflict->outLink); + SHMQueueInsertBefore(&writer->inConflicts, &conflict->inLink); +} + +static void +SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, + SERIALIZABLEXACT *activeXact) +{ + RWConflict conflict; + + Assert(roXact != activeXact); + Assert(SxactIsReadOnly(roXact)); + Assert(!SxactIsReadOnly(activeXact)); + + conflict = (RWConflict) + SHMQueueNext(&RWConflictPool->availableList, + &RWConflictPool->availableList, + offsetof(RWConflictData, outLink)); + if (!conflict) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"), + errhint("You might need to run fewer transactions at a time or increase max_connections."))); + + SHMQueueDelete(&conflict->outLink); + + conflict->sxactOut = activeXact; + conflict->sxactIn = roXact; + SHMQueueInsertBefore(&activeXact->possibleUnsafeConflicts, + &conflict->outLink); + SHMQueueInsertBefore(&roXact->possibleUnsafeConflicts, + &conflict->inLink); +} + +static void +ReleaseRWConflict(RWConflict conflict) +{ + SHMQueueDelete(&conflict->inLink); + SHMQueueDelete(&conflict->outLink); + SHMQueueInsertBefore(&RWConflictPool->availableList, &conflict->outLink); +} + +static void +FlagSxactUnsafe(SERIALIZABLEXACT *sxact) +{ + RWConflict conflict, + nextConflict; + + Assert(SxactIsReadOnly(sxact)); + Assert(!SxactIsROSafe(sxact)); + + sxact->flags |= SXACT_FLAG_RO_UNSAFE; + + /* + * We know this isn't a safe snapshot, so we can stop looking for other + * potential conflicts. + */ + conflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &sxact->possibleUnsafeConflicts, + offsetof(RWConflictData, inLink)); + while (conflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &conflict->inLink, + offsetof(RWConflictData, inLink)); + + Assert(!SxactIsReadOnly(conflict->sxactOut)); + Assert(sxact == conflict->sxactIn); + + ReleaseRWConflict(conflict); + + conflict = nextConflict; + } +} + +/*------------------------------------------------------------------------*/ + +/* + * Decide whether a Serial page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + */ +static bool +SerialPagePrecedesLogically(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1)); +} + +#ifdef USE_ASSERT_CHECKING +static void +SerialPagePrecedesLogicallyUnitTests(void) +{ + int per_page = SERIAL_ENTRIESPERPAGE, + offset = per_page / 2; + int newestPage, + oldestPage, + headPage, + targetPage; + TransactionId newestXact, + oldestXact; + + /* GetNewTransactionId() has assigned the last XID it can safely use. */ + newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */ + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + + /* + * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs + * assigned. oldestXact finishes, ~2B XIDs having elapsed since it + * started. Further transactions cause us to summarize oldestXact to + * tailPage. Function must return false so SerialAdd() doesn't zero + * tailPage (which may contain entries for other old, recently-finished + * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in + * single-user mode, a negligible possibility. + */ + headPage = newestPage; + targetPage = oldestPage; + Assert(!SerialPagePrecedesLogically(headPage, targetPage)); + + /* + * In this scenario, the SLRU headPage pertains to oldestXact. We're + * summarizing an XID near newestXact. (Assume few other XIDs used + * SERIALIZABLE, hence the minimal headPage advancement. Assume + * oldestXact was long-running and only recently reached the SLRU.) + * Function must return true to make SerialAdd() create targetPage. + * + * Today's implementation mishandles this case, but it doesn't matter + * enough to fix. Verify that the defect affects just one page by + * asserting correct treatment of its prior page. Reaching this case + * requires burning ~2B XIDs in single-user mode, a negligible + * possibility. Moreover, if it does happen, the consequence would be + * mild, namely a new transaction failing in SimpleLruReadPage(). + */ + headPage = oldestPage; + targetPage = newestPage; + Assert(SerialPagePrecedesLogically(headPage, targetPage - 1)); +#if 0 + Assert(SerialPagePrecedesLogically(headPage, targetPage)); +#endif +} +#endif + +/* + * Initialize for the tracking of old serializable committed xids. + */ +static void +SerialInit(void) +{ + bool found; + + /* + * Set up SLRU management of the pg_serial data. + */ + SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; + SimpleLruInit(SerialSlruCtl, "Serial", + NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", + LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); +#ifdef USE_ASSERT_CHECKING + SerialPagePrecedesLogicallyUnitTests(); +#endif + SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE); + + /* + * Create or attach to the SerialControl structure. + */ + serialControl = (SerialControl) + ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found); + + Assert(found == IsUnderPostmaster); + if (!found) + { + /* + * Set control information to reflect empty SLRU. + */ + serialControl->headPage = -1; + serialControl->headXid = InvalidTransactionId; + serialControl->tailXid = InvalidTransactionId; + } +} + +/* + * Record a committed read write serializable xid and the minimum + * commitSeqNo of any transactions to which this xid had a rw-conflict out. + * An invalid commitSeqNo means that there were no conflicts out from xid. + */ +static void +SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) +{ + TransactionId tailXid; + int targetPage; + int slotno; + int firstZeroPage; + bool isNewPage; + + Assert(TransactionIdIsValid(xid)); + + targetPage = SerialPage(xid); + + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* + * If no serializable transactions are active, there shouldn't be anything + * to push out to the SLRU. Hitting this assert would mean there's + * something wrong with the earlier cleanup logic. + */ + tailXid = serialControl->tailXid; + Assert(TransactionIdIsValid(tailXid)); + + /* + * If the SLRU is currently unused, zero out the whole active region from + * tailXid to headXid before taking it into use. Otherwise zero out only + * any new pages that enter the tailXid-headXid range as we advance + * headXid. + */ + if (serialControl->headPage < 0) + { + firstZeroPage = SerialPage(tailXid); + isNewPage = true; + } + else + { + firstZeroPage = SerialNextPage(serialControl->headPage); + isNewPage = SerialPagePrecedesLogically(serialControl->headPage, + targetPage); + } + + if (!TransactionIdIsValid(serialControl->headXid) + || TransactionIdFollows(xid, serialControl->headXid)) + serialControl->headXid = xid; + if (isNewPage) + serialControl->headPage = targetPage; + + if (isNewPage) + { + /* Initialize intervening pages. */ + while (firstZeroPage != targetPage) + { + (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage); + firstZeroPage = SerialNextPage(firstZeroPage); + } + slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage); + } + else + slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid); + + SerialValue(slotno, xid) = minConflictCommitSeqNo; + SerialSlruCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(SerialSLRULock); +} + +/* + * Get the minimum commitSeqNo for any conflict out for the given xid. For + * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo + * will be returned. + */ +static SerCommitSeqNo +SerialGetMinConflictCommitSeqNo(TransactionId xid) +{ + TransactionId headXid; + TransactionId tailXid; + SerCommitSeqNo val; + int slotno; + + Assert(TransactionIdIsValid(xid)); + + LWLockAcquire(SerialSLRULock, LW_SHARED); + headXid = serialControl->headXid; + tailXid = serialControl->tailXid; + LWLockRelease(SerialSLRULock); + + if (!TransactionIdIsValid(headXid)) + return 0; + + Assert(TransactionIdIsValid(tailXid)); + + if (TransactionIdPrecedes(xid, tailXid) + || TransactionIdFollows(xid, headXid)) + return 0; + + /* + * The following function must be called without holding SerialSLRULock, + * but will return with that lock held, which must then be released. + */ + slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl, + SerialPage(xid), xid); + val = SerialValue(slotno, xid); + LWLockRelease(SerialSLRULock); + return val; +} + +/* + * Call this whenever there is a new xmin for active serializable + * transactions. We don't need to keep information on transactions which + * precede that. InvalidTransactionId means none active, so everything in + * the SLRU can be discarded. + */ +static void +SerialSetActiveSerXmin(TransactionId xid) +{ + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* + * When no sxacts are active, nothing overlaps, set the xid values to + * invalid to show that there are no valid entries. Don't clear headPage, + * though. A new xmin might still land on that page, and we don't want to + * repeatedly zero out the same page. + */ + if (!TransactionIdIsValid(xid)) + { + serialControl->tailXid = InvalidTransactionId; + serialControl->headXid = InvalidTransactionId; + LWLockRelease(SerialSLRULock); + return; + } + + /* + * When we're recovering prepared transactions, the global xmin might move + * backwards depending on the order they're recovered. Normally that's not + * OK, but during recovery no serializable transactions will commit, so + * the SLRU is empty and we can get away with it. + */ + if (RecoveryInProgress()) + { + Assert(serialControl->headPage < 0); + if (!TransactionIdIsValid(serialControl->tailXid) + || TransactionIdPrecedes(xid, serialControl->tailXid)) + { + serialControl->tailXid = xid; + } + LWLockRelease(SerialSLRULock); + return; + } + + Assert(!TransactionIdIsValid(serialControl->tailXid) + || TransactionIdFollows(xid, serialControl->tailXid)); + + serialControl->tailXid = xid; + + LWLockRelease(SerialSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + * + * We don't have any data that needs to survive a restart, but this is a + * convenient place to truncate the SLRU. + */ +void +CheckPointPredicate(void) +{ + int tailPage; + + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* Exit quickly if the SLRU is currently not in use. */ + if (serialControl->headPage < 0) + { + LWLockRelease(SerialSLRULock); + return; + } + + if (TransactionIdIsValid(serialControl->tailXid)) + { + /* We can truncate the SLRU up to the page containing tailXid */ + tailPage = SerialPage(serialControl->tailXid); + } + else + { + /*---------- + * The SLRU is no longer needed. Truncate to head before we set head + * invalid. + * + * XXX: It's possible that the SLRU is not needed again until XID + * wrap-around has happened, so that the segment containing headPage + * that we leave behind will appear to be new again. In that case it + * won't be removed until XID horizon advances enough to make it + * current again. + * + * XXX: This should happen in vac_truncate_clog(), not in checkpoints. + * Consider this scenario, starting from a system with no in-progress + * transactions and VACUUM FREEZE having maximized oldestXact: + * - Start a SERIALIZABLE transaction. + * - Start, finish, and summarize a SERIALIZABLE transaction, creating + * one SLRU page. + * - Consume XIDs to reach xidStopLimit. + * - Finish all transactions. Due to the long-running SERIALIZABLE + * transaction, earlier checkpoints did not touch headPage. The + * next checkpoint will change it, but that checkpoint happens after + * the end of the scenario. + * - VACUUM to advance XID limits. + * - Consume ~2M XIDs, crossing the former xidWrapLimit. + * - Start, finish, and summarize a SERIALIZABLE transaction. + * SerialAdd() declines to create the targetPage, because headPage + * is not regarded as in the past relative to that targetPage. The + * transaction instigating the summarize fails in + * SimpleLruReadPage(). + */ + tailPage = serialControl->headPage; + serialControl->headPage = -1; + } + + LWLockRelease(SerialSLRULock); + + /* Truncate away pages that are no longer required */ + SimpleLruTruncate(SerialSlruCtl, tailPage); + + /* + * Write dirty SLRU pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely as a debugging aid. + * + * We're doing this after the truncation to avoid writing pages right + * before deleting the file in which they sit, which would be completely + * pointless. + */ + SimpleLruWriteAll(SerialSlruCtl, true); +} + +/*------------------------------------------------------------------------*/ + +/* + * InitPredicateLocks -- Initialize the predicate locking data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here. Backends inherit the pointers + * to the shared tables via fork(). In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables. + */ +void +InitPredicateLocks(void) +{ + HASHCTL info; + long max_table_size; + Size requestSize; + bool found; + +#ifndef EXEC_BACKEND + Assert(!IsUnderPostmaster); +#endif + + /* + * Compute size of predicate lock target hashtable. Note these + * calculations must agree with PredicateLockShmemSize! + */ + max_table_size = NPREDICATELOCKTARGETENTS(); + + /* + * Allocate hash table for PREDICATELOCKTARGET structs. This stores + * per-predicate-lock-target information. + */ + info.keysize = sizeof(PREDICATELOCKTARGETTAG); + info.entrysize = sizeof(PREDICATELOCKTARGET); + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + + PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | + HASH_PARTITION | HASH_FIXED_SIZE); + + /* + * Reserve a dummy entry in the hash table; we use it to make sure there's + * always one entry available when we need to split or combine a page, + * because running out of space there could mean aborting a + * non-serializable transaction. + */ + if (!IsUnderPostmaster) + { + (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag, + HASH_ENTER, &found); + Assert(!found); + } + + /* Pre-calculate the hash and partition lock of the scratch entry */ + ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag); + ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash); + + /* + * Allocate hash table for PREDICATELOCK structs. This stores per + * xact-lock-of-a-target information. + */ + info.keysize = sizeof(PREDICATELOCKTAG); + info.entrysize = sizeof(PREDICATELOCK); + info.hash = predicatelock_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + + /* Assume an average of 2 xacts per target */ + max_table_size *= 2; + + PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_FUNCTION | + HASH_PARTITION | HASH_FIXED_SIZE); + + /* + * Compute size for serializable transaction hashtable. Note these + * calculations must agree with PredicateLockShmemSize! + */ + max_table_size = (MaxBackends + max_prepared_xacts); + + /* + * Allocate a list to hold information on transactions participating in + * predicate locking. + * + * Assume an average of 10 predicate locking transactions per backend. + * This allows aggressive cleanup while detail is present before data must + * be summarized for storage in SLRU and the "dummy" transaction. + */ + max_table_size *= 10; + + PredXact = ShmemInitStruct("PredXactList", + PredXactListDataSize, + &found); + Assert(found == IsUnderPostmaster); + if (!found) + { + int i; + + SHMQueueInit(&PredXact->availableList); + SHMQueueInit(&PredXact->activeList); + PredXact->SxactGlobalXmin = InvalidTransactionId; + PredXact->SxactGlobalXminCount = 0; + PredXact->WritableSxactCount = 0; + PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1; + PredXact->CanPartialClearThrough = 0; + PredXact->HavePartialClearedThrough = 0; + requestSize = mul_size((Size) max_table_size, + PredXactListElementDataSize); + PredXact->element = ShmemAlloc(requestSize); + /* Add all elements to available list, clean. */ + memset(PredXact->element, 0, requestSize); + for (i = 0; i < max_table_size; i++) + { + LWLockInitialize(&PredXact->element[i].sxact.perXactPredicateListLock, + LWTRANCHE_PER_XACT_PREDICATE_LIST); + SHMQueueInsertBefore(&(PredXact->availableList), + &(PredXact->element[i].link)); + } + PredXact->OldCommittedSxact = CreatePredXact(); + SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid); + PredXact->OldCommittedSxact->prepareSeqNo = 0; + PredXact->OldCommittedSxact->commitSeqNo = 0; + PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0; + SHMQueueInit(&PredXact->OldCommittedSxact->outConflicts); + SHMQueueInit(&PredXact->OldCommittedSxact->inConflicts); + SHMQueueInit(&PredXact->OldCommittedSxact->predicateLocks); + SHMQueueInit(&PredXact->OldCommittedSxact->finishedLink); + SHMQueueInit(&PredXact->OldCommittedSxact->possibleUnsafeConflicts); + PredXact->OldCommittedSxact->topXid = InvalidTransactionId; + PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId; + PredXact->OldCommittedSxact->xmin = InvalidTransactionId; + PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED; + PredXact->OldCommittedSxact->pid = 0; + } + /* This never changes, so let's keep a local copy. */ + OldCommittedSxact = PredXact->OldCommittedSxact; + + /* + * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid + * information for serializable transactions which have accessed data. + */ + info.keysize = sizeof(SERIALIZABLEXIDTAG); + info.entrysize = sizeof(SERIALIZABLEXID); + + SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | + HASH_FIXED_SIZE); + + /* + * Allocate space for tracking rw-conflicts in lists attached to the + * transactions. + * + * Assume an average of 5 conflicts per transaction. Calculations suggest + * that this will prevent resource exhaustion in even the most pessimal + * loads up to max_connections = 200 with all 200 connections pounding the + * database with serializable transactions. Beyond that, there may be + * occasional transactions canceled when trying to flag conflicts. That's + * probably OK. + */ + max_table_size *= 5; + + RWConflictPool = ShmemInitStruct("RWConflictPool", + RWConflictPoolHeaderDataSize, + &found); + Assert(found == IsUnderPostmaster); + if (!found) + { + int i; + + SHMQueueInit(&RWConflictPool->availableList); + requestSize = mul_size((Size) max_table_size, + RWConflictDataSize); + RWConflictPool->element = ShmemAlloc(requestSize); + /* Add all elements to available list, clean. */ + memset(RWConflictPool->element, 0, requestSize); + for (i = 0; i < max_table_size; i++) + { + SHMQueueInsertBefore(&(RWConflictPool->availableList), + &(RWConflictPool->element[i].outLink)); + } + } + + /* + * Create or attach to the header for the list of finished serializable + * transactions. + */ + FinishedSerializableTransactions = (SHM_QUEUE *) + ShmemInitStruct("FinishedSerializableTransactions", + sizeof(SHM_QUEUE), + &found); + Assert(found == IsUnderPostmaster); + if (!found) + SHMQueueInit(FinishedSerializableTransactions); + + /* + * Initialize the SLRU storage for old committed serializable + * transactions. + */ + SerialInit(); +} + +/* + * Estimate shared-memory space used for predicate lock table + */ +Size +PredicateLockShmemSize(void) +{ + Size size = 0; + long max_table_size; + + /* predicate lock target hash table */ + max_table_size = NPREDICATELOCKTARGETENTS(); + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCKTARGET))); + + /* predicate lock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCK))); + + /* + * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety + * margin. + */ + size = add_size(size, size / 10); + + /* transaction list */ + max_table_size = MaxBackends + max_prepared_xacts; + max_table_size *= 10; + size = add_size(size, PredXactListDataSize); + size = add_size(size, mul_size((Size) max_table_size, + PredXactListElementDataSize)); + + /* transaction xid table */ + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXID))); + + /* rw-conflict pool */ + max_table_size *= 5; + size = add_size(size, RWConflictPoolHeaderDataSize); + size = add_size(size, mul_size((Size) max_table_size, + RWConflictDataSize)); + + /* Head for list of finished serializable transactions. */ + size = add_size(size, sizeof(SHM_QUEUE)); + + /* Shared memory structures for SLRU tracking of old committed xids. */ + size = add_size(size, sizeof(SerialControlData)); + size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0)); + + return size; +} + + +/* + * Compute the hash code associated with a PREDICATELOCKTAG. + * + * Because we want to use just one set of partition locks for both the + * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure + * that PREDICATELOCKs fall into the same partition number as their + * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number + * to be the low-order bits of the hash code, and therefore a + * PREDICATELOCKTAG's hash code must have the same low-order bits as the + * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this + * specialized hash function. + */ +static uint32 +predicatelock_hash(const void *key, Size keysize) +{ + const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key; + uint32 targethash; + + Assert(keysize == sizeof(PREDICATELOCKTAG)); + + /* Look into the associated target object, and compute its hash code */ + targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag); + + return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash); +} + + +/* + * GetPredicateLockStatusData + * Return a table containing the internal state of the predicate + * lock manager for use in pg_lock_status. + * + * Like GetLockStatusData, this function tries to hold the partition LWLocks + * for as short a time as possible by returning two arrays that simply + * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock + * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and + * SERIALIZABLEXACT will likely appear. + */ +PredicateLockData * +GetPredicateLockStatusData(void) +{ + PredicateLockData *data; + int i; + int els, + el; + HASH_SEQ_STATUS seqstat; + PREDICATELOCK *predlock; + + data = (PredicateLockData *) palloc(sizeof(PredicateLockData)); + + /* + * To ensure consistency, take simultaneous locks on all partition locks + * in ascending order, then SerializableXactHashLock. + */ + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Get number of locks and allocate appropriately-sized arrays. */ + els = hash_get_num_entries(PredicateLockHash); + data->nelements = els; + data->locktags = (PREDICATELOCKTARGETTAG *) + palloc(sizeof(PREDICATELOCKTARGETTAG) * els); + data->xacts = (SERIALIZABLEXACT *) + palloc(sizeof(SERIALIZABLEXACT) * els); + + + /* Scan through PredicateLockHash and copy contents */ + hash_seq_init(&seqstat, PredicateLockHash); + + el = 0; + + while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat))) + { + data->locktags[el] = predlock->tag.myTarget->tag; + data->xacts[el] = *predlock->tag.myXact; + el++; + } + + Assert(el == els); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + + return data; +} + +/* + * Free up shared memory structures by pushing the oldest sxact (the one at + * the front of the SummarizeOldestCommittedSxact queue) into summary form. + * Each call will free exactly one SERIALIZABLEXACT structure and may also + * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK, + * PREDICATELOCKTARGET, RWConflictData. + */ +static void +SummarizeOldestCommittedSxact(void) +{ + SERIALIZABLEXACT *sxact; + + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + + /* + * This function is only called if there are no sxact slots available. + * Some of them must belong to old, already-finished transactions, so + * there should be something in FinishedSerializableTransactions list that + * we can summarize. However, there's a race condition: while we were not + * holding any locks, a transaction might have ended and cleaned up all + * the finished sxact entries already, freeing up their sxact slots. In + * that case, we have nothing to do here. The caller will find one of the + * slots released by the other backend when it retries. + */ + if (SHMQueueEmpty(FinishedSerializableTransactions)) + { + LWLockRelease(SerializableFinishedListLock); + return; + } + + /* + * Grab the first sxact off the finished list -- this will be the earliest + * commit. Remove it from the list. + */ + sxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + FinishedSerializableTransactions, + offsetof(SERIALIZABLEXACT, finishedLink)); + SHMQueueDelete(&(sxact->finishedLink)); + + /* Add to SLRU summary information. */ + if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact)) + SerialAdd(sxact->topXid, SxactHasConflictOut(sxact) + ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo); + + /* Summarize and release the detail. */ + ReleaseOneSerializableXact(sxact, false, true); + + LWLockRelease(SerializableFinishedListLock); +} + +/* + * GetSafeSnapshot + * Obtain and register a snapshot for a READ ONLY DEFERRABLE + * transaction. Ensures that the snapshot is "safe", i.e. a + * read-only transaction running on it can execute serializably + * without further checks. This requires waiting for concurrent + * transactions to complete, and retrying with a new snapshot if + * one of them could possibly create a conflict. + * + * As with GetSerializableTransactionSnapshot (which this is a subroutine + * for), the passed-in Snapshot pointer should reference a static data + * area that can safely be passed to GetSnapshotData. + */ +static Snapshot +GetSafeSnapshot(Snapshot origSnapshot) +{ + Snapshot snapshot; + + Assert(XactReadOnly && XactDeferrable); + + while (true) + { + /* + * GetSerializableTransactionSnapshotInt is going to call + * GetSnapshotData, so we need to provide it the static snapshot area + * our caller passed to us. The pointer returned is actually the same + * one passed to it, but we avoid assuming that here. + */ + snapshot = GetSerializableTransactionSnapshotInt(origSnapshot, + NULL, InvalidPid); + + if (MySerializableXact == InvalidSerializableXact) + return snapshot; /* no concurrent r/w xacts; it's safe */ + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Wait for concurrent transactions to finish. Stop early if one of + * them marked us as conflicted. + */ + MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING; + while (!(SHMQueueEmpty(&MySerializableXact->possibleUnsafeConflicts) || + SxactIsROUnsafe(MySerializableXact))) + { + LWLockRelease(SerializableXactHashLock); + ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + } + MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING; + + if (!SxactIsROUnsafe(MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + break; /* success */ + } + + LWLockRelease(SerializableXactHashLock); + + /* else, need to retry... */ + ereport(DEBUG2, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg_internal("deferrable snapshot was unsafe; trying a new one"))); + ReleasePredicateLocks(false, false); + } + + /* + * Now we have a safe snapshot, so we don't need to do any further checks. + */ + Assert(SxactIsROSafe(MySerializableXact)); + ReleasePredicateLocks(false, true); + + return snapshot; +} + +/* + * GetSafeSnapshotBlockingPids + * If the specified process is currently blocked in GetSafeSnapshot, + * write the process IDs of all processes that it is blocked by + * into the caller-supplied buffer output[]. The list is truncated at + * output_size, and the number of PIDs written into the buffer is + * returned. Returns zero if the given PID is not currently blocked + * in GetSafeSnapshot. + */ +int +GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size) +{ + int num_written = 0; + SERIALIZABLEXACT *sxact; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Find blocked_pid's SERIALIZABLEXACT by linear search. */ + for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact)) + { + if (sxact->pid == blocked_pid) + break; + } + + /* Did we find it, and is it currently waiting in GetSafeSnapshot? */ + if (sxact != NULL && SxactIsDeferrableWaiting(sxact)) + { + RWConflict possibleUnsafeConflict; + + /* Traverse the list of possible unsafe conflicts collecting PIDs. */ + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &sxact->possibleUnsafeConflicts, + offsetof(RWConflictData, inLink)); + + while (possibleUnsafeConflict != NULL && num_written < output_size) + { + output[num_written++] = possibleUnsafeConflict->sxactOut->pid; + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&sxact->possibleUnsafeConflicts, + &possibleUnsafeConflict->inLink, + offsetof(RWConflictData, inLink)); + } + } + + LWLockRelease(SerializableXactHashLock); + + return num_written; +} + +/* + * Acquire a snapshot that can be used for the current transaction. + * + * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. + * It should be current for this process and be contained in PredXact. + * + * The passed-in Snapshot pointer should reference a static data area that + * can safely be passed to GetSnapshotData. The return value is actually + * always this same pointer; no new snapshot data structure is allocated + * within this function. + */ +Snapshot +GetSerializableTransactionSnapshot(Snapshot snapshot) +{ + Assert(IsolationIsSerializable()); + + /* + * Can't use serializable mode while recovery is still active, as it is, + * for example, on a hot standby. We could get here despite the check in + * check_XactIsoLevel() if default_transaction_isolation is set to + * serializable, so phrase the hint accordingly. + */ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use serializable mode in a hot standby"), + errdetail("\"default_transaction_isolation\" is set to \"serializable\"."), + errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default."))); + + /* + * A special optimization is available for SERIALIZABLE READ ONLY + * DEFERRABLE transactions -- we can wait for a suitable snapshot and + * thereby avoid all SSI overhead once it's running. + */ + if (XactReadOnly && XactDeferrable) + return GetSafeSnapshot(snapshot); + + return GetSerializableTransactionSnapshotInt(snapshot, + NULL, InvalidPid); +} + +/* + * Import a snapshot to be used for the current transaction. + * + * This is nearly the same as GetSerializableTransactionSnapshot, except that + * we don't take a new snapshot, but rather use the data we're handed. + * + * The caller must have verified that the snapshot came from a serializable + * transaction; and if we're read-write, the source transaction must not be + * read-only. + */ +void +SetSerializableTransactionSnapshot(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid) +{ + Assert(IsolationIsSerializable()); + + /* + * If this is called by parallel.c in a parallel worker, we don't want to + * create a SERIALIZABLEXACT just yet because the leader's + * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We + * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this + * case, because the leader has already determined that the snapshot it + * has passed us is safe. So there is nothing for us to do. + */ + if (IsParallelWorker()) + return; + + /* + * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to + * import snapshots, since there's no way to wait for a safe snapshot when + * we're using the snap we're told to. (XXX instead of throwing an error, + * we could just ignore the XactDeferrable flag?) + */ + if (XactReadOnly && XactDeferrable) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE"))); + + (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid, + sourcepid); +} + +/* + * Guts of GetSerializableTransactionSnapshot + * + * If sourcevxid is valid, this is actually an import operation and we should + * skip calling GetSnapshotData, because the snapshot contents are already + * loaded up. HOWEVER: to avoid race conditions, we must check that the + * source xact is still running after we acquire SerializableXactHashLock. + * We do that by calling ProcArrayInstallImportedXmin. + */ +static Snapshot +GetSerializableTransactionSnapshotInt(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid) +{ + PGPROC *proc; + VirtualTransactionId vxid; + SERIALIZABLEXACT *sxact, + *othersxact; + + /* We only do this for serializable transactions. Once. */ + Assert(MySerializableXact == InvalidSerializableXact); + + Assert(!RecoveryInProgress()); + + /* + * Since all parts of a serializable transaction must use the same + * snapshot, it is too late to establish one after a parallel operation + * has begun. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot establish serializable snapshot during a parallel operation"); + + proc = MyProc; + Assert(proc != NULL); + GET_VXID_FROM_PGPROC(vxid, *proc); + + /* + * First we get the sxact structure, which may involve looping and access + * to the "finished" list to free a structure for use. + * + * We must hold SerializableXactHashLock when taking/checking the snapshot + * to avoid race conditions, for much the same reasons that + * GetSnapshotData takes the ProcArrayLock. Since we might have to + * release SerializableXactHashLock to call SummarizeOldestCommittedSxact, + * this means we have to create the sxact first, which is a bit annoying + * (in particular, an elog(ERROR) in procarray.c would cause us to leak + * the sxact). Consider refactoring to avoid this. + */ +#ifdef TEST_SUMMARIZE_SERIAL + SummarizeOldestCommittedSxact(); +#endif + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + do + { + sxact = CreatePredXact(); + /* If null, push out committed sxact to SLRU summary & retry. */ + if (!sxact) + { + LWLockRelease(SerializableXactHashLock); + SummarizeOldestCommittedSxact(); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + } + } while (!sxact); + + /* Get the snapshot, or check that it's safe to use */ + if (!sourcevxid) + snapshot = GetSnapshotData(snapshot); + else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid)) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not import the requested snapshot"), + errdetail("The source process with PID %d is not running anymore.", + sourcepid))); + } + + /* + * If there are no serializable transactions which are not read-only, we + * can "opt out" of predicate locking and conflict checking for a + * read-only transaction. + * + * The reason this is safe is that a read-only transaction can only become + * part of a dangerous structure if it overlaps a writable transaction + * which in turn overlaps a writable transaction which committed before + * the read-only transaction started. A new writable transaction can + * overlap this one, but it can't meet the other condition of overlapping + * a transaction which committed before this one started. + */ + if (XactReadOnly && PredXact->WritableSxactCount == 0) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + return snapshot; + } + + /* Maintain serializable global xmin info. */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount == 0); + PredXact->SxactGlobalXmin = snapshot->xmin; + PredXact->SxactGlobalXminCount = 1; + SerialSetActiveSerXmin(snapshot->xmin); + } + else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + PredXact->SxactGlobalXminCount++; + } + else + { + Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin)); + } + + /* Initialize the structure. */ + sxact->vxid = vxid; + sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo; + sxact->prepareSeqNo = InvalidSerCommitSeqNo; + sxact->commitSeqNo = InvalidSerCommitSeqNo; + SHMQueueInit(&(sxact->outConflicts)); + SHMQueueInit(&(sxact->inConflicts)); + SHMQueueInit(&(sxact->possibleUnsafeConflicts)); + sxact->topXid = GetTopTransactionIdIfAny(); + sxact->finishedBefore = InvalidTransactionId; + sxact->xmin = snapshot->xmin; + sxact->pid = MyProcPid; + SHMQueueInit(&(sxact->predicateLocks)); + SHMQueueElemInit(&(sxact->finishedLink)); + sxact->flags = 0; + if (XactReadOnly) + { + sxact->flags |= SXACT_FLAG_READ_ONLY; + + /* + * Register all concurrent r/w transactions as possible conflicts; if + * all of them commit without any outgoing conflicts to earlier + * transactions then this snapshot can be deemed safe (and we can run + * without tracking predicate locks). + */ + for (othersxact = FirstPredXact(); + othersxact != NULL; + othersxact = NextPredXact(othersxact)) + { + if (!SxactIsCommitted(othersxact) + && !SxactIsDoomed(othersxact) + && !SxactIsReadOnly(othersxact)) + { + SetPossibleUnsafeConflict(sxact, othersxact); + } + } + } + else + { + ++(PredXact->WritableSxactCount); + Assert(PredXact->WritableSxactCount <= + (MaxBackends + max_prepared_xacts)); + } + + MySerializableXact = sxact; + MyXactDidWrite = false; /* haven't written anything yet */ + + LWLockRelease(SerializableXactHashLock); + + CreateLocalPredicateLockHash(); + + return snapshot; +} + +static void +CreateLocalPredicateLockHash(void) +{ + HASHCTL hash_ctl; + + /* Initialize the backend-local hash table of parent locks */ + Assert(LocalPredicateLockHash == NULL); + hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG); + hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK); + LocalPredicateLockHash = hash_create("Local predicate lock", + max_predicate_locks_per_xact, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); +} + +/* + * Register the top level XID in SerializableXidHash. + * Also store it for easy reference in MySerializableXact. + */ +void +RegisterPredicateLockingXid(TransactionId xid) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + bool found; + + /* + * If we're not tracking predicate lock data for this transaction, we + * should ignore the request and return quickly. + */ + if (MySerializableXact == InvalidSerializableXact) + return; + + /* We should have a valid XID and be at the top level. */ + Assert(TransactionIdIsValid(xid)); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* This should only be done once per transaction. */ + Assert(MySerializableXact->topXid == InvalidTransactionId); + + MySerializableXact->topXid = xid; + + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + Assert(!found); + + /* Initialize the structure. */ + sxid->myXact = MySerializableXact; + LWLockRelease(SerializableXactHashLock); +} + + +/* + * Check whether there are any predicate locks held by any transaction + * for the page at the given block number. + * + * Note that the transaction may be completed but not yet subject to + * cleanup due to overlapping serializable transactions. This must + * return valid information regardless of transaction isolation level. + * + * Also note that this doesn't check for a conflicting relation lock, + * just a lock specifically on the given page. + * + * One use is to support proper behavior during GiST index vacuum. + */ +bool +PageIsPredicateLocked(Relation relation, BlockNumber blkno) +{ + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + PREDICATELOCKTARGET *target; + + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + blkno); + + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + &targettag, targettaghash, + HASH_FIND, NULL); + LWLockRelease(partitionLock); + + return (target != NULL); +} + + +/* + * Check whether a particular lock is held by this transaction. + * + * Important note: this function may return false even if the lock is + * being held, because it uses the local lock table which is not + * updated if another transaction modifies our lock list (e.g. to + * split an index page). It can also return true when a coarser + * granularity lock that covers this target is being held. Be careful + * to only use this function in circumstances where such errors are + * acceptable! + */ +static bool +PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag) +{ + LOCALPREDICATELOCK *lock; + + /* check local hash table */ + lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + targettag, + HASH_FIND, NULL); + + if (!lock) + return false; + + /* + * Found entry in the table, but still need to check whether it's actually + * held -- it could just be a parent of some held lock. + */ + return lock->held; +} + +/* + * Return the parent lock tag in the lock hierarchy: the next coarser + * lock that covers the provided tag. + * + * Returns true and sets *parent to the parent tag if one exists, + * returns false if none exists. + */ +static bool +GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent) +{ + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + /* relation locks have no parent lock */ + return false; + + case PREDLOCKTAG_PAGE: + /* parent lock is relation lock */ + SET_PREDICATELOCKTARGETTAG_RELATION(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag)); + + return true; + + case PREDLOCKTAG_TUPLE: + /* parent lock is page lock */ + SET_PREDICATELOCKTARGETTAG_PAGE(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag), + GET_PREDICATELOCKTARGETTAG_PAGE(*tag)); + return true; + } + + /* not reachable */ + Assert(false); + return false; +} + +/* + * Check whether the lock we are considering is already covered by a + * coarser lock for our transaction. + * + * Like PredicateLockExists, this function might return a false + * negative, but it will never return a false positive. + */ +static bool +CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag) +{ + PREDICATELOCKTARGETTAG targettag, + parenttag; + + targettag = *newtargettag; + + /* check parents iteratively until no more */ + while (GetParentPredicateLockTag(&targettag, &parenttag)) + { + targettag = parenttag; + if (PredicateLockExists(&targettag)) + return true; + } + + /* no more parents to check; lock is not covered */ + return false; +} + +/* + * Remove the dummy entry from the predicate lock target hash, to free up some + * scratch space. The caller must be holding SerializablePredicateListLock, + * and must restore the entry with RestoreScratchTarget() before releasing the + * lock. + * + * If lockheld is true, the caller is already holding the partition lock + * of the partition containing the scratch entry. + */ +static void +RemoveScratchTarget(bool lockheld) +{ + bool found; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + if (!lockheld) + LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE); + hash_search_with_hash_value(PredicateLockTargetHash, + &ScratchTargetTag, + ScratchTargetTagHash, + HASH_REMOVE, &found); + Assert(found); + if (!lockheld) + LWLockRelease(ScratchPartitionLock); +} + +/* + * Re-insert the dummy entry in predicate lock target hash. + */ +static void +RestoreScratchTarget(bool lockheld) +{ + bool found; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + if (!lockheld) + LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE); + hash_search_with_hash_value(PredicateLockTargetHash, + &ScratchTargetTag, + ScratchTargetTagHash, + HASH_ENTER, &found); + Assert(!found); + if (!lockheld) + LWLockRelease(ScratchPartitionLock); +} + +/* + * Check whether the list of related predicate locks is empty for a + * predicate lock target, and remove the target if it is. + */ +static void +RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash) +{ + PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + /* Can't remove it until no locks at this target. */ + if (!SHMQueueEmpty(&target->predicateLocks)) + return; + + /* Actually remove the target. */ + rmtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &target->tag, + targettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == target); +} + +/* + * Delete child target locks owned by this process. + * This implementation is assuming that the usage of each target tag field + * is uniform. No need to make this hard if we don't have to. + * + * We acquire an LWLock in the case of parallel mode, because worker + * backends have access to the leader's SERIALIZABLEXACT. Otherwise, + * we aren't acquiring LWLocks for the predicate lock or lock + * target structures associated with this transaction unless we're going + * to modify them, because no other process is permitted to modify our + * locks. + */ +static void +DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) +{ + SERIALIZABLEXACT *sxact; + PREDICATELOCK *predlock; + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + sxact = MySerializableXact; + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + SHM_QUEUE *predlocksxactlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG oldlocktag; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGETTAG oldtargettag; + + predlocksxactlink = &(predlock->xactLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + predlocksxactlink, + offsetof(PREDICATELOCK, xactLink)); + + oldlocktag = predlock->tag; + Assert(oldlocktag.myXact == sxact); + oldtarget = oldlocktag.myTarget; + oldtargettag = oldtarget->tag; + + if (TargetTagIsCoveredBy(oldtargettag, *newtargettag)) + { + uint32 oldtargettaghash; + LWLock *partitionLock; + PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY; + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + partitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + SHMQueueDelete(predlocksxactlink); + SHMQueueDelete(&(predlock->targetLink)); + rmpredlock = hash_search_with_hash_value + (PredicateLockHash, + &oldlocktag, + PredicateLockHashCodeFromTargetHashCode(&oldlocktag, + oldtargettaghash), + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash); + + LWLockRelease(partitionLock); + + DecrementParentLocks(&oldtargettag); + } + + predlock = nextpredlock; + } + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * Returns the promotion limit for a given predicate lock target. This is the + * max number of descendant locks allowed before promoting to the specified + * tag. Note that the limit includes non-direct descendants (e.g., both tuples + * and pages for a relation lock). + * + * Currently the default limit is 2 for a page lock, and half of the value of + * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior + * of earlier releases when upgrading. + * + * TODO SSI: We should probably add additional GUCs to allow a maximum ratio + * of page and tuple locks based on the pages in a relation, and the maximum + * ratio of tuple locks to tuples in a page. This would provide more + * generally "balanced" allocation of locks to where they are most useful, + * while still allowing the absolute numbers to prevent one relation from + * tying up all predicate lock resources. + */ +static int +MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag) +{ + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + return max_predicate_locks_per_relation < 0 + ? (max_predicate_locks_per_xact + / (-max_predicate_locks_per_relation)) - 1 + : max_predicate_locks_per_relation; + + case PREDLOCKTAG_PAGE: + return max_predicate_locks_per_page; + + case PREDLOCKTAG_TUPLE: + + /* + * not reachable: nothing is finer-granularity than a tuple, so we + * should never try to promote to it. + */ + Assert(false); + return 0; + } + + /* not reachable */ + Assert(false); + return 0; +} + +/* + * For all ancestors of a newly-acquired predicate lock, increment + * their child count in the parent hash table. If any of them have + * more descendants than their promotion threshold, acquire the + * coarsest such lock. + * + * Returns true if a parent lock was acquired and false otherwise. + */ +static bool +CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag) +{ + PREDICATELOCKTARGETTAG targettag, + nexttag, + promotiontag; + LOCALPREDICATELOCK *parentlock; + bool found, + promote; + + promote = false; + + targettag = *reqtag; + + /* check parents iteratively */ + while (GetParentPredicateLockTag(&targettag, &nexttag)) + { + targettag = nexttag; + parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + &targettag, + HASH_ENTER, + &found); + if (!found) + { + parentlock->held = false; + parentlock->childLocks = 1; + } + else + parentlock->childLocks++; + + if (parentlock->childLocks > + MaxPredicateChildLocks(&targettag)) + { + /* + * We should promote to this parent lock. Continue to check its + * ancestors, however, both to get their child counts right and to + * check whether we should just go ahead and promote to one of + * them. + */ + promotiontag = targettag; + promote = true; + } + } + + if (promote) + { + /* acquire coarsest ancestor eligible for promotion */ + PredicateLockAcquire(&promotiontag); + return true; + } + else + return false; +} + +/* + * When releasing a lock, decrement the child count on all ancestor + * locks. + * + * This is called only when releasing a lock via + * DeleteChildTargetLocks (i.e. when a lock becomes redundant because + * we've acquired its parent, possibly due to promotion) or when a new + * MVCC write lock makes the predicate lock unnecessary. There's no + * point in calling it when locks are released at transaction end, as + * this information is no longer needed. + */ +static void +DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag) +{ + PREDICATELOCKTARGETTAG parenttag, + nexttag; + + parenttag = *targettag; + + while (GetParentPredicateLockTag(&parenttag, &nexttag)) + { + uint32 targettaghash; + LOCALPREDICATELOCK *parentlock, + *rmlock PG_USED_FOR_ASSERTS_ONLY; + + parenttag = nexttag; + targettaghash = PredicateLockTargetTagHashCode(&parenttag); + parentlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_FIND, NULL); + + /* + * There's a small chance the parent lock doesn't exist in the lock + * table. This can happen if we prematurely removed it because an + * index split caused the child refcount to be off. + */ + if (parentlock == NULL) + continue; + + parentlock->childLocks--; + + /* + * Under similar circumstances the parent lock's refcount might be + * zero. This only happens if we're holding that lock (otherwise we + * would have removed the entry). + */ + if (parentlock->childLocks < 0) + { + Assert(parentlock->held); + parentlock->childLocks = 0; + } + + if ((parentlock->childLocks == 0) && (!parentlock->held)) + { + rmlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlock == parentlock); + } + } +} + +/* + * Indicate that a predicate lock on the given target is held by the + * specified transaction. Has no effect if the lock is already held. + * + * This updates the lock table and the sxact's lock list, and creates + * the lock target if necessary, but does *not* do anything related to + * granularity promotion or the local lock table. See + * PredicateLockAcquire for that. + */ +static void +CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, + uint32 targettaghash, + SERIALIZABLEXACT *sxact) +{ + PREDICATELOCKTARGET *target; + PREDICATELOCKTAG locktag; + PREDICATELOCK *lock; + LWLock *partitionLock; + bool found; + + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* Make sure that the target is represented. */ + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_ENTER_NULL, &found); + if (!target) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_pred_locks_per_transaction."))); + if (!found) + SHMQueueInit(&(target->predicateLocks)); + + /* We've got the sxact and target, make sure they're joined. */ + locktag.myTarget = target; + locktag.myXact = sxact; + lock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, &locktag, + PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash), + HASH_ENTER_NULL, &found); + if (!lock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_pred_locks_per_transaction."))); + + if (!found) + { + SHMQueueInsertBefore(&(target->predicateLocks), &(lock->targetLink)); + SHMQueueInsertBefore(&(sxact->predicateLocks), + &(lock->xactLink)); + lock->commitSeqNo = InvalidSerCommitSeqNo; + } + + LWLockRelease(partitionLock); + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * Acquire a predicate lock on the specified target for the current + * connection if not already held. This updates the local lock table + * and uses it to implement granularity promotion. It will consolidate + * multiple locks into a coarser lock if warranted, and will release + * any finer-grained locks covered by the new one. + */ +static void +PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag) +{ + uint32 targettaghash; + bool found; + LOCALPREDICATELOCK *locallock; + + /* Do we have the lock already, or a covering lock? */ + if (PredicateLockExists(targettag)) + return; + + if (CoarserLockCovers(targettag)) + return; + + /* the same hash and LW lock apply to the lock target and the local lock. */ + targettaghash = PredicateLockTargetTagHashCode(targettag); + + /* Acquire lock in local table */ + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_ENTER, &found); + locallock->held = true; + if (!found) + locallock->childLocks = 0; + + /* Actually create the lock */ + CreatePredicateLock(targettag, targettaghash, MySerializableXact); + + /* + * Lock has been acquired. Check whether it should be promoted to a + * coarser granularity, or whether there are finer-granularity locks to + * clean up. + */ + if (CheckAndPromotePredicateLockRequest(targettag)) + { + /* + * Lock request was promoted to a coarser-granularity lock, and that + * lock was acquired. It will delete this lock and any of its + * children, so we're done. + */ + } + else + { + /* Clean up any finer-granularity locks */ + if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE) + DeleteChildTargetLocks(targettag); + } +} + + +/* + * PredicateLockRelation + * + * Gets a predicate lock at the relation level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Clear any finer-grained predicate locks this session has on the relation. + */ +void +PredicateLockRelation(Relation relation, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_node.dbNode, + relation->rd_id); + PredicateLockAcquire(&tag); +} + +/* + * PredicateLockPage + * + * Gets a predicate lock at the page level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Skip if a coarser predicate lock already covers this page. + * Clear any finer-grained predicate locks this session has on the relation. + */ +void +PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + SET_PREDICATELOCKTARGETTAG_PAGE(tag, + relation->rd_node.dbNode, + relation->rd_id, + blkno); + PredicateLockAcquire(&tag); +} + +/* + * PredicateLockTID + * + * Gets a predicate lock at the tuple level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + */ +void +PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + TransactionId tuple_xid) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + /* + * Return if this xact wrote it. + */ + if (relation->rd_index == NULL) + { + /* If we wrote it; we already have a write lock. */ + if (TransactionIdIsCurrentTransactionId(tuple_xid)) + return; + } + + /* + * Do quick-but-not-definitive test for a relation lock first. This will + * never cause a return when the relation is *not* locked, but will + * occasionally let the check continue when there really *is* a relation + * level lock. + */ + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_node.dbNode, + relation->rd_id); + if (PredicateLockExists(&tag)) + return; + + SET_PREDICATELOCKTARGETTAG_TUPLE(tag, + relation->rd_node.dbNode, + relation->rd_id, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + PredicateLockAcquire(&tag); +} + + +/* + * DeleteLockTarget + * + * Remove a predicate lock target along with any locks held for it. + * + * Caller must hold SerializablePredicateListLock and the + * appropriate hash partition lock for the target. + */ +static void +DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash) +{ + PREDICATELOCK *predlock; + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + bool found; + + Assert(LWLockHeldByMeInMode(SerializablePredicateListLock, + LW_EXCLUSIVE)); + Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash))); + + predlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (predlock) + { + predlocktargetlink = &(predlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + + SHMQueueDelete(&(predlock->xactLink)); + SHMQueueDelete(&(predlock->targetLink)); + + hash_search_with_hash_value + (PredicateLockHash, + &predlock->tag, + PredicateLockHashCodeFromTargetHashCode(&predlock->tag, + targettaghash), + HASH_REMOVE, &found); + Assert(found); + + predlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + + /* Remove the target itself, if possible. */ + RemoveTargetIfNoLongerUsed(target, targettaghash); +} + + +/* + * TransferPredicateLocksToNewTarget + * + * Move or copy all the predicate locks for a lock target, for use by + * index page splits/combines and other things that create or replace + * lock targets. If 'removeOld' is true, the old locks and the target + * will be removed. + * + * Returns true on success, or false if we ran out of shared memory to + * allocate the new target or locks. Guaranteed to always succeed if + * removeOld is set (by using the scratch entry in PredicateLockTargetHash + * for scratch space). + * + * Warning: the "removeOld" option should be used only with care, + * because this function does not (indeed, can not) update other + * backends' LocalPredicateLockHash. If we are only adding new + * entries, this is not a problem: the local lock table is used only + * as a hint, so missing entries for locks that are held are + * OK. Having entries for locks that are no longer held, as can happen + * when using "removeOld", is not in general OK. We can only use it + * safely when replacing a lock with a coarser-granularity lock that + * covers it, or if we are absolutely certain that no one will need to + * refer to that lock in the future. + * + * Caller must hold SerializablePredicateListLock exclusively. + */ +static bool +TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, + PREDICATELOCKTARGETTAG newtargettag, + bool removeOld) +{ + uint32 oldtargettaghash; + LWLock *oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLock *newpartitionLock; + bool found; + bool outOfShmem = false; + + Assert(LWLockHeldByMeInMode(SerializablePredicateListLock, + LW_EXCLUSIVE)); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + if (removeOld) + { + /* + * Remove the dummy entry to give us scratch space, so we know we'll + * be able to create the new lock target. + */ + RemoveScratchTarget(false); + } + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. + */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, + (removeOld ? LW_EXCLUSIVE : LW_SHARED)); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, + (removeOld ? LW_EXCLUSIVE : LW_SHARED)); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and move or copy them to the new + * target. + */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + PREDICATELOCK *oldpredlock; + PREDICATELOCKTAG newpredlocktag; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_ENTER_NULL, &found); + + if (!newtarget) + { + /* Failed to allocate due to insufficient shmem */ + outOfShmem = true; + goto exit; + } + + /* If we created a new entry, initialize it */ + if (!found) + SHMQueueInit(&(newtarget->predicateLocks)); + + newpredlocktag.myTarget = newtarget; + + /* + * Loop through all the locks on the old target, replacing them with + * locks on the new target. + */ + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + while (oldpredlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo; + + predlocktargetlink = &(oldpredlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + newpredlocktag.myXact = oldpredlock->tag.myXact; + + if (removeOld) + { + SHMQueueDelete(&(oldpredlock->xactLink)); + SHMQueueDelete(&(oldpredlock->targetLink)); + + hash_search_with_hash_value + (PredicateLockHash, + &oldpredlock->tag, + PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, + oldtargettaghash), + HASH_REMOVE, &found); + Assert(found); + } + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER_NULL, + &found); + if (!newpredlock) + { + /* Out of shared memory. Undo what we've done so far. */ + LWLockRelease(SerializableXactHashLock); + DeleteLockTarget(newtarget, newtargettaghash); + outOfShmem = true; + goto exit; + } + if (!found) + { + SHMQueueInsertBefore(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + newpredlock->commitSeqNo = oldCommitSeqNo; + } + else + { + if (newpredlock->commitSeqNo < oldCommitSeqNo) + newpredlock->commitSeqNo = oldCommitSeqNo; + } + + Assert(newpredlock->commitSeqNo != 0); + Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo) + || (newpredlock->tag.myXact == OldCommittedSxact)); + + oldpredlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + + if (removeOld) + { + Assert(SHMQueueEmpty(&oldtarget->predicateLocks)); + RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash); + } + } + + +exit: + /* Release partition locks in reverse order of acquisition. */ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + + if (removeOld) + { + /* We shouldn't run out of memory if we're moving locks */ + Assert(!outOfShmem); + + /* Put the scratch entry back */ + RestoreScratchTarget(false); + } + + return !outOfShmem; +} + +/* + * Drop all predicate locks of any granularity from the specified relation, + * which can be a heap relation or an index relation. If 'transfer' is true, + * acquire a relation lock on the heap for any transactions with any lock(s) + * on the specified relation. + * + * This requires grabbing a lot of LW locks and scanning the entire lock + * target table for matches. That makes this more expensive than most + * predicate lock management functions, but it will only be called for DDL + * type commands that are expensive anyway, and there are fast returns when + * no serializable transactions are active or the relation is temporary. + * + * We don't use the TransferPredicateLocksToNewTarget function because it + * acquires its own locks on the partitions of the two targets involved, + * and we'll already be holding all partition locks. + * + * We can't throw an error from here, because the call could be from a + * transaction which is not serializable. + * + * NOTE: This is currently only called with transfer set to true, but that may + * change. If we decide to clean up the locks from a table on commit of a + * transaction which executed DROP TABLE, the false condition will be useful. + */ +static void +DropAllPredicateLocksFromTable(Relation relation, bool transfer) +{ + HASH_SEQ_STATUS seqstat; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGET *heaptarget; + Oid dbId; + Oid relId; + Oid heapId; + int i; + bool isIndex; + bool found; + uint32 heaptargettaghash; + + /* + * Bail out quickly if there are no serializable transactions running. + * It's safe to check this without taking locks because the caller is + * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which + * would matter here can be acquired while that is held. + */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!PredicateLockingNeededForRelation(relation)) + return; + + dbId = relation->rd_node.dbNode; + relId = relation->rd_id; + if (relation->rd_index == NULL) + { + isIndex = false; + heapId = relId; + } + else + { + isIndex = true; + heapId = relation->rd_index->indrelid; + } + Assert(heapId != InvalidOid); + Assert(transfer || !isIndex); /* index OID only makes sense with + * transfer */ + + /* Retrieve first time needed, then keep. */ + heaptargettaghash = 0; + heaptarget = NULL; + + /* Acquire locks on all lock partitions */ + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the dummy entry to give us scratch space, so we know we'll be + * able to create the new lock target. + */ + if (transfer) + RemoveScratchTarget(true); + + /* Scan through target map */ + hash_seq_init(&seqstat, PredicateLockTargetHash); + + while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat))) + { + PREDICATELOCK *oldpredlock; + + /* + * Check whether this is a target which needs attention. + */ + if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId) + continue; /* wrong relation id */ + if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId) + continue; /* wrong database id */ + if (transfer && !isIndex + && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION) + continue; /* already the right lock */ + + /* + * If we made it here, we have work to do. We make sure the heap + * relation lock exists, then we walk the list of predicate locks for + * the old target we found, moving all locks to the heap relation lock + * -- unless they already hold that. + */ + + /* + * First make sure we have the heap relation target. We only need to + * do this once. + */ + if (transfer && heaptarget == NULL) + { + PREDICATELOCKTARGETTAG heaptargettag; + + SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId); + heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag); + heaptarget = hash_search_with_hash_value(PredicateLockTargetHash, + &heaptargettag, + heaptargettaghash, + HASH_ENTER, &found); + if (!found) + SHMQueueInit(&heaptarget->predicateLocks); + } + + /* + * Loop through all the locks on the old target, replacing them with + * locks on the new target. + */ + oldpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldtarget->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + while (oldpredlock) + { + PREDICATELOCK *nextpredlock; + PREDICATELOCK *newpredlock; + SerCommitSeqNo oldCommitSeqNo; + SERIALIZABLEXACT *oldXact; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(oldtarget->predicateLocks), + &(oldpredlock->targetLink), + offsetof(PREDICATELOCK, targetLink)); + + /* + * Remove the old lock first. This avoids the chance of running + * out of lock structure entries for the hash table. + */ + oldCommitSeqNo = oldpredlock->commitSeqNo; + oldXact = oldpredlock->tag.myXact; + + SHMQueueDelete(&(oldpredlock->xactLink)); + + /* + * No need for retail delete from oldtarget list, we're removing + * the whole target anyway. + */ + hash_search(PredicateLockHash, + &oldpredlock->tag, + HASH_REMOVE, &found); + Assert(found); + + if (transfer) + { + PREDICATELOCKTAG newpredlocktag; + + newpredlocktag.myTarget = heaptarget; + newpredlocktag.myXact = oldXact; + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + heaptargettaghash), + HASH_ENTER, + &found); + if (!found) + { + SHMQueueInsertBefore(&(heaptarget->predicateLocks), + &(newpredlock->targetLink)); + SHMQueueInsertBefore(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + newpredlock->commitSeqNo = oldCommitSeqNo; + } + else + { + if (newpredlock->commitSeqNo < oldCommitSeqNo) + newpredlock->commitSeqNo = oldCommitSeqNo; + } + + Assert(newpredlock->commitSeqNo != 0); + Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo) + || (newpredlock->tag.myXact == OldCommittedSxact)); + } + + oldpredlock = nextpredlock; + } + + hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE, + &found); + Assert(found); + } + + /* Put the scratch entry back */ + if (transfer) + RestoreScratchTarget(true); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * TransferPredicateLocksToHeapRelation + * For all transactions, transfer all predicate locks for the given + * relation to a single relation lock on the heap. + */ +void +TransferPredicateLocksToHeapRelation(Relation relation) +{ + DropAllPredicateLocksFromTable(relation, true); +} + + +/* + * PredicateLockPageSplit + * + * Copies any predicate locks for the old page to the new page. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page split (or overflow) affects all serializable transactions, + * even if it occurs in the context of another transaction isolation level. + * + * NOTE: This currently leaves the local copy of the locks without + * information on the new lock which is in shared memory. This could cause + * problems if enough page splits occur on locked pages without the processes + * which hold the locks getting in and noticing. + */ +void +PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + BlockNumber newblkno) +{ + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + bool success; + + /* + * Bail out quickly if there are no serializable transactions running. + * + * It's safe to do this check without taking any additional locks. Even if + * a serializable transaction starts concurrently, we know it can't take + * any SIREAD locks on the page being split because the caller is holding + * the associated buffer page lock. Memory reordering isn't an issue; the + * memory barrier in the LWLock acquisition guarantees that this read + * occurs while the buffer page lock is held. + */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!PredicateLockingNeededForRelation(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_node.dbNode, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_node.dbNode, + relation->rd_id, + newblkno); + + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + + /* + * Try copying the locks over to the new page's tag, creating it if + * necessary. + */ + success = TransferPredicateLocksToNewTarget(oldtargettag, + newtargettag, + false); + + if (!success) + { + /* + * No more predicate lock entries are available. Failure isn't an + * option here, so promote the page lock to a relation lock. + */ + + /* Get the parent relation lock's lock tag */ + success = GetParentPredicateLockTag(&oldtargettag, + &newtargettag); + Assert(success); + + /* + * Move the locks to the parent. This shouldn't fail. + * + * Note that here we are removing locks held by other backends, + * leading to a possible inconsistency in their local lock hash table. + * This is OK because we're replacing it with a lock that covers the + * old one. + */ + success = TransferPredicateLocksToNewTarget(oldtargettag, + newtargettag, + true); + Assert(success); + } + + LWLockRelease(SerializablePredicateListLock); +} + +/* + * PredicateLockPageCombine + * + * Combines predicate locks for two existing pages. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page combine affects all serializable transactions, even if it + * occurs in the context of another transaction isolation level. + */ +void +PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + BlockNumber newblkno) +{ + /* + * Page combines differ from page splits in that we ought to be able to + * remove the locks on the old page after transferring them to the new + * page, instead of duplicating them. However, because we can't edit other + * backends' local lock tables, removing the old lock would leave them + * with an entry in their LocalPredicateLockHash for a lock they're not + * holding, which isn't acceptable. So we wind up having to do the same + * work as a page split, acquiring a lock on the new page and keeping the + * old page locked too. That can lead to some false positives, but should + * be rare in practice. + */ + PredicateLockPageSplit(relation, oldblkno, newblkno); +} + +/* + * Walk the list of in-progress serializable transactions and find the new + * xmin. + */ +static void +SetNewSxactGlobalXmin(void) +{ + SERIALIZABLEXACT *sxact; + + Assert(LWLockHeldByMe(SerializableXactHashLock)); + + PredXact->SxactGlobalXmin = InvalidTransactionId; + PredXact->SxactGlobalXminCount = 0; + + for (sxact = FirstPredXact(); sxact != NULL; sxact = NextPredXact(sxact)) + { + if (!SxactIsRolledBack(sxact) + && !SxactIsCommitted(sxact) + && sxact != OldCommittedSxact) + { + Assert(sxact->xmin != InvalidTransactionId); + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin) + || TransactionIdPrecedes(sxact->xmin, + PredXact->SxactGlobalXmin)) + { + PredXact->SxactGlobalXmin = sxact->xmin; + PredXact->SxactGlobalXminCount = 1; + } + else if (TransactionIdEquals(sxact->xmin, + PredXact->SxactGlobalXmin)) + PredXact->SxactGlobalXminCount++; + } + } + + SerialSetActiveSerXmin(PredXact->SxactGlobalXmin); +} + +/* + * ReleasePredicateLocks + * + * Releases predicate locks based on completion of the current transaction, + * whether committed or rolled back. It can also be called for a read only + * transaction when it becomes impossible for the transaction to become + * part of a dangerous structure. + * + * We do nothing unless this is a serializable transaction. + * + * This method must ensure that shared memory hash tables are cleaned + * up in some relatively timely fashion. + * + * If this transaction is committing and is holding any predicate locks, + * it must be added to a list of completed serializable transactions still + * holding locks. + * + * If isReadOnlySafe is true, then predicate locks are being released before + * the end of the transaction because MySerializableXact has been determined + * to be RO_SAFE. In non-parallel mode we can release it completely, but it + * in parallel mode we partially release the SERIALIZABLEXACT and keep it + * around until the end of the transaction, allowing each backend to clear its + * MySerializableXact variable and benefit from the optimization in its own + * time. + */ +void +ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) +{ + bool needToClear; + RWConflict conflict, + nextConflict, + possibleUnsafeConflict; + SERIALIZABLEXACT *roXact; + + /* + * We can't trust XactReadOnly here, because a transaction which started + * as READ WRITE can show as READ ONLY later, e.g., within + * subtransactions. We want to flag a transaction as READ ONLY if it + * commits without writing so that de facto READ ONLY transactions get the + * benefit of some RO optimizations, so we will use this local variable to + * get some cleanup logic right which is based on whether the transaction + * was declared READ ONLY at the top level. + */ + bool topLevelIsDeclaredReadOnly; + + /* We can't be both committing and releasing early due to RO_SAFE. */ + Assert(!(isCommit && isReadOnlySafe)); + + /* Are we at the end of a transaction, that is, a commit or abort? */ + if (!isReadOnlySafe) + { + /* + * Parallel workers mustn't release predicate locks at the end of + * their transaction. The leader will do that at the end of its + * transaction. + */ + if (IsParallelWorker()) + { + ReleasePredicateLocksLocal(); + return; + } + + /* + * By the time the leader in a parallel query reaches end of + * transaction, it has waited for all workers to exit. + */ + Assert(!ParallelContextActive()); + + /* + * If the leader in a parallel query earlier stashed a partially + * released SERIALIZABLEXACT for final clean-up at end of transaction + * (because workers might still have been accessing it), then it's + * time to restore it. + */ + if (SavedSerializableXact != InvalidSerializableXact) + { + Assert(MySerializableXact == InvalidSerializableXact); + MySerializableXact = SavedSerializableXact; + SavedSerializableXact = InvalidSerializableXact; + Assert(SxactIsPartiallyReleased(MySerializableXact)); + } + } + + if (MySerializableXact == InvalidSerializableXact) + { + Assert(LocalPredicateLockHash == NULL); + return; + } + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * If the transaction is committing, but it has been partially released + * already, then treat this as a roll back. It was marked as rolled back. + */ + if (isCommit && SxactIsPartiallyReleased(MySerializableXact)) + isCommit = false; + + /* + * If we're called in the middle of a transaction because we discovered + * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release + * it (that is, release the predicate locks and conflicts, but not the + * SERIALIZABLEXACT itself) if we're the first backend to have noticed. + */ + if (isReadOnlySafe && IsInParallelMode()) + { + /* + * The leader needs to stash a pointer to it, so that it can + * completely release it at end-of-transaction. + */ + if (!IsParallelWorker()) + SavedSerializableXact = MySerializableXact; + + /* + * The first backend to reach this condition will partially release + * the SERIALIZABLEXACT. All others will just clear their + * backend-local state so that they stop doing SSI checks for the rest + * of the transaction. + */ + if (SxactIsPartiallyReleased(MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + ReleasePredicateLocksLocal(); + return; + } + else + { + MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED; + /* ... and proceed to perform the partial release below. */ + } + } + Assert(!isCommit || SxactIsPrepared(MySerializableXact)); + Assert(!isCommit || !SxactIsDoomed(MySerializableXact)); + Assert(!SxactIsCommitted(MySerializableXact)); + Assert(SxactIsPartiallyReleased(MySerializableXact) + || !SxactIsRolledBack(MySerializableXact)); + + /* may not be serializable during COMMIT/ROLLBACK PREPARED */ + Assert(MySerializableXact->pid == 0 || IsolationIsSerializable()); + + /* We'd better not already be on the cleanup list. */ + Assert(!SxactIsOnFinishedList(MySerializableXact)); + + topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact); + + /* + * We don't hold XidGenLock lock here, assuming that TransactionId is + * atomic! + * + * If this value is changing, we don't care that much whether we get the + * old or new value -- it is just used to determine how far + * SxactGlobalXmin must advance before this transaction can be fully + * cleaned up. The worst that could happen is we wait for one more + * transaction to complete before freeing some RAM; correctness of visible + * behavior is not affected. + */ + MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If it's not a commit it's either a rollback or a read-only transaction + * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately. + */ + if (isCommit) + { + MySerializableXact->flags |= SXACT_FLAG_COMMITTED; + MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo); + /* Recognize implicit read-only transaction (commit without write). */ + if (!MyXactDidWrite) + MySerializableXact->flags |= SXACT_FLAG_READ_ONLY; + } + else + { + /* + * The DOOMED flag indicates that we intend to roll back this + * transaction and so it should not cause serialization failures for + * other transactions that conflict with it. Note that this flag might + * already be set, if another backend marked this transaction for + * abort. + * + * The ROLLED_BACK flag further indicates that ReleasePredicateLocks + * has been called, and so the SerializableXact is eligible for + * cleanup. This means it should not be considered when calculating + * SxactGlobalXmin. + */ + MySerializableXact->flags |= SXACT_FLAG_DOOMED; + MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK; + + /* + * If the transaction was previously prepared, but is now failing due + * to a ROLLBACK PREPARED or (hopefully very rare) error after the + * prepare, clear the prepared flag. This simplifies conflict + * checking. + */ + MySerializableXact->flags &= ~SXACT_FLAG_PREPARED; + } + + if (!topLevelIsDeclaredReadOnly) + { + Assert(PredXact->WritableSxactCount > 0); + if (--(PredXact->WritableSxactCount) == 0) + { + /* + * Release predicate locks and rw-conflicts in for all committed + * transactions. There are no longer any transactions which might + * conflict with the locks and no chance for new transactions to + * overlap. Similarly, existing conflicts in can't cause pivots, + * and any conflicts in which could have completed a dangerous + * structure would already have caused a rollback, so any + * remaining ones must be benign. + */ + PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo; + } + } + else + { + /* + * Read-only transactions: clear the list of transactions that might + * make us unsafe. Note that we use 'inLink' for the iteration as + * opposed to 'outLink' for the r/w xacts. + */ + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts, + &MySerializableXact->possibleUnsafeConflicts, + offsetof(RWConflictData, inLink)); + while (possibleUnsafeConflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts, + &possibleUnsafeConflict->inLink, + offsetof(RWConflictData, inLink)); + + Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut)); + Assert(MySerializableXact == possibleUnsafeConflict->sxactIn); + + ReleaseRWConflict(possibleUnsafeConflict); + + possibleUnsafeConflict = nextConflict; + } + } + + /* Check for conflict out to old committed transactions. */ + if (isCommit + && !SxactIsReadOnly(MySerializableXact) + && SxactHasSummaryConflictOut(MySerializableXact)) + { + /* + * we don't know which old committed transaction we conflicted with, + * so be conservative and use FirstNormalSerCommitSeqNo here + */ + MySerializableXact->SeqNo.earliestOutConflictCommit = + FirstNormalSerCommitSeqNo; + MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT; + } + + /* + * Release all outConflicts to committed transactions. If we're rolling + * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to + * previously committed transactions. + */ + conflict = (RWConflict) + SHMQueueNext(&MySerializableXact->outConflicts, + &MySerializableXact->outConflicts, + offsetof(RWConflictData, outLink)); + while (conflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->outConflicts, + &conflict->outLink, + offsetof(RWConflictData, outLink)); + + if (isCommit + && !SxactIsReadOnly(MySerializableXact) + && SxactIsCommitted(conflict->sxactIn)) + { + if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0 + || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit) + MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo; + MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT; + } + + if (!isCommit + || SxactIsCommitted(conflict->sxactIn) + || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo)) + ReleaseRWConflict(conflict); + + conflict = nextConflict; + } + + /* + * Release all inConflicts from committed and read-only transactions. If + * we're rolling back, clear them all. + */ + conflict = (RWConflict) + SHMQueueNext(&MySerializableXact->inConflicts, + &MySerializableXact->inConflicts, + offsetof(RWConflictData, inLink)); + while (conflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->inConflicts, + &conflict->inLink, + offsetof(RWConflictData, inLink)); + + if (!isCommit + || SxactIsCommitted(conflict->sxactOut) + || SxactIsReadOnly(conflict->sxactOut)) + ReleaseRWConflict(conflict); + + conflict = nextConflict; + } + + if (!topLevelIsDeclaredReadOnly) + { + /* + * Remove ourselves from the list of possible conflicts for concurrent + * READ ONLY transactions, flagging them as unsafe if we have a + * conflict out. If any are waiting DEFERRABLE transactions, wake them + * up if they are known safe or known unsafe. + */ + possibleUnsafeConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts, + &MySerializableXact->possibleUnsafeConflicts, + offsetof(RWConflictData, outLink)); + while (possibleUnsafeConflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->possibleUnsafeConflicts, + &possibleUnsafeConflict->outLink, + offsetof(RWConflictData, outLink)); + + roXact = possibleUnsafeConflict->sxactIn; + Assert(MySerializableXact == possibleUnsafeConflict->sxactOut); + Assert(SxactIsReadOnly(roXact)); + + /* Mark conflicted if necessary. */ + if (isCommit + && MyXactDidWrite + && SxactHasConflictOut(MySerializableXact) + && (MySerializableXact->SeqNo.earliestOutConflictCommit + <= roXact->SeqNo.lastCommitBeforeSnapshot)) + { + /* + * This releases possibleUnsafeConflict (as well as all other + * possible conflicts for roXact) + */ + FlagSxactUnsafe(roXact); + } + else + { + ReleaseRWConflict(possibleUnsafeConflict); + + /* + * If we were the last possible conflict, flag it safe. The + * transaction can now safely release its predicate locks (but + * that transaction's backend has to do that itself). + */ + if (SHMQueueEmpty(&roXact->possibleUnsafeConflicts)) + roXact->flags |= SXACT_FLAG_RO_SAFE; + } + + /* + * Wake up the process for a waiting DEFERRABLE transaction if we + * now know it's either safe or conflicted. + */ + if (SxactIsDeferrableWaiting(roXact) && + (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact))) + ProcSendSignal(roXact->pid); + + possibleUnsafeConflict = nextConflict; + } + } + + /* + * Check whether it's time to clean up old transactions. This can only be + * done when the last serializable transaction with the oldest xmin among + * serializable transactions completes. We then find the "new oldest" + * xmin and purge any transactions which finished before this transaction + * was launched. + */ + needToClear = false; + if (TransactionIdEquals(MySerializableXact->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + if (--(PredXact->SxactGlobalXminCount) == 0) + { + SetNewSxactGlobalXmin(); + needToClear = true; + } + } + + LWLockRelease(SerializableXactHashLock); + + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + + /* Add this to the list of transactions to check for later cleanup. */ + if (isCommit) + SHMQueueInsertBefore(FinishedSerializableTransactions, + &MySerializableXact->finishedLink); + + /* + * If we're releasing a RO_SAFE transaction in parallel mode, we'll only + * partially release it. That's necessary because other backends may have + * a reference to it. The leader will release the SERIALIZABLEXACT itself + * at the end of the transaction after workers have stopped running. + */ + if (!isCommit) + ReleaseOneSerializableXact(MySerializableXact, + isReadOnlySafe && IsInParallelMode(), + false); + + LWLockRelease(SerializableFinishedListLock); + + if (needToClear) + ClearOldPredicateLocks(); + + ReleasePredicateLocksLocal(); +} + +static void +ReleasePredicateLocksLocal(void) +{ + MySerializableXact = InvalidSerializableXact; + MyXactDidWrite = false; + + /* Delete per-transaction lock table */ + if (LocalPredicateLockHash != NULL) + { + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + } +} + +/* + * Clear old predicate locks, belonging to committed transactions that are no + * longer interesting to any in-progress transaction. + */ +static void +ClearOldPredicateLocks(void) +{ + SERIALIZABLEXACT *finishedSxact; + PREDICATELOCK *predlock; + + /* + * Loop through finished transactions. They are in commit order, so we can + * stop as soon as we find one that's still interesting. + */ + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + finishedSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + FinishedSerializableTransactions, + offsetof(SERIALIZABLEXACT, finishedLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (finishedSxact) + { + SERIALIZABLEXACT *nextSxact; + + nextSxact = (SERIALIZABLEXACT *) + SHMQueueNext(FinishedSerializableTransactions, + &(finishedSxact->finishedLink), + offsetof(SERIALIZABLEXACT, finishedLink)); + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin) + || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore, + PredXact->SxactGlobalXmin)) + { + /* + * This transaction committed before any in-progress transaction + * took its snapshot. It's no longer interesting. + */ + LWLockRelease(SerializableXactHashLock); + SHMQueueDelete(&(finishedSxact->finishedLink)); + ReleaseOneSerializableXact(finishedSxact, false, false); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough + && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough) + { + /* + * Any active transactions that took their snapshot before this + * transaction committed are read-only, so we can clear part of + * its state. + */ + LWLockRelease(SerializableXactHashLock); + + if (SxactIsReadOnly(finishedSxact)) + { + /* A read-only transaction can be removed entirely */ + SHMQueueDelete(&(finishedSxact->finishedLink)); + ReleaseOneSerializableXact(finishedSxact, false, false); + } + else + { + /* + * A read-write transaction can only be partially cleared. We + * need to keep the SERIALIZABLEXACT but can release the + * SIREAD locks and conflicts in. + */ + ReleaseOneSerializableXact(finishedSxact, true, false); + } + + PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo; + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else + { + /* Still interesting. */ + break; + } + finishedSxact = nextSxact; + } + LWLockRelease(SerializableXactHashLock); + + /* + * Loop through predicate locks on dummy transaction for summarized data. + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + predlock = (PREDICATELOCK *) + SHMQueueNext(&OldCommittedSxact->predicateLocks, + &OldCommittedSxact->predicateLocks, + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + PREDICATELOCK *nextpredlock; + bool canDoPartialCleanup; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&OldCommittedSxact->predicateLocks, + &predlock->xactLink, + offsetof(PREDICATELOCK, xactLink)); + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + Assert(predlock->commitSeqNo != 0); + Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo); + canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough); + LWLockRelease(SerializableXactHashLock); + + /* + * If this lock originally belonged to an old enough transaction, we + * can release it. + */ + if (canDoPartialCleanup) + { + PREDICATELOCKTAG tag; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + + tag = predlock->tag; + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + SHMQueueDelete(&(predlock->targetLink)); + SHMQueueDelete(&(predlock->xactLink)); + + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + RemoveTargetIfNoLongerUsed(target, targettaghash); + + LWLockRelease(partitionLock); + } + + predlock = nextpredlock; + } + + LWLockRelease(SerializablePredicateListLock); + LWLockRelease(SerializableFinishedListLock); +} + +/* + * This is the normal way to delete anything from any of the predicate + * locking hash tables. Given a transaction which we know can be deleted: + * delete all predicate locks held by that transaction and any predicate + * lock targets which are now unreferenced by a lock; delete all conflicts + * for the transaction; delete all xid values for the transaction; then + * delete the transaction. + * + * When the partial flag is set, we can release all predicate locks and + * in-conflict information -- we've established that there are no longer + * any overlapping read write transactions for which this transaction could + * matter -- but keep the transaction entry itself and any outConflicts. + * + * When the summarize flag is set, we've run short of room for sxact data + * and must summarize to the SLRU. Predicate locks are transferred to a + * dummy "old" transaction, with duplicate locks on a single target + * collapsing to a single lock with the "latest" commitSeqNo from among + * the conflicting locks.. + */ +static void +ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, + bool summarize) +{ + PREDICATELOCK *predlock; + SERIALIZABLEXIDTAG sxidtag; + RWConflict conflict, + nextConflict; + + Assert(sxact != NULL); + Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact)); + Assert(partial || !SxactIsOnFinishedList(sxact)); + Assert(LWLockHeldByMe(SerializableFinishedListLock)); + + /* + * First release all the predicate locks held by this xact (or transfer + * them to OldCommittedSxact if summarize is true) + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + while (predlock) + { + PREDICATELOCK *nextpredlock; + PREDICATELOCKTAG tag; + SHM_QUEUE *targetLink; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(predlock->xactLink), + offsetof(PREDICATELOCK, xactLink)); + + tag = predlock->tag; + targetLink = &(predlock->targetLink); + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + SHMQueueDelete(targetLink); + + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + if (summarize) + { + bool found; + + /* Fold into dummy transaction list. */ + tag.myXact = OldCommittedSxact; + predlock = hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_ENTER_NULL, &found); + if (!predlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase max_pred_locks_per_transaction."))); + if (found) + { + Assert(predlock->commitSeqNo != 0); + Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo); + if (predlock->commitSeqNo < sxact->commitSeqNo) + predlock->commitSeqNo = sxact->commitSeqNo; + } + else + { + SHMQueueInsertBefore(&(target->predicateLocks), + &(predlock->targetLink)); + SHMQueueInsertBefore(&(OldCommittedSxact->predicateLocks), + &(predlock->xactLink)); + predlock->commitSeqNo = sxact->commitSeqNo; + } + } + else + RemoveTargetIfNoLongerUsed(target, targettaghash); + + LWLockRelease(partitionLock); + + predlock = nextpredlock; + } + + /* + * Rather than retail removal, just re-init the head after we've run + * through the list. + */ + SHMQueueInit(&sxact->predicateLocks); + + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); + + sxidtag.xid = sxact->topXid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* Release all outConflicts (unless 'partial' is true) */ + if (!partial) + { + conflict = (RWConflict) + SHMQueueNext(&sxact->outConflicts, + &sxact->outConflicts, + offsetof(RWConflictData, outLink)); + while (conflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&sxact->outConflicts, + &conflict->outLink, + offsetof(RWConflictData, outLink)); + if (summarize) + conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + ReleaseRWConflict(conflict); + conflict = nextConflict; + } + } + + /* Release all inConflicts. */ + conflict = (RWConflict) + SHMQueueNext(&sxact->inConflicts, + &sxact->inConflicts, + offsetof(RWConflictData, inLink)); + while (conflict) + { + nextConflict = (RWConflict) + SHMQueueNext(&sxact->inConflicts, + &conflict->inLink, + offsetof(RWConflictData, inLink)); + if (summarize) + conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + ReleaseRWConflict(conflict); + conflict = nextConflict; + } + + /* Finally, get rid of the xid and the record of the transaction itself. */ + if (!partial) + { + if (sxidtag.xid != InvalidTransactionId) + hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL); + ReleasePredXact(sxact); + } + + LWLockRelease(SerializableXactHashLock); +} + +/* + * Tests whether the given top level transaction is concurrent with + * (overlaps) our current transaction. + * + * We need to identify the top level transaction for SSI, anyway, so pass + * that to this function to save the overhead of checking the snapshot's + * subxip array. + */ +static bool +XidIsConcurrent(TransactionId xid) +{ + Snapshot snap; + uint32 i; + + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + snap = GetTransactionSnapshot(); + + if (TransactionIdPrecedes(xid, snap->xmin)) + return false; + + if (TransactionIdFollowsOrEquals(xid, snap->xmax)) + return true; + + for (i = 0; i < snap->xcnt; i++) + { + if (xid == snap->xip[i]) + return true; + } + + return false; +} + +bool +CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot) +{ + if (!SerializationNeededForRead(relation, snapshot)) + return false; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."), + errhint("The transaction might succeed if retried."))); + } + + return true; +} + +/* + * CheckForSerializableConflictOut + * A table AM is reading a tuple that has been modified. If it determines + * that the tuple version it is reading is not visible to us, it should + * pass in the top level xid of the transaction that created it. + * Otherwise, if it determines that it is visible to us but it has been + * deleted or there is a newer version available due to an update, it + * should pass in the top level xid of the modifying transaction. + * + * This function will check for overlap with our own transaction. If the given + * xid is also serializable and the transactions overlap (i.e., they cannot see + * each other's writes), then we have a conflict out. + */ +void +CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."), + errhint("The transaction might succeed if retried."))); + } + Assert(TransactionIdIsValid(xid)); + + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return; + + /* + * Find sxact or summarized info for the top level xid. + */ + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + if (!sxid) + { + /* + * Transaction not found in "normal" SSI structures. Check whether it + * got pushed out to SLRU storage for "old committed" transactions. + */ + SerCommitSeqNo conflictCommitSeqNo; + + conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid); + if (conflictCommitSeqNo != 0) + { + if (conflictCommitSeqNo != InvalidSerCommitSeqNo + && (!SxactIsReadOnly(MySerializableXact) + || conflictCommitSeqNo + <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid), + errhint("The transaction might succeed if retried."))); + + if (SxactHasSummaryConflictIn(MySerializableXact) + || !SHMQueueEmpty(&MySerializableXact->inConflicts)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid), + errhint("The transaction might succeed if retried."))); + + MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + } + + /* It's not serializable or otherwise not important. */ + LWLockRelease(SerializableXactHashLock); + return; + } + sxact = sxid->myXact; + Assert(TransactionIdEquals(sxact->topXid, xid)); + if (sxact == MySerializableXact || SxactIsDoomed(sxact)) + { + /* Can't conflict with ourself or a transaction that will roll back. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * We have a conflict out to a transaction which has a conflict out to a + * summarized transaction. That summarized transaction must have + * committed first, and we can't tell when it committed in relation to our + * snapshot acquisition, so something needs to be canceled. + */ + if (SxactHasSummaryConflictOut(sxact)) + { + if (!SxactIsPrepared(sxact)) + { + sxact->flags |= SXACT_FLAG_DOOMED; + LWLockRelease(SerializableXactHashLock); + return; + } + else + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to old pivot."), + errhint("The transaction might succeed if retried."))); + } + } + + /* + * If this is a read-only transaction and the writing transaction has + * committed, and it doesn't have a rw-conflict to a transaction which + * committed before it, no conflict. + */ + if (SxactIsReadOnly(MySerializableXact) + && SxactIsCommitted(sxact) + && !SxactHasSummaryConflictOut(sxact) + && (!SxactHasConflictOut(sxact) + || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit)) + { + /* Read-only transaction will appear to run first. No conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + if (!XidIsConcurrent(xid)) + { + /* This write was already in our snapshot; no conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + if (RWConflictExists(MySerializableXact, sxact)) + { + /* We don't want duplicate conflict records in the list. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * Flag the conflict. But first, if this conflict creates a dangerous + * structure, ereport an error. + */ + FlagRWConflict(MySerializableXact, sxact); + LWLockRelease(SerializableXactHashLock); +} + +/* + * Check a particular target for rw-dependency conflict in. A subroutine of + * CheckForSerializableConflictIn(). + */ +static void +CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) +{ + uint32 targettaghash; + LWLock *partitionLock; + PREDICATELOCKTARGET *target; + PREDICATELOCK *predlock; + PREDICATELOCK *mypredlock = NULL; + PREDICATELOCKTAG mypredlocktag; + + Assert(MySerializableXact != InvalidSerializableXact); + + /* + * The same hash and LW lock apply to the lock target and the lock itself. + */ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_FIND, NULL); + if (!target) + { + /* Nothing has this target locked; we're done here. */ + LWLockRelease(partitionLock); + return; + } + + /* + * Each lock for an overlapping transaction represents a conflict: a + * rw-dependency in to this transaction. + */ + predlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + while (predlock) + { + SHM_QUEUE *predlocktargetlink; + PREDICATELOCK *nextpredlock; + SERIALIZABLEXACT *sxact; + + predlocktargetlink = &(predlock->targetLink); + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + predlocktargetlink, + offsetof(PREDICATELOCK, targetLink)); + + sxact = predlock->tag.myXact; + if (sxact == MySerializableXact) + { + /* + * If we're getting a write lock on a tuple, we don't need a + * predicate (SIREAD) lock on the same tuple. We can safely remove + * our SIREAD lock, but we'll defer doing so until after the loop + * because that requires upgrading to an exclusive partition lock. + * + * We can't use this optimization within a subtransaction because + * the subtransaction could roll back, and we would be left + * without any lock at the top level. + */ + if (!IsSubTransaction() + && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag)) + { + mypredlock = predlock; + mypredlocktag = predlock->tag; + } + } + else if (!SxactIsDoomed(sxact) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && !RWConflictExists(sxact, MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Re-check after getting exclusive lock because the other + * transaction may have flagged a conflict. + */ + if (!SxactIsDoomed(sxact) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && !RWConflictExists(sxact, MySerializableXact)) + { + FlagRWConflict(sxact, MySerializableXact); + } + + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + + predlock = nextpredlock; + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + + /* + * If we found one of our own SIREAD locks to remove, remove it now. + * + * At this point our transaction already has a RowExclusiveLock on the + * relation, so we are OK to drop the predicate lock on the tuple, if + * found, without fearing that another write against the tuple will occur + * before the MVCC information makes it to the buffer. + */ + if (mypredlock != NULL) + { + uint32 predlockhashcode; + PREDICATELOCK *rmpredlock; + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the predicate lock from shared memory, if it wasn't removed + * while the locks were released. One way that could happen is from + * autovacuum cleaning up an index. + */ + predlockhashcode = PredicateLockHashCodeFromTargetHashCode + (&mypredlocktag, targettaghash); + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &mypredlocktag, + predlockhashcode, + HASH_FIND, NULL); + if (rmpredlock != NULL) + { + Assert(rmpredlock == mypredlock); + + SHMQueueDelete(&(mypredlock->targetLink)); + SHMQueueDelete(&(mypredlock->xactLink)); + + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &mypredlocktag, + predlockhashcode, + HASH_REMOVE, NULL); + Assert(rmpredlock == mypredlock); + + RemoveTargetIfNoLongerUsed(target, targettaghash); + } + + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + if (IsInParallelMode()) + LWLockRelease(&MySerializableXact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); + + if (rmpredlock != NULL) + { + /* + * Remove entry in local lock table if it exists. It's OK if it + * doesn't exist; that means the lock was transferred to a new + * target by a different backend. + */ + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_REMOVE, NULL); + + DecrementParentLocks(targettag); + } + } +} + +/* + * CheckForSerializableConflictIn + * We are writing the given tuple. If that indicates a rw-conflict + * in from another serializable transaction, take appropriate action. + * + * Skip checking for any granularity for which a parameter is missing. + * + * A tuple update or delete is in conflict if we have a predicate lock + * against the relation or page in which the tuple exists, or against the + * tuple itself. + */ +void +CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno) +{ + PREDICATELOCKTARGETTAG targettag; + + if (!SerializationNeededForWrite(relation)) + return; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."), + errhint("The transaction might succeed if retried."))); + + /* + * We're doing a write which might cause rw-conflicts now or later. + * Memorize that fact. + */ + MyXactDidWrite = true; + + /* + * It is important that we check for locks from the finest granularity to + * the coarsest granularity, so that granularity promotion doesn't cause + * us to miss a lock. The new (coarser) lock will be acquired before the + * old (finer) locks are released. + * + * It is not possible to take and hold a lock across the checks for all + * granularities because each target could be in a separate partition. + */ + if (tid != NULL) + { + SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + CheckTargetForConflictsIn(&targettag); + } + + if (blkno != InvalidBlockNumber) + { + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_node.dbNode, + relation->rd_id, + blkno); + CheckTargetForConflictsIn(&targettag); + } + + SET_PREDICATELOCKTARGETTAG_RELATION(targettag, + relation->rd_node.dbNode, + relation->rd_id); + CheckTargetForConflictsIn(&targettag); +} + +/* + * CheckTableForSerializableConflictIn + * The entire table is going through a DDL-style logical mass delete + * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from + * another serializable transaction, take appropriate action. + * + * While these operations do not operate entirely within the bounds of + * snapshot isolation, they can occur inside a serializable transaction, and + * will logically occur after any reads which saw rows which were destroyed + * by these operations, so we do what we can to serialize properly under + * SSI. + * + * The relation passed in must be a heap relation. Any predicate lock of any + * granularity on the heap will cause a rw-conflict in to this transaction. + * Predicate locks on indexes do not matter because they only exist to guard + * against conflicting inserts into the index, and this is a mass *delete*. + * When a table is truncated or dropped, the index will also be truncated + * or dropped, and we'll deal with locks on the index when that happens. + * + * Dropping or truncating a table also needs to drop any existing predicate + * locks on heap tuples or pages, because they're about to go away. This + * should be done before altering the predicate locks because the transaction + * could be rolled back because of a conflict, in which case the lock changes + * are not needed. (At the moment, we don't actually bother to drop the + * existing locks on a dropped or truncated table at the moment. That might + * lead to some false positives, but it doesn't seem worth the trouble.) + */ +void +CheckTableForSerializableConflictIn(Relation relation) +{ + HASH_SEQ_STATUS seqstat; + PREDICATELOCKTARGET *target; + Oid dbId; + Oid heapId; + int i; + + /* + * Bail out quickly if there are no serializable transactions running. + * It's safe to check this without taking locks because the caller is + * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which + * would matter here can be acquired while that is held. + */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!SerializationNeededForWrite(relation)) + return; + + /* + * We're doing a write which might cause rw-conflicts now or later. + * Memorize that fact. + */ + MyXactDidWrite = true; + + Assert(relation->rd_index == NULL); /* not an index relation */ + + dbId = relation->rd_node.dbNode; + heapId = relation->rd_id; + + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* Scan through target list */ + hash_seq_init(&seqstat, PredicateLockTargetHash); + + while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat))) + { + PREDICATELOCK *predlock; + + /* + * Check whether this is a target which needs attention. + */ + if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId) + continue; /* wrong relation id */ + if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId) + continue; /* wrong database id */ + + /* + * Loop through locks for this target and flag conflicts. + */ + predlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(target->predicateLocks), + offsetof(PREDICATELOCK, targetLink)); + while (predlock) + { + PREDICATELOCK *nextpredlock; + + nextpredlock = (PREDICATELOCK *) + SHMQueueNext(&(target->predicateLocks), + &(predlock->targetLink), + offsetof(PREDICATELOCK, targetLink)); + + if (predlock->tag.myXact != MySerializableXact + && !RWConflictExists(predlock->tag.myXact, MySerializableXact)) + { + FlagRWConflict(predlock->tag.myXact, MySerializableXact); + } + + predlock = nextpredlock; + } + } + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + LWLockRelease(SerializablePredicateListLock); +} + + +/* + * Flag a rw-dependency between two serializable transactions. + * + * The caller is responsible for ensuring that we have a LW lock on + * the transaction hash table. + */ +static void +FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) +{ + Assert(reader != writer); + + /* First, see if this conflict causes failure. */ + OnConflict_CheckForSerializationFailure(reader, writer); + + /* Actually do the conflict flagging. */ + if (reader == OldCommittedSxact) + writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + else if (writer == OldCommittedSxact) + reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + else + SetRWConflict(reader, writer); +} + +/*---------------------------------------------------------------------------- + * We are about to add a RW-edge to the dependency graph - check that we don't + * introduce a dangerous structure by doing so, and abort one of the + * transactions if so. + * + * A serialization failure can only occur if there is a dangerous structure + * in the dependency graph: + * + * Tin ------> Tpivot ------> Tout + * rw rw + * + * Furthermore, Tout must commit first. + * + * One more optimization is that if Tin is declared READ ONLY (or commits + * without writing), we can only have a problem if Tout committed before Tin + * acquired its snapshot. + *---------------------------------------------------------------------------- + */ +static void +OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + SERIALIZABLEXACT *writer) +{ + bool failure; + RWConflict conflict; + + Assert(LWLockHeldByMe(SerializableXactHashLock)); + + failure = false; + + /*------------------------------------------------------------------------ + * Check for already-committed writer with rw-conflict out flagged + * (conflict-flag on W means that T2 committed before W): + * + * R ------> W ------> T2 + * rw rw + * + * That is a dangerous structure, so we must abort. (Since the writer + * has already committed, we must be the reader) + *------------------------------------------------------------------------ + */ + if (SxactIsCommitted(writer) + && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer))) + failure = true; + + /*------------------------------------------------------------------------ + * Check whether the writer has become a pivot with an out-conflict + * committed transaction (T2), and T2 committed first: + * + * R ------> W ------> T2 + * rw rw + * + * Because T2 must've committed first, there is no anomaly if: + * - the reader committed before T2 + * - the writer committed before T2 + * - the reader is a READ ONLY transaction and the reader was concurrent + * with T2 (= reader acquired its snapshot before T2 committed) + * + * We also handle the case that T2 is prepared but not yet committed + * here. In that case T2 has already checked for conflicts, so if it + * commits first, making the above conflict real, it's too late for it + * to abort. + *------------------------------------------------------------------------ + */ + if (!failure) + { + if (SxactHasSummaryConflictOut(writer)) + { + failure = true; + conflict = NULL; + } + else + conflict = (RWConflict) + SHMQueueNext(&writer->outConflicts, + &writer->outConflicts, + offsetof(RWConflictData, outLink)); + while (conflict) + { + SERIALIZABLEXACT *t2 = conflict->sxactIn; + + if (SxactIsPrepared(t2) + && (!SxactIsCommitted(reader) + || t2->prepareSeqNo <= reader->commitSeqNo) + && (!SxactIsCommitted(writer) + || t2->prepareSeqNo <= writer->commitSeqNo) + && (!SxactIsReadOnly(reader) + || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)) + { + failure = true; + break; + } + conflict = (RWConflict) + SHMQueueNext(&writer->outConflicts, + &conflict->outLink, + offsetof(RWConflictData, outLink)); + } + } + + /*------------------------------------------------------------------------ + * Check whether the reader has become a pivot with a writer + * that's committed (or prepared): + * + * T0 ------> R ------> W + * rw rw + * + * Because W must've committed first for an anomaly to occur, there is no + * anomaly if: + * - T0 committed before the writer + * - T0 is READ ONLY, and overlaps the writer + *------------------------------------------------------------------------ + */ + if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader)) + { + if (SxactHasSummaryConflictIn(reader)) + { + failure = true; + conflict = NULL; + } + else + conflict = (RWConflict) + SHMQueueNext(&reader->inConflicts, + &reader->inConflicts, + offsetof(RWConflictData, inLink)); + while (conflict) + { + SERIALIZABLEXACT *t0 = conflict->sxactOut; + + if (!SxactIsDoomed(t0) + && (!SxactIsCommitted(t0) + || t0->commitSeqNo >= writer->prepareSeqNo) + && (!SxactIsReadOnly(t0) + || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo)) + { + failure = true; + break; + } + conflict = (RWConflict) + SHMQueueNext(&reader->inConflicts, + &conflict->inLink, + offsetof(RWConflictData, inLink)); + } + } + + if (failure) + { + /* + * We have to kill a transaction to avoid a possible anomaly from + * occurring. If the writer is us, we can just ereport() to cause a + * transaction abort. Otherwise we flag the writer for termination, + * causing it to abort when it tries to commit. However, if the writer + * is a prepared transaction, already prepared, we can't abort it + * anymore, so we have to kill the reader instead. + */ + if (MySerializableXact == writer) + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during write."), + errhint("The transaction might succeed if retried."))); + } + else if (SxactIsPrepared(writer)) + { + LWLockRelease(SerializableXactHashLock); + + /* if we're not the writer, we have to be the reader */ + Assert(MySerializableXact == reader); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid), + errhint("The transaction might succeed if retried."))); + } + writer->flags |= SXACT_FLAG_DOOMED; + } +} + +/* + * PreCommit_CheckForSerializationFailure + * Check for dangerous structures in a serializable transaction + * at commit. + * + * We're checking for a dangerous structure as each conflict is recorded. + * The only way we could have a problem at commit is if this is the "out" + * side of a pivot, and neither the "in" side nor the pivot has yet + * committed. + * + * If a dangerous structure is found, the pivot (the near conflict) is + * marked for death, because rolling back another transaction might mean + * that we fail without ever making progress. This transaction is + * committing writes, so letting it commit ensures progress. If we + * canceled the far conflict, it might immediately fail again on retry. + */ +void +PreCommit_CheckForSerializationFailure(void) +{ + RWConflict nearConflict; + + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(IsolationIsSerializable()); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + { + Assert(!SxactIsPartiallyReleased(MySerializableXact)); + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."), + errhint("The transaction might succeed if retried."))); + } + + nearConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->inConflicts, + &MySerializableXact->inConflicts, + offsetof(RWConflictData, inLink)); + while (nearConflict) + { + if (!SxactIsCommitted(nearConflict->sxactOut) + && !SxactIsDoomed(nearConflict->sxactOut)) + { + RWConflict farConflict; + + farConflict = (RWConflict) + SHMQueueNext(&nearConflict->sxactOut->inConflicts, + &nearConflict->sxactOut->inConflicts, + offsetof(RWConflictData, inLink)); + while (farConflict) + { + if (farConflict->sxactOut == MySerializableXact + || (!SxactIsCommitted(farConflict->sxactOut) + && !SxactIsReadOnly(farConflict->sxactOut) + && !SxactIsDoomed(farConflict->sxactOut))) + { + /* + * Normally, we kill the pivot transaction to make sure we + * make progress if the failing transaction is retried. + * However, we can't kill it if it's already prepared, so + * in that case we commit suicide instead. + */ + if (SxactIsPrepared(nearConflict->sxactOut)) + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."), + errhint("The transaction might succeed if retried."))); + } + nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED; + break; + } + farConflict = (RWConflict) + SHMQueueNext(&nearConflict->sxactOut->inConflicts, + &farConflict->inLink, + offsetof(RWConflictData, inLink)); + } + } + + nearConflict = (RWConflict) + SHMQueueNext(&MySerializableXact->inConflicts, + &nearConflict->inLink, + offsetof(RWConflictData, inLink)); + } + + MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo); + MySerializableXact->flags |= SXACT_FLAG_PREPARED; + + LWLockRelease(SerializableXactHashLock); +} + +/*------------------------------------------------------------------------*/ + +/* + * Two-phase commit support + */ + +/* + * AtPrepare_Locks + * Do the preparatory work for a PREPARE: make 2PC state file + * records for all predicate locks currently held. + */ +void +AtPrepare_PredicateLocks(void) +{ + PREDICATELOCK *predlock; + SERIALIZABLEXACT *sxact; + TwoPhasePredicateRecord record; + TwoPhasePredicateXactRecord *xactRecord; + TwoPhasePredicateLockRecord *lockRecord; + + sxact = MySerializableXact; + xactRecord = &(record.data.xactRecord); + lockRecord = &(record.data.lockRecord); + + if (MySerializableXact == InvalidSerializableXact) + return; + + /* Generate an xact record for our SERIALIZABLEXACT */ + record.type = TWOPHASEPREDICATERECORD_XACT; + xactRecord->xmin = MySerializableXact->xmin; + xactRecord->flags = MySerializableXact->flags; + + /* + * Note that we don't include the list of conflicts in our out in the + * statefile, because new conflicts can be added even after the + * transaction prepares. We'll just make a conservative assumption during + * recovery instead. + */ + + RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0, + &record, sizeof(record)); + + /* + * Generate a lock record for each lock. + * + * To do this, we need to walk the predicate lock list in our sxact rather + * than using the local predicate lock table because the latter is not + * guaranteed to be accurate. + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + + /* + * No need to take sxact->perXactPredicateListLock in parallel mode + * because there cannot be any parallel workers running while we are + * preparing a transaction. + */ + Assert(!IsParallelWorker() && !ParallelContextActive()); + + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(sxact->predicateLocks), + offsetof(PREDICATELOCK, xactLink)); + + while (predlock != NULL) + { + record.type = TWOPHASEPREDICATERECORD_LOCK; + lockRecord->target = predlock->tag.myTarget->tag; + + RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0, + &record, sizeof(record)); + + predlock = (PREDICATELOCK *) + SHMQueueNext(&(sxact->predicateLocks), + &(predlock->xactLink), + offsetof(PREDICATELOCK, xactLink)); + } + + LWLockRelease(SerializablePredicateListLock); +} + +/* + * PostPrepare_Locks + * Clean up after successful PREPARE. Unlike the non-predicate + * lock manager, we do not need to transfer locks to a dummy + * PGPROC because our SERIALIZABLEXACT will stay around + * anyway. We only need to clean up our local state. + */ +void +PostPrepare_PredicateLocks(TransactionId xid) +{ + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(SxactIsPrepared(MySerializableXact)); + + MySerializableXact->pid = 0; + + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + + MySerializableXact = InvalidSerializableXact; + MyXactDidWrite = false; +} + +/* + * PredicateLockTwoPhaseFinish + * Release a prepared transaction's predicate locks once it + * commits or aborts. + */ +void +PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) +{ + SERIALIZABLEXID *sxid; + SERIALIZABLEXIDTAG sxidtag; + + sxidtag.xid = xid; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + LWLockRelease(SerializableXactHashLock); + + /* xid will not be found if it wasn't a serializable transaction */ + if (sxid == NULL) + return; + + /* Release its locks */ + MySerializableXact = sxid->myXact; + MyXactDidWrite = true; /* conservatively assume that we wrote + * something */ + ReleasePredicateLocks(isCommit, false); +} + +/* + * Re-acquire a predicate lock belonging to a transaction that was prepared. + */ +void +predicatelock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhasePredicateRecord *record; + + Assert(len == sizeof(TwoPhasePredicateRecord)); + + record = (TwoPhasePredicateRecord *) recdata; + + Assert((record->type == TWOPHASEPREDICATERECORD_XACT) || + (record->type == TWOPHASEPREDICATERECORD_LOCK)); + + if (record->type == TWOPHASEPREDICATERECORD_XACT) + { + /* Per-transaction record. Set up a SERIALIZABLEXACT. */ + TwoPhasePredicateXactRecord *xactRecord; + SERIALIZABLEXACT *sxact; + SERIALIZABLEXID *sxid; + SERIALIZABLEXIDTAG sxidtag; + bool found; + + xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord; + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxact = CreatePredXact(); + if (!sxact) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + + /* vxid for a prepared xact is InvalidBackendId/xid; no pid */ + sxact->vxid.backendId = InvalidBackendId; + sxact->vxid.localTransactionId = (LocalTransactionId) xid; + sxact->pid = 0; + + /* a prepared xact hasn't committed yet */ + sxact->prepareSeqNo = RecoverySerCommitSeqNo; + sxact->commitSeqNo = InvalidSerCommitSeqNo; + sxact->finishedBefore = InvalidTransactionId; + + sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo; + + /* + * Don't need to track this; no transactions running at the time the + * recovered xact started are still active, except possibly other + * prepared xacts and we don't care whether those are RO_SAFE or not. + */ + SHMQueueInit(&(sxact->possibleUnsafeConflicts)); + + SHMQueueInit(&(sxact->predicateLocks)); + SHMQueueElemInit(&(sxact->finishedLink)); + + sxact->topXid = xid; + sxact->xmin = xactRecord->xmin; + sxact->flags = xactRecord->flags; + Assert(SxactIsPrepared(sxact)); + if (!SxactIsReadOnly(sxact)) + { + ++(PredXact->WritableSxactCount); + Assert(PredXact->WritableSxactCount <= + (MaxBackends + max_prepared_xacts)); + } + + /* + * We don't know whether the transaction had any conflicts or not, so + * we'll conservatively assume that it had both a conflict in and a + * conflict out, and represent that with the summary conflict flags. + */ + SHMQueueInit(&(sxact->outConflicts)); + SHMQueueInit(&(sxact->inConflicts)); + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + + /* Register the transaction's xid */ + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + Assert(sxid != NULL); + Assert(!found); + sxid->myXact = (SERIALIZABLEXACT *) sxact; + + /* + * Update global xmin. Note that this is a special case compared to + * registering a normal transaction, because the global xmin might go + * backwards. That's OK, because until recovery is over we're not + * going to complete any transactions or create any non-prepared + * transactions, so there's no danger of throwing away. + */ + if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) || + (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin))) + { + PredXact->SxactGlobalXmin = sxact->xmin; + PredXact->SxactGlobalXminCount = 1; + SerialSetActiveSerXmin(sxact->xmin); + } + else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + PredXact->SxactGlobalXminCount++; + } + + LWLockRelease(SerializableXactHashLock); + } + else if (record->type == TWOPHASEPREDICATERECORD_LOCK) + { + /* Lock record. Recreate the PREDICATELOCK */ + TwoPhasePredicateLockRecord *lockRecord; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + SERIALIZABLEXIDTAG sxidtag; + uint32 targettaghash; + + lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord; + targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target); + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + LWLockRelease(SerializableXactHashLock); + + Assert(sxid != NULL); + sxact = sxid->myXact; + Assert(sxact != InvalidSerializableXact); + + CreatePredicateLock(&lockRecord->target, targettaghash, sxact); + } +} + +/* + * Prepare to share the current SERIALIZABLEXACT with parallel workers. + * Return a handle object that can be used by AttachSerializableXact() in a + * parallel worker. + */ +SerializableXactHandle +ShareSerializableXact(void) +{ + return MySerializableXact; +} + +/* + * Allow parallel workers to import the leader's SERIALIZABLEXACT. + */ +void +AttachSerializableXact(SerializableXactHandle handle) +{ + + Assert(MySerializableXact == InvalidSerializableXact); + + MySerializableXact = (SERIALIZABLEXACT *) handle; + if (MySerializableXact != InvalidSerializableXact) + CreateLocalPredicateLockHash(); +} diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c new file mode 100644 index 0000000..c50a419 --- /dev/null +++ b/src/backend/storage/lmgr/proc.c @@ -0,0 +1,2012 @@ +/*------------------------------------------------------------------------- + * + * proc.c + * routines to manage per-process shared memory data structure + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/proc.c + * + *------------------------------------------------------------------------- + */ +/* + * Interface (a): + * ProcSleep(), ProcWakeup(), + * ProcQueueAlloc() -- create a shm queue for sleeping processes + * ProcQueueInit() -- create a queue without allocing memory + * + * Waiting for a lock causes the backend to be put to sleep. Whoever releases + * the lock wakes the process up again (and gives it an error code so it knows + * whether it was awoken on an error condition). + * + * Interface (b): + * + * ProcReleaseLocks -- frees the locks associated with current transaction + * + * ProcKill -- destroys the shared memory state (and locks) + * associated with the process. + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/time.h> + +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "replication/slot.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* GUC variables */ +int DeadlockTimeout = 1000; +int StatementTimeout = 0; +int LockTimeout = 0; +int IdleInTransactionSessionTimeout = 0; +int IdleSessionTimeout = 0; +bool log_lock_waits = false; + +/* Pointer to this process's PGPROC struct, if any */ +PGPROC *MyProc = NULL; + +/* + * This spinlock protects the freelist of recycled PGPROC structures. + * We cannot use an LWLock because the LWLock manager depends on already + * having a PGPROC and a wait semaphore! But these structures are touched + * relatively infrequently (only at backend startup or shutdown) and not for + * very long, so a spinlock is okay. + */ +NON_EXEC_STATIC slock_t *ProcStructLock = NULL; + +/* Pointers to shared-memory structures */ +PROC_HDR *ProcGlobal = NULL; +NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; +PGPROC *PreparedXactProcs = NULL; + +/* If we are waiting for a lock, this points to the associated LOCALLOCK */ +static LOCALLOCK *lockAwaited = NULL; + +static DeadLockState deadlock_state = DS_NOT_YET_CHECKED; + +/* Is a deadlock check pending? */ +static volatile sig_atomic_t got_deadlock_timeout; + +static void RemoveProcFromArray(int code, Datum arg); +static void ProcKill(int code, Datum arg); +static void AuxiliaryProcKill(int code, Datum arg); +static void CheckDeadLock(void); + + +/* + * Report shared-memory space needed by InitProcGlobal. + */ +Size +ProcGlobalShmemSize(void) +{ + Size size = 0; + Size TotalProcs = + add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); + + /* ProcGlobal */ + size = add_size(size, sizeof(PROC_HDR)); + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); + size = add_size(size, sizeof(slock_t)); + + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags))); + + return size; +} + +/* + * Report number of semaphores needed by InitProcGlobal. + */ +int +ProcGlobalSemas(void) +{ + /* + * We need a sema per backend (including autovacuum), plus one for each + * auxiliary process. + */ + return MaxBackends + NUM_AUXILIARY_PROCS; +} + +/* + * InitProcGlobal - + * Initialize the global process table during postmaster or standalone + * backend startup. + * + * We also create all the per-process semaphores we will need to support + * the requested number of backends. We used to allocate semaphores + * only when backends were actually started up, but that is bad because + * it lets Postgres fail under load --- a lot of Unix systems are + * (mis)configured with small limits on the number of semaphores, and + * running out when trying to start another backend is a common failure. + * So, now we grab enough semaphores to support the desired max number + * of backends immediately at initialization --- if the sysadmin has set + * MaxConnections, max_worker_processes, max_wal_senders, or + * autovacuum_max_workers higher than his kernel will support, he'll + * find out sooner rather than later. + * + * Another reason for creating semaphores here is that the semaphore + * implementation typically requires us to create semaphores in the + * postmaster, not in backends. + * + * Note: this is NOT called by individual backends under a postmaster, + * not even in the EXEC_BACKEND case. The ProcGlobal and AuxiliaryProcs + * pointers must be propagated specially for EXEC_BACKEND operation. + */ +void +InitProcGlobal(void) +{ + PGPROC *procs; + int i, + j; + bool found; + uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts; + + /* Create the ProcGlobal shared structure */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found); + Assert(!found); + + /* + * Initialize the data structures. + */ + ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY; + ProcGlobal->freeProcs = NULL; + ProcGlobal->autovacFreeProcs = NULL; + ProcGlobal->bgworkerFreeProcs = NULL; + ProcGlobal->walsenderFreeProcs = NULL; + ProcGlobal->startupProc = NULL; + ProcGlobal->startupProcPid = 0; + ProcGlobal->startupBufferPinWaitBufId = -1; + ProcGlobal->walwriterLatch = NULL; + ProcGlobal->checkpointerLatch = NULL; + pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO); + pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO); + + /* + * Create and initialize all the PGPROC structures we'll need. There are + * five separate consumers: (1) normal backends, (2) autovacuum workers + * and the autovacuum launcher, (3) background workers, (4) auxiliary + * processes, and (5) prepared transactions. Each PGPROC structure is + * dedicated to exactly one of these purposes, and they do not move + * between groups. + */ + procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC)); + MemSet(procs, 0, TotalProcs * sizeof(PGPROC)); + ProcGlobal->allProcs = procs; + /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ + ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; + + /* + * Allocate arrays mirroring PGPROC fields in a dense manner. See + * PROC_HDR. + * + * XXX: It might make sense to increase padding for these arrays, given + * how hotly they are accessed. + */ + ProcGlobal->xids = + (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); + MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates)); + MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates)); + ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags)); + MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags)); + + for (i = 0; i < TotalProcs; i++) + { + /* Common initialization for all PGPROCs, regardless of type. */ + + /* + * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact + * dummy PGPROCs don't need these though - they're never associated + * with a real process + */ + if (i < MaxBackends + NUM_AUXILIARY_PROCS) + { + procs[i].sem = PGSemaphoreCreate(); + InitSharedLatch(&(procs[i].procLatch)); + LWLockInitialize(&(procs[i].fpInfoLock), LWTRANCHE_LOCK_FASTPATH); + } + procs[i].pgprocno = i; + + /* + * Newly created PGPROCs for normal backends, autovacuum and bgworkers + * must be queued up on the appropriate free list. Because there can + * only ever be a small, fixed number of auxiliary processes, no free + * list is used in that case; InitAuxiliaryProcess() instead uses a + * linear search. PGPROCs for prepared transactions are added to a + * free list by TwoPhaseShmemInit(). + */ + if (i < MaxConnections) + { + /* PGPROC for normal backend, add to freeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->freeProcs; + ProcGlobal->freeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->freeProcs; + } + else if (i < MaxConnections + autovacuum_max_workers + 1) + { + /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->autovacFreeProcs; + ProcGlobal->autovacFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->autovacFreeProcs; + } + else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes) + { + /* PGPROC for bgworker, add to bgworkerFreeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->bgworkerFreeProcs; + ProcGlobal->bgworkerFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->bgworkerFreeProcs; + } + else if (i < MaxBackends) + { + /* PGPROC for walsender, add to walsenderFreeProcs list */ + procs[i].links.next = (SHM_QUEUE *) ProcGlobal->walsenderFreeProcs; + ProcGlobal->walsenderFreeProcs = &procs[i]; + procs[i].procgloballist = &ProcGlobal->walsenderFreeProcs; + } + + /* Initialize myProcLocks[] shared memory queues. */ + for (j = 0; j < NUM_LOCK_PARTITIONS; j++) + SHMQueueInit(&(procs[i].myProcLocks[j])); + + /* Initialize lockGroupMembers list. */ + dlist_init(&procs[i].lockGroupMembers); + + /* + * Initialize the atomic variables, otherwise, it won't be safe to + * access them for backends that aren't currently in use. + */ + pg_atomic_init_u32(&(procs[i].procArrayGroupNext), INVALID_PGPROCNO); + pg_atomic_init_u32(&(procs[i].clogGroupNext), INVALID_PGPROCNO); + pg_atomic_init_u64(&(procs[i].waitStart), 0); + } + + /* + * Save pointers to the blocks of PGPROC structures reserved for auxiliary + * processes and prepared transactions. + */ + AuxiliaryProcs = &procs[MaxBackends]; + PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS]; + + /* Create ProcStructLock spinlock, too */ + ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t)); + SpinLockInit(ProcStructLock); +} + +/* + * InitProcess -- initialize a per-process data structure for this backend + */ +void +InitProcess(void) +{ + PGPROC *volatile *procgloballist; + + /* + * ProcGlobal should be set up already (if we are a backend, we inherit + * this by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + if (ProcGlobal == NULL) + elog(PANIC, "proc header uninitialized"); + + if (MyProc != NULL) + elog(ERROR, "you already exist"); + + /* Decide which list should supply our PGPROC. */ + if (IsAnyAutoVacuumProcess()) + procgloballist = &ProcGlobal->autovacFreeProcs; + else if (IsBackgroundWorker) + procgloballist = &ProcGlobal->bgworkerFreeProcs; + else if (am_walsender) + procgloballist = &ProcGlobal->walsenderFreeProcs; + else + procgloballist = &ProcGlobal->freeProcs; + + /* + * Try to get a proc struct from the appropriate free list. If this + * fails, we must be out of PGPROC structures (not to mention semaphores). + * + * While we are holding the ProcStructLock, also copy the current shared + * estimate of spins_per_delay to local storage. + */ + SpinLockAcquire(ProcStructLock); + + set_spins_per_delay(ProcGlobal->spins_per_delay); + + MyProc = *procgloballist; + + if (MyProc != NULL) + { + *procgloballist = (PGPROC *) MyProc->links.next; + SpinLockRelease(ProcStructLock); + } + else + { + /* + * If we reach here, all the PGPROCs are in use. This is one of the + * possible places to detect "too many backends", so give the standard + * error message. XXX do we need to give a different failure message + * in the autovacuum case? + */ + SpinLockRelease(ProcStructLock); + if (am_walsender) + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)", + max_wal_senders))); + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + + /* + * Cross-check that the PGPROC is of the type we expect; if this were not + * the case, it would get returned to the wrong list. + */ + Assert(MyProc->procgloballist == procgloballist); + + /* + * Now that we have a PGPROC, mark ourselves as an active postmaster + * child; this is so that the postmaster can detect it if we exit without + * cleaning up. (XXX autovac launcher currently doesn't participate in + * this; it probably should.) + */ + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + MarkPostmasterChildActive(); + + /* + * Initialize all fields of MyProc, except for those previously + * initialized by InitProcGlobal. + */ + SHMQueueElemInit(&(MyProc->links)); + MyProc->waitStatus = PROC_WAIT_STATUS_OK; + MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + MyProc->xid = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; + MyProc->pid = MyProcPid; + /* backendId, databaseId and roleId will be filled in later */ + MyProc->backendId = InvalidBackendId; + MyProc->databaseId = InvalidOid; + MyProc->roleId = InvalidOid; + MyProc->tempNamespaceId = InvalidOid; + MyProc->isBackgroundWorker = IsBackgroundWorker; + MyProc->delayChkpt = 0; + MyProc->statusFlags = 0; + /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ + if (IsAutoVacuumWorkerProcess()) + MyProc->statusFlags |= PROC_IS_AUTOVACUUM; + MyProc->lwWaiting = false; + MyProc->lwWaitMode = 0; + MyProc->waitLock = NULL; + MyProc->waitProcLock = NULL; + pg_atomic_write_u64(&MyProc->waitStart, 0); +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i]))); + } +#endif + MyProc->recoveryConflictPending = false; + + /* Initialize fields for sync rep */ + MyProc->waitLSN = 0; + MyProc->syncRepState = SYNC_REP_NOT_WAITING; + SHMQueueElemInit(&(MyProc->syncRepLinks)); + + /* Initialize fields for group XID clearing. */ + MyProc->procArrayGroupMember = false; + MyProc->procArrayGroupMemberXid = InvalidTransactionId; + Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO); + + /* Check that group locking fields are in a proper initial state. */ + Assert(MyProc->lockGroupLeader == NULL); + Assert(dlist_is_empty(&MyProc->lockGroupMembers)); + + /* Initialize wait event information. */ + MyProc->wait_event_info = 0; + + /* Initialize fields for group transaction status update. */ + MyProc->clogGroupMember = false; + MyProc->clogGroupMemberXid = InvalidTransactionId; + MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS; + MyProc->clogGroupMemberPage = -1; + MyProc->clogGroupMemberLsn = InvalidXLogRecPtr; + Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO); + + /* + * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch + * on it. That allows us to repoint the process latch, which so far + * points to process local one, to the shared one. + */ + OwnLatch(&MyProc->procLatch); + SwitchToSharedLatch(); + + /* now that we have a proc, report wait events to shared memory */ + pgstat_set_wait_event_storage(&MyProc->wait_event_info); + + /* + * We might be reusing a semaphore that belonged to a failed process. So + * be careful and reinitialize its value here. (This is not strictly + * necessary anymore, but seems like a good idea for cleanliness.) + */ + PGSemaphoreReset(MyProc->sem); + + /* + * Arrange to clean up at backend exit. + */ + on_shmem_exit(ProcKill, 0); + + /* + * Now that we have a PGPROC, we could try to acquire locks, so initialize + * local state needed for LWLocks, and the deadlock checker. + */ + InitLWLockAccess(); + InitDeadLockChecking(); +} + +/* + * InitProcessPhase2 -- make MyProc visible in the shared ProcArray. + * + * This is separate from InitProcess because we can't acquire LWLocks until + * we've created a PGPROC, but in the EXEC_BACKEND case ProcArrayAdd won't + * work until after we've done CreateSharedMemoryAndSemaphores. + */ +void +InitProcessPhase2(void) +{ + Assert(MyProc != NULL); + + /* + * Add our PGPROC to the PGPROC array in shared memory. + */ + ProcArrayAdd(MyProc); + + /* + * Arrange to clean that up at backend exit. + */ + on_shmem_exit(RemoveProcFromArray, 0); +} + +/* + * InitAuxiliaryProcess -- create a per-auxiliary-process data structure + * + * This is called by bgwriter and similar processes so that they will have a + * MyProc value that's real enough to let them wait for LWLocks. The PGPROC + * and sema that are assigned are one of the extra ones created during + * InitProcGlobal. + * + * Auxiliary processes are presently not expected to wait for real (lockmgr) + * locks, so we need not set up the deadlock checker. They are never added + * to the ProcArray or the sinval messaging mechanism, either. They also + * don't get a VXID assigned, since this is only useful when we actually + * hold lockmgr locks. + * + * Startup process however uses locks but never waits for them in the + * normal backend sense. Startup process also takes part in sinval messaging + * as a sendOnly process, so never reads messages from sinval queue. So + * Startup process does have a VXID and does show up in pg_locks. + */ +void +InitAuxiliaryProcess(void) +{ + PGPROC *auxproc; + int proctype; + + /* + * ProcGlobal should be set up already (if we are a backend, we inherit + * this by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + if (ProcGlobal == NULL || AuxiliaryProcs == NULL) + elog(PANIC, "proc header uninitialized"); + + if (MyProc != NULL) + elog(ERROR, "you already exist"); + + /* + * We use the ProcStructLock to protect assignment and releasing of + * AuxiliaryProcs entries. + * + * While we are holding the ProcStructLock, also copy the current shared + * estimate of spins_per_delay to local storage. + */ + SpinLockAcquire(ProcStructLock); + + set_spins_per_delay(ProcGlobal->spins_per_delay); + + /* + * Find a free auxproc ... *big* trouble if there isn't one ... + */ + for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++) + { + auxproc = &AuxiliaryProcs[proctype]; + if (auxproc->pid == 0) + break; + } + if (proctype >= NUM_AUXILIARY_PROCS) + { + SpinLockRelease(ProcStructLock); + elog(FATAL, "all AuxiliaryProcs are in use"); + } + + /* Mark auxiliary proc as in use by me */ + /* use volatile pointer to prevent code rearrangement */ + ((volatile PGPROC *) auxproc)->pid = MyProcPid; + + MyProc = auxproc; + + SpinLockRelease(ProcStructLock); + + /* + * Initialize all fields of MyProc, except for those previously + * initialized by InitProcGlobal. + */ + SHMQueueElemInit(&(MyProc->links)); + MyProc->waitStatus = PROC_WAIT_STATUS_OK; + MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + MyProc->xid = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; + MyProc->backendId = InvalidBackendId; + MyProc->databaseId = InvalidOid; + MyProc->roleId = InvalidOid; + MyProc->tempNamespaceId = InvalidOid; + MyProc->isBackgroundWorker = IsBackgroundWorker; + MyProc->delayChkpt = 0; + MyProc->statusFlags = 0; + MyProc->lwWaiting = false; + MyProc->lwWaitMode = 0; + MyProc->waitLock = NULL; + MyProc->waitProcLock = NULL; + pg_atomic_write_u64(&MyProc->waitStart, 0); +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i]))); + } +#endif + + /* + * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch + * on it. That allows us to repoint the process latch, which so far + * points to process local one, to the shared one. + */ + OwnLatch(&MyProc->procLatch); + SwitchToSharedLatch(); + + /* now that we have a proc, report wait events to shared memory */ + pgstat_set_wait_event_storage(&MyProc->wait_event_info); + + /* Check that group locking fields are in a proper initial state. */ + Assert(MyProc->lockGroupLeader == NULL); + Assert(dlist_is_empty(&MyProc->lockGroupMembers)); + + /* + * We might be reusing a semaphore that belonged to a failed process. So + * be careful and reinitialize its value here. (This is not strictly + * necessary anymore, but seems like a good idea for cleanliness.) + */ + PGSemaphoreReset(MyProc->sem); + + /* + * Arrange to clean up at process exit. + */ + on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype)); +} + +/* + * Record the PID and PGPROC structures for the Startup process, for use in + * ProcSendSignal(). See comments there for further explanation. + */ +void +PublishStartupProcessInformation(void) +{ + SpinLockAcquire(ProcStructLock); + + ProcGlobal->startupProc = MyProc; + ProcGlobal->startupProcPid = MyProcPid; + + SpinLockRelease(ProcStructLock); +} + +/* + * Used from bufmgr to share the value of the buffer that Startup waits on, + * or to reset the value to "not waiting" (-1). This allows processing + * of recovery conflicts for buffer pins. Set is made before backends look + * at this value, so locking not required, especially since the set is + * an atomic integer set operation. + */ +void +SetStartupBufferPinWaitBufId(int bufid) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + + procglobal->startupBufferPinWaitBufId = bufid; +} + +/* + * Used by backends when they receive a request to check for buffer pin waits. + */ +int +GetStartupBufferPinWaitBufId(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + + return procglobal->startupBufferPinWaitBufId; +} + +/* + * Check whether there are at least N free PGPROC objects. + * + * Note: this is designed on the assumption that N will generally be small. + */ +bool +HaveNFreeProcs(int n) +{ + PGPROC *proc; + + SpinLockAcquire(ProcStructLock); + + proc = ProcGlobal->freeProcs; + + while (n > 0 && proc != NULL) + { + proc = (PGPROC *) proc->links.next; + n--; + } + + SpinLockRelease(ProcStructLock); + + return (n <= 0); +} + +/* + * Check if the current process is awaiting a lock. + */ +bool +IsWaitingForLock(void) +{ + if (lockAwaited == NULL) + return false; + + return true; +} + +/* + * Cancel any pending wait for lock, when aborting a transaction, and revert + * any strong lock count acquisition for a lock being acquired. + * + * (Normally, this would only happen if we accept a cancel/die + * interrupt while waiting; but an ereport(ERROR) before or during the lock + * wait is within the realm of possibility, too.) + */ +void +LockErrorCleanup(void) +{ + LWLock *partitionLock; + DisableTimeoutParams timeouts[2]; + + HOLD_INTERRUPTS(); + + AbortStrongLockAcquire(); + + /* Nothing to do if we weren't waiting for a lock */ + if (lockAwaited == NULL) + { + RESUME_INTERRUPTS(); + return; + } + + /* + * Turn off the deadlock and lock timeout timers, if they are still + * running (see ProcSleep). Note we must preserve the LOCK_TIMEOUT + * indicator flag, since this function is executed before + * ProcessInterrupts when responding to SIGINT; else we'd lose the + * knowledge that the SIGINT came from a lock timeout and not an external + * source. + */ + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].keep_indicator = false; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].keep_indicator = true; + disable_timeouts(timeouts, 2); + + /* Unlink myself from the wait queue, if on it (might not be anymore!) */ + partitionLock = LockHashPartitionLock(lockAwaited->hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + if (MyProc->links.next != NULL) + { + /* We could not have been granted the lock yet */ + RemoveFromWaitQueue(MyProc, lockAwaited->hashcode); + } + else + { + /* + * Somebody kicked us off the lock queue already. Perhaps they + * granted us the lock, or perhaps they detected a deadlock. If they + * did grant us the lock, we'd better remember it in our local lock + * table. + */ + if (MyProc->waitStatus == PROC_WAIT_STATUS_OK) + GrantAwaitedLock(); + } + + lockAwaited = NULL; + + LWLockRelease(partitionLock); + + RESUME_INTERRUPTS(); +} + + +/* + * ProcReleaseLocks() -- release locks associated with current transaction + * at main transaction commit or abort + * + * At main transaction commit, we release standard locks except session locks. + * At main transaction abort, we release all locks including session locks. + * + * Advisory locks are released only if they are transaction-level; + * session-level holds remain, whether this is a commit or not. + * + * At subtransaction commit, we don't release any locks (so this func is not + * needed at all); we will defer the releasing to the parent transaction. + * At subtransaction abort, we release all locks held by the subtransaction; + * this is implemented by retail releasing of the locks under control of + * the ResourceOwner mechanism. + */ +void +ProcReleaseLocks(bool isCommit) +{ + if (!MyProc) + return; + /* If waiting, get off wait queue (should only be needed after error) */ + LockErrorCleanup(); + /* Release standard locks, including session-level if aborting */ + LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit); + /* Release transaction-level advisory locks */ + LockReleaseAll(USER_LOCKMETHOD, false); +} + + +/* + * RemoveProcFromArray() -- Remove this process from the shared ProcArray. + */ +static void +RemoveProcFromArray(int code, Datum arg) +{ + Assert(MyProc != NULL); + ProcArrayRemove(MyProc, InvalidTransactionId); +} + +/* + * ProcKill() -- Destroy the per-proc data structure for + * this process. Release any of its held LW locks. + */ +static void +ProcKill(int code, Datum arg) +{ + PGPROC *proc; + PGPROC *volatile *procgloballist; + + Assert(MyProc != NULL); + + /* Make sure we're out of the sync rep lists */ + SyncRepCleanupAtProcExit(); + +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(SHMQueueEmpty(&(MyProc->myProcLocks[i]))); + } +#endif + + /* + * Release any LW locks I am holding. There really shouldn't be any, but + * it's cheap to check again before we cut the knees off the LWLock + * facility by releasing our PGPROC ... + */ + LWLockReleaseAll(); + + /* Cancel any pending condition variable sleep, too */ + ConditionVariableCancelSleep(); + + /* Make sure active replication slots are released */ + if (MyReplicationSlot != NULL) + ReplicationSlotRelease(); + + /* Also cleanup all the temporary slots. */ + ReplicationSlotCleanup(); + + /* + * Detach from any lock group of which we are a member. If the leader + * exist before all other group members, its PGPROC will remain allocated + * until the last group process exits; that process must return the + * leader's PGPROC to the appropriate list. + */ + if (MyProc->lockGroupLeader != NULL) + { + PGPROC *leader = MyProc->lockGroupLeader; + LWLock *leader_lwlock = LockHashPartitionLockByProc(leader); + + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + Assert(!dlist_is_empty(&leader->lockGroupMembers)); + dlist_delete(&MyProc->lockGroupLink); + if (dlist_is_empty(&leader->lockGroupMembers)) + { + leader->lockGroupLeader = NULL; + if (leader != MyProc) + { + procgloballist = leader->procgloballist; + + /* Leader exited first; return its PGPROC. */ + SpinLockAcquire(ProcStructLock); + leader->links.next = (SHM_QUEUE *) *procgloballist; + *procgloballist = leader; + SpinLockRelease(ProcStructLock); + } + } + else if (leader != MyProc) + MyProc->lockGroupLeader = NULL; + LWLockRelease(leader_lwlock); + } + + /* + * Reset MyLatch to the process local one. This is so that signal + * handlers et al can continue using the latch after the shared latch + * isn't ours anymore. + * + * Similarly, stop reporting wait events to MyProc->wait_event_info. + * + * After that clear MyProc and disown the shared latch. + */ + SwitchBackToLocalLatch(); + pgstat_reset_wait_event_storage(); + + proc = MyProc; + MyProc = NULL; + DisownLatch(&proc->procLatch); + + procgloballist = proc->procgloballist; + SpinLockAcquire(ProcStructLock); + + /* + * If we're still a member of a locking group, that means we're a leader + * which has somehow exited before its children. The last remaining child + * will release our PGPROC. Otherwise, release it now. + */ + if (proc->lockGroupLeader == NULL) + { + /* Since lockGroupLeader is NULL, lockGroupMembers should be empty. */ + Assert(dlist_is_empty(&proc->lockGroupMembers)); + + /* Return PGPROC structure (and semaphore) to appropriate freelist */ + proc->links.next = (SHM_QUEUE *) *procgloballist; + *procgloballist = proc; + } + + /* Update shared estimate of spins_per_delay */ + ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); + + SpinLockRelease(ProcStructLock); + + /* + * This process is no longer present in shared memory in any meaningful + * way, so tell the postmaster we've cleaned up acceptably well. (XXX + * autovac launcher should be included here someday) + */ + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + MarkPostmasterChildInactive(); + + /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */ + if (AutovacuumLauncherPid != 0) + kill(AutovacuumLauncherPid, SIGUSR2); +} + +/* + * AuxiliaryProcKill() -- Cut-down version of ProcKill for auxiliary + * processes (bgwriter, etc). The PGPROC and sema are not released, only + * marked as not-in-use. + */ +static void +AuxiliaryProcKill(int code, Datum arg) +{ + int proctype = DatumGetInt32(arg); + PGPROC *auxproc PG_USED_FOR_ASSERTS_ONLY; + PGPROC *proc; + + Assert(proctype >= 0 && proctype < NUM_AUXILIARY_PROCS); + + auxproc = &AuxiliaryProcs[proctype]; + + Assert(MyProc == auxproc); + + /* Release any LW locks I am holding (see notes above) */ + LWLockReleaseAll(); + + /* Cancel any pending condition variable sleep, too */ + ConditionVariableCancelSleep(); + + /* look at the equivalent ProcKill() code for comments */ + SwitchBackToLocalLatch(); + pgstat_reset_wait_event_storage(); + + proc = MyProc; + MyProc = NULL; + DisownLatch(&proc->procLatch); + + SpinLockAcquire(ProcStructLock); + + /* Mark auxiliary proc no longer in use */ + proc->pid = 0; + + /* Update shared estimate of spins_per_delay */ + ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); + + SpinLockRelease(ProcStructLock); +} + +/* + * AuxiliaryPidGetProc -- get PGPROC for an auxiliary process + * given its PID + * + * Returns NULL if not found. + */ +PGPROC * +AuxiliaryPidGetProc(int pid) +{ + PGPROC *result = NULL; + int index; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + for (index = 0; index < NUM_AUXILIARY_PROCS; index++) + { + PGPROC *proc = &AuxiliaryProcs[index]; + + if (proc->pid == pid) + { + result = proc; + break; + } + } + return result; +} + +/* + * ProcQueue package: routines for putting processes to sleep + * and waking them up + */ + +/* + * ProcQueueAlloc -- alloc/attach to a shared memory process queue + * + * Returns: a pointer to the queue + * Side Effects: Initializes the queue if it wasn't there before + */ +#ifdef NOT_USED +PROC_QUEUE * +ProcQueueAlloc(const char *name) +{ + PROC_QUEUE *queue; + bool found; + + queue = (PROC_QUEUE *) + ShmemInitStruct(name, sizeof(PROC_QUEUE), &found); + + if (!found) + ProcQueueInit(queue); + + return queue; +} +#endif + +/* + * ProcQueueInit -- initialize a shared memory process queue + */ +void +ProcQueueInit(PROC_QUEUE *queue) +{ + SHMQueueInit(&(queue->links)); + queue->size = 0; +} + + +/* + * ProcSleep -- put a process to sleep on the specified lock + * + * Caller must have set MyProc->heldLocks to reflect locks already held + * on the lockable object by this process (under all XIDs). + * + * The lock table's partition lock must be held at entry, and will be held + * at exit. + * + * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock). + * + * ASSUME: that no one will fiddle with the queue until after + * we release the partition lock. + * + * NOTES: The process queue is now a priority queue for locking. + */ +ProcWaitStatus +ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) +{ + LOCKMODE lockmode = locallock->tag.mode; + LOCK *lock = locallock->lock; + PROCLOCK *proclock = locallock->proclock; + uint32 hashcode = locallock->hashcode; + LWLock *partitionLock = LockHashPartitionLock(hashcode); + PROC_QUEUE *waitQueue = &(lock->waitProcs); + LOCKMASK myHeldLocks = MyProc->heldLocks; + TimestampTz standbyWaitStart = 0; + bool early_deadlock = false; + bool allow_autovacuum_cancel = true; + bool logged_recovery_conflict = false; + ProcWaitStatus myWaitStatus; + PGPROC *proc; + PGPROC *leader = MyProc->lockGroupLeader; + int i; + + /* + * If group locking is in use, locks held by members of my locking group + * need to be included in myHeldLocks. This is not required for relation + * extension or page locks which conflict among group members. However, + * including them in myHeldLocks will give group members the priority to + * get those locks as compared to other backends which are also trying to + * acquire those locks. OTOH, we can avoid giving priority to group + * members for that kind of locks, but there doesn't appear to be a clear + * advantage of the same. + */ + if (leader != NULL) + { + SHM_QUEUE *procLocks = &(lock->procLocks); + PROCLOCK *otherproclock; + + otherproclock = (PROCLOCK *) + SHMQueueNext(procLocks, procLocks, offsetof(PROCLOCK, lockLink)); + while (otherproclock != NULL) + { + if (otherproclock->groupLeader == leader) + myHeldLocks |= otherproclock->holdMask; + otherproclock = (PROCLOCK *) + SHMQueueNext(procLocks, &otherproclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + } + + /* + * Determine where to add myself in the wait queue. + * + * Normally I should go at the end of the queue. However, if I already + * hold locks that conflict with the request of any previous waiter, put + * myself in the queue just in front of the first such waiter. This is not + * a necessary step, since deadlock detection would move me to before that + * waiter anyway; but it's relatively cheap to detect such a conflict + * immediately, and avoid delaying till deadlock timeout. + * + * Special case: if I find I should go in front of some waiter, check to + * see if I conflict with already-held locks or the requests before that + * waiter. If not, then just grant myself the requested lock immediately. + * This is the same as the test for immediate grant in LockAcquire, except + * we are only considering the part of the wait queue before my insertion + * point. + */ + if (myHeldLocks != 0) + { + LOCKMASK aheadRequests = 0; + + proc = (PGPROC *) waitQueue->links.next; + for (i = 0; i < waitQueue->size; i++) + { + /* + * If we're part of the same locking group as this waiter, its + * locks neither conflict with ours nor contribute to + * aheadRequests. + */ + if (leader != NULL && leader == proc->lockGroupLeader) + { + proc = (PGPROC *) proc->links.next; + continue; + } + /* Must he wait for me? */ + if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks) + { + /* Must I wait for him ? */ + if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks) + { + /* + * Yes, so we have a deadlock. Easiest way to clean up + * correctly is to call RemoveFromWaitQueue(), but we + * can't do that until we are *on* the wait queue. So, set + * a flag to check below, and break out of loop. Also, + * record deadlock info for later message. + */ + RememberSimpleDeadLock(MyProc, lockmode, lock, proc); + early_deadlock = true; + break; + } + /* I must go before this waiter. Check special case. */ + if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 && + !LockCheckConflicts(lockMethodTable, lockmode, lock, + proclock)) + { + /* Skip the wait and just grant myself the lock. */ + GrantLock(lock, proclock, lockmode); + GrantAwaitedLock(); + return PROC_WAIT_STATUS_OK; + } + /* Break out of loop to put myself before him */ + break; + } + /* Nope, so advance to next waiter */ + aheadRequests |= LOCKBIT_ON(proc->waitLockMode); + proc = (PGPROC *) proc->links.next; + } + + /* + * If we fall out of loop normally, proc points to waitQueue head, so + * we will insert at tail of queue as desired. + */ + } + else + { + /* I hold no locks, so I can't push in front of anyone. */ + proc = (PGPROC *) &(waitQueue->links); + } + + /* + * Insert self into queue, ahead of the given proc (or at tail of queue). + */ + SHMQueueInsertBefore(&(proc->links), &(MyProc->links)); + waitQueue->size++; + + lock->waitMask |= LOCKBIT_ON(lockmode); + + /* Set up wait information in PGPROC object, too */ + MyProc->waitLock = lock; + MyProc->waitProcLock = proclock; + MyProc->waitLockMode = lockmode; + + MyProc->waitStatus = PROC_WAIT_STATUS_WAITING; + + /* + * If we detected deadlock, give up without waiting. This must agree with + * CheckDeadLock's recovery code. + */ + if (early_deadlock) + { + RemoveFromWaitQueue(MyProc, hashcode); + return PROC_WAIT_STATUS_ERROR; + } + + /* mark that we are waiting for a lock */ + lockAwaited = locallock; + + /* + * Release the lock table's partition lock. + * + * NOTE: this may also cause us to exit critical-section state, possibly + * allowing a cancel/die interrupt to be accepted. This is OK because we + * have recorded the fact that we are waiting for a lock, and so + * LockErrorCleanup will clean up if cancel/die happens. + */ + LWLockRelease(partitionLock); + + /* + * Also, now that we will successfully clean up after an ereport, it's + * safe to check to see if there's a buffer pin deadlock against the + * Startup process. Of course, that's only necessary if we're doing Hot + * Standby and are not the Startup process ourselves. + */ + if (RecoveryInProgress() && !InRecovery) + CheckRecoveryConflictDeadlock(); + + /* Reset deadlock_state before enabling the timeout handler */ + deadlock_state = DS_NOT_YET_CHECKED; + got_deadlock_timeout = false; + + /* + * Set timer so we can wake up after awhile and check for a deadlock. If a + * deadlock is detected, the handler sets MyProc->waitStatus = + * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure + * rather than success. + * + * By delaying the check until we've waited for a bit, we can avoid + * running the rather expensive deadlock-check code in most cases. + * + * If LockTimeout is set, also enable the timeout for that. We can save a + * few cycles by enabling both timeout sources in one call. + * + * If InHotStandby we set lock waits slightly later for clarity with other + * code. + */ + if (!InHotStandby) + { + if (LockTimeout > 0) + { + EnableTimeoutParams timeouts[2]; + + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].type = TMPARAM_AFTER; + timeouts[0].delay_ms = DeadlockTimeout; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].type = TMPARAM_AFTER; + timeouts[1].delay_ms = LockTimeout; + enable_timeouts(timeouts, 2); + } + else + enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout); + + /* + * Use the current time obtained for the deadlock timeout timer as + * waitStart (i.e., the time when this process started waiting for the + * lock). Since getting the current time newly can cause overhead, we + * reuse the already-obtained time to avoid that overhead. + * + * Note that waitStart is updated without holding the lock table's + * partition lock, to avoid the overhead by additional lock + * acquisition. This can cause "waitstart" in pg_locks to become NULL + * for a very short period of time after the wait started even though + * "granted" is false. This is OK in practice because we can assume + * that users are likely to look at "waitstart" when waiting for the + * lock for a long time. + */ + pg_atomic_write_u64(&MyProc->waitStart, + get_timeout_start_time(DEADLOCK_TIMEOUT)); + } + else if (log_recovery_conflict_waits) + { + /* + * Set the wait start timestamp if logging is enabled and in hot + * standby. + */ + standbyWaitStart = GetCurrentTimestamp(); + } + + /* + * If somebody wakes us between LWLockRelease and WaitLatch, the latch + * will not wait. But a set latch does not necessarily mean that the lock + * is free now, as there are many other sources for latch sets than + * somebody releasing the lock. + * + * We process interrupts whenever the latch has been set, so cancel/die + * interrupts are processed quickly. This means we must not mind losing + * control to a cancel/die interrupt here. We don't, because we have no + * shared-state-change work to do after being granted the lock (the + * grantor did it all). We do have to worry about canceling the deadlock + * timeout and updating the locallock table, but if we lose control to an + * error, LockErrorCleanup will fix that up. + */ + do + { + if (InHotStandby) + { + bool maybe_log_conflict = + (standbyWaitStart != 0 && !logged_recovery_conflict); + + /* Set a timer and wait for that or for the lock to be granted */ + ResolveRecoveryConflictWithLock(locallock->tag.lock, + maybe_log_conflict); + + /* + * Emit the log message if the startup process is waiting longer + * than deadlock_timeout for recovery conflict on lock. + */ + if (maybe_log_conflict) + { + TimestampTz now = GetCurrentTimestamp(); + + if (TimestampDifferenceExceeds(standbyWaitStart, now, + DeadlockTimeout)) + { + VirtualTransactionId *vxids; + int cnt; + + vxids = GetLockConflicts(&locallock->tag.lock, + AccessExclusiveLock, &cnt); + + /* + * Log the recovery conflict and the list of PIDs of + * backends holding the conflicting lock. Note that we do + * logging even if there are no such backends right now + * because the startup process here has already waited + * longer than deadlock_timeout. + */ + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + standbyWaitStart, now, + cnt > 0 ? vxids : NULL, true); + logged_recovery_conflict = true; + } + } + } + else + { + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + PG_WAIT_LOCK | locallock->tag.lock.locktag_type); + ResetLatch(MyLatch); + /* check for deadlocks first, as that's probably log-worthy */ + if (got_deadlock_timeout) + { + CheckDeadLock(); + got_deadlock_timeout = false; + } + CHECK_FOR_INTERRUPTS(); + } + + /* + * waitStatus could change from PROC_WAIT_STATUS_WAITING to something + * else asynchronously. Read it just once per loop to prevent + * surprising behavior (such as missing log messages). + */ + myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus); + + /* + * If we are not deadlocked, but are waiting on an autovacuum-induced + * task, send a signal to interrupt it. + */ + if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel) + { + PGPROC *autovac = GetBlockingAutoVacuumPgproc(); + uint8 statusFlags; + uint8 lockmethod_copy; + LOCKTAG locktag_copy; + + /* + * Grab info we need, then release lock immediately. Note this + * coding means that there is a tiny chance that the process + * terminates its current transaction and starts a different one + * before we have a change to send the signal; the worst possible + * consequence is that a for-wraparound vacuum is cancelled. But + * that could happen in any case unless we were to do kill() with + * the lock held, which is much more undesirable. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff]; + lockmethod_copy = lock->tag.locktag_lockmethodid; + locktag_copy = lock->tag; + LWLockRelease(ProcArrayLock); + + /* + * Only do it if the worker is not working to protect against Xid + * wraparound. + */ + if ((statusFlags & PROC_IS_AUTOVACUUM) && + !(statusFlags & PROC_VACUUM_FOR_WRAPAROUND)) + { + int pid = autovac->pid; + + /* report the case, if configured to do so */ + if (message_level_is_interesting(DEBUG1)) + { + StringInfoData locktagbuf; + StringInfoData logbuf; /* errdetail for server log */ + + initStringInfo(&locktagbuf); + initStringInfo(&logbuf); + DescribeLockTag(&locktagbuf, &locktag_copy); + appendStringInfo(&logbuf, + "Process %d waits for %s on %s.", + MyProcPid, + GetLockmodeName(lockmethod_copy, lockmode), + locktagbuf.data); + + ereport(DEBUG1, + (errmsg_internal("sending cancel to blocking autovacuum PID %d", + pid), + errdetail_log("%s", logbuf.data))); + + pfree(locktagbuf.data); + pfree(logbuf.data); + } + + /* send the autovacuum worker Back to Old Kent Road */ + if (kill(pid, SIGINT) < 0) + { + /* + * There's a race condition here: once we release the + * ProcArrayLock, it's possible for the autovac worker to + * close up shop and exit before we can do the kill(). + * Therefore, we do not whinge about no-such-process. + * Other errors such as EPERM could conceivably happen if + * the kernel recycles the PID fast enough, but such cases + * seem improbable enough that it's probably best to issue + * a warning if we see some other errno. + */ + if (errno != ESRCH) + ereport(WARNING, + (errmsg("could not send signal to process %d: %m", + pid))); + } + } + + /* prevent signal from being sent again more than once */ + allow_autovacuum_cancel = false; + } + + /* + * If awoken after the deadlock check interrupt has run, and + * log_lock_waits is on, then report about the wait. + */ + if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED) + { + StringInfoData buf, + lock_waiters_sbuf, + lock_holders_sbuf; + const char *modename; + long secs; + int usecs; + long msecs; + SHM_QUEUE *procLocks; + PROCLOCK *proclock; + bool first_holder = true, + first_waiter = true; + int lockHoldersNum = 0; + + initStringInfo(&buf); + initStringInfo(&lock_waiters_sbuf); + initStringInfo(&lock_holders_sbuf); + + DescribeLockTag(&buf, &locallock->tag.lock); + modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid, + lockmode); + TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT), + GetCurrentTimestamp(), + &secs, &usecs); + msecs = secs * 1000 + usecs / 1000; + usecs = usecs % 1000; + + /* + * we loop over the lock's procLocks to gather a list of all + * holders and waiters. Thus we will be able to provide more + * detailed information for lock debugging purposes. + * + * lock->procLocks contains all processes which hold or wait for + * this lock. + */ + + LWLockAcquire(partitionLock, LW_SHARED); + + procLocks = &(lock->procLocks); + proclock = (PROCLOCK *) SHMQueueNext(procLocks, procLocks, + offsetof(PROCLOCK, lockLink)); + + while (proclock) + { + /* + * we are a waiter if myProc->waitProcLock == proclock; we are + * a holder if it is NULL or something different + */ + if (proclock->tag.myProc->waitProcLock == proclock) + { + if (first_waiter) + { + appendStringInfo(&lock_waiters_sbuf, "%d", + proclock->tag.myProc->pid); + first_waiter = false; + } + else + appendStringInfo(&lock_waiters_sbuf, ", %d", + proclock->tag.myProc->pid); + } + else + { + if (first_holder) + { + appendStringInfo(&lock_holders_sbuf, "%d", + proclock->tag.myProc->pid); + first_holder = false; + } + else + appendStringInfo(&lock_holders_sbuf, ", %d", + proclock->tag.myProc->pid); + + lockHoldersNum++; + } + + proclock = (PROCLOCK *) SHMQueueNext(procLocks, &proclock->lockLink, + offsetof(PROCLOCK, lockLink)); + } + + LWLockRelease(partitionLock); + + if (deadlock_state == DS_SOFT_DEADLOCK) + ereport(LOG, + (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + else if (deadlock_state == DS_HARD_DEADLOCK) + { + /* + * This message is a bit redundant with the error that will be + * reported subsequently, but in some cases the error report + * might not make it to the log (eg, if it's caught by an + * exception handler), and we want to ensure all long-wait + * events get logged. + */ + ereport(LOG, + (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + } + + if (myWaitStatus == PROC_WAIT_STATUS_WAITING) + ereport(LOG, + (errmsg("process %d still waiting for %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + else if (myWaitStatus == PROC_WAIT_STATUS_OK) + ereport(LOG, + (errmsg("process %d acquired %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs))); + else + { + Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR); + + /* + * Currently, the deadlock checker always kicks its own + * process, which means that we'll only see + * PROC_WAIT_STATUS_ERROR when deadlock_state == + * DS_HARD_DEADLOCK, and there's no need to print redundant + * messages. But for completeness and future-proofing, print + * a message if it looks like someone else kicked us off the + * lock. + */ + if (deadlock_state != DS_HARD_DEADLOCK) + ereport(LOG, + (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + } + + /* + * At this point we might still need to wait for the lock. Reset + * state so we don't print the above messages again. + */ + deadlock_state = DS_NO_DEADLOCK; + + pfree(buf.data); + pfree(lock_holders_sbuf.data); + pfree(lock_waiters_sbuf.data); + } + } while (myWaitStatus == PROC_WAIT_STATUS_WAITING); + + /* + * Disable the timers, if they are still running. As in LockErrorCleanup, + * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has + * already caused QueryCancelPending to become set, we want the cancel to + * be reported as a lock timeout, not a user cancel. + */ + if (!InHotStandby) + { + if (LockTimeout > 0) + { + DisableTimeoutParams timeouts[2]; + + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].keep_indicator = false; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].keep_indicator = true; + disable_timeouts(timeouts, 2); + } + else + disable_timeout(DEADLOCK_TIMEOUT, false); + } + + /* + * Emit the log message if recovery conflict on lock was resolved but the + * startup process waited longer than deadlock_timeout for it. + */ + if (InHotStandby && logged_recovery_conflict) + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + standbyWaitStart, GetCurrentTimestamp(), + NULL, false); + + /* + * Re-acquire the lock table's partition lock. We have to do this to hold + * off cancel/die interrupts before we can mess with lockAwaited (else we + * might have a missed or duplicated locallock update). + */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * We no longer want LockErrorCleanup to do anything. + */ + lockAwaited = NULL; + + /* + * If we got the lock, be sure to remember it in the locallock table. + */ + if (MyProc->waitStatus == PROC_WAIT_STATUS_OK) + GrantAwaitedLock(); + + /* + * We don't have to do anything else, because the awaker did all the + * necessary update of the lock table and MyProc. + */ + return MyProc->waitStatus; +} + + +/* + * ProcWakeup -- wake up a process by setting its latch. + * + * Also remove the process from the wait queue and set its links invalid. + * RETURN: the next process in the wait queue. + * + * The appropriate lock partition lock must be held by caller. + * + * XXX: presently, this code is only used for the "success" case, and only + * works correctly for that case. To clean up in failure case, would need + * to twiddle the lock's request counts too --- see RemoveFromWaitQueue. + * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK. + */ +PGPROC * +ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus) +{ + PGPROC *retProc; + + /* Proc should be sleeping ... */ + if (proc->links.prev == NULL || + proc->links.next == NULL) + return NULL; + Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING); + + /* Save next process before we zap the list link */ + retProc = (PGPROC *) proc->links.next; + + /* Remove process from wait queue */ + SHMQueueDelete(&(proc->links)); + (proc->waitLock->waitProcs.size)--; + + /* Clean up process' state and pass it the ok/fail signal */ + proc->waitLock = NULL; + proc->waitProcLock = NULL; + proc->waitStatus = waitStatus; + pg_atomic_write_u64(&MyProc->waitStart, 0); + + /* And awaken it */ + SetLatch(&proc->procLatch); + + return retProc; +} + +/* + * ProcLockWakeup -- routine for waking up processes when a lock is + * released (or a prior waiter is aborted). Scan all waiters + * for lock, waken any that are no longer blocked. + * + * The appropriate lock partition lock must be held by caller. + */ +void +ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock) +{ + PROC_QUEUE *waitQueue = &(lock->waitProcs); + int queue_size = waitQueue->size; + PGPROC *proc; + LOCKMASK aheadRequests = 0; + + Assert(queue_size >= 0); + + if (queue_size == 0) + return; + + proc = (PGPROC *) waitQueue->links.next; + + while (queue_size-- > 0) + { + LOCKMODE lockmode = proc->waitLockMode; + + /* + * Waken if (a) doesn't conflict with requests of earlier waiters, and + * (b) doesn't conflict with already-held locks. + */ + if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 && + !LockCheckConflicts(lockMethodTable, lockmode, lock, + proc->waitProcLock)) + { + /* OK to waken */ + GrantLock(lock, proc->waitProcLock, lockmode); + proc = ProcWakeup(proc, PROC_WAIT_STATUS_OK); + + /* + * ProcWakeup removes proc from the lock's waiting process queue + * and returns the next proc in chain; don't use proc's next-link, + * because it's been cleared. + */ + } + else + { + /* + * Cannot wake this guy. Remember his request for later checks. + */ + aheadRequests |= LOCKBIT_ON(lockmode); + proc = (PGPROC *) proc->links.next; + } + } + + Assert(waitQueue->size >= 0); +} + +/* + * CheckDeadLock + * + * We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a + * lock to be released by some other process. Check if there's a deadlock; if + * not, just return. (But signal ProcSleep to log a message, if + * log_lock_waits is true.) If we have a real deadlock, remove ourselves from + * the lock's wait queue and signal an error to ProcSleep. + */ +static void +CheckDeadLock(void) +{ + int i; + + /* + * Acquire exclusive lock on the entire shared lock data structures. Must + * grab LWLocks in partition-number order to avoid LWLock deadlock. + * + * Note that the deadlock check interrupt had better not be enabled + * anywhere that this process itself holds lock partition locks, else this + * will wait forever. Also note that LWLockAcquire creates a critical + * section, so that this routine cannot be interrupted by cancel/die + * interrupts. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + + /* + * Check to see if we've been awoken by anyone in the interim. + * + * If we have, we can return and resume our transaction -- happy day. + * Before we are awoken the process releasing the lock grants it to us so + * we know that we don't have to wait anymore. + * + * We check by looking to see if we've been unlinked from the wait queue. + * This is safe because we hold the lock partition lock. + */ + if (MyProc->links.prev == NULL || + MyProc->links.next == NULL) + goto check_done; + +#ifdef LOCK_DEBUG + if (Debug_deadlocks) + DumpAllLocks(); +#endif + + /* Run the deadlock check, and set deadlock_state for use by ProcSleep */ + deadlock_state = DeadLockCheck(MyProc); + + if (deadlock_state == DS_HARD_DEADLOCK) + { + /* + * Oops. We have a deadlock. + * + * Get this process out of wait state. (Note: we could do this more + * efficiently by relying on lockAwaited, but use this coding to + * preserve the flexibility to kill some other transaction than the + * one detecting the deadlock.) + * + * RemoveFromWaitQueue sets MyProc->waitStatus to + * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we + * return from the signal handler. + */ + Assert(MyProc->waitLock != NULL); + RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag))); + + /* + * We're done here. Transaction abort caused by the error that + * ProcSleep will raise will cause any other locks we hold to be + * released, thus allowing other processes to wake up; we don't need + * to do that here. NOTE: an exception is that releasing locks we + * hold doesn't consider the possibility of waiters that were blocked + * behind us on the lock we just failed to get, and might now be + * wakable because we're not in front of them anymore. However, + * RemoveFromWaitQueue took care of waking up any such processes. + */ + } + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ +check_done: + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); +} + +/* + * CheckDeadLockAlert - Handle the expiry of deadlock_timeout. + * + * NB: Runs inside a signal handler, be careful. + */ +void +CheckDeadLockAlert(void) +{ + int save_errno = errno; + + got_deadlock_timeout = true; + + /* + * Have to set the latch again, even if handle_sig_alarm already did. Back + * then got_deadlock_timeout wasn't yet set... It's unlikely that this + * ever would be a problem, but setting a set latch again is cheap. + * + * Note that, when this function runs inside procsignal_sigusr1_handler(), + * the handler function sets the latch again after the latch is set here. + */ + SetLatch(MyLatch); + errno = save_errno; +} + +/* + * ProcWaitForSignal - wait for a signal from another backend. + * + * As this uses the generic process latch the caller has to be robust against + * unrelated wakeups: Always check that the desired state has occurred, and + * wait again if not. + */ +void +ProcWaitForSignal(uint32 wait_event_info) +{ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + wait_event_info); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); +} + +/* + * ProcSendSignal - send a signal to a backend identified by PID + */ +void +ProcSendSignal(int pid) +{ + PGPROC *proc = NULL; + + if (RecoveryInProgress()) + { + SpinLockAcquire(ProcStructLock); + + /* + * Check to see whether it is the Startup process we wish to signal. + * This call is made by the buffer manager when it wishes to wake up a + * process that has been waiting for a pin in so it can obtain a + * cleanup lock using LockBufferForCleanup(). Startup is not a normal + * backend, so BackendPidGetProc() will not return any pid at all. So + * we remember the information for this special case. + */ + if (pid == ProcGlobal->startupProcPid) + proc = ProcGlobal->startupProc; + + SpinLockRelease(ProcStructLock); + } + + if (proc == NULL) + proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + SetLatch(&proc->procLatch); + } +} + +/* + * BecomeLockGroupLeader - designate process as lock group leader + * + * Once this function has returned, other processes can join the lock group + * by calling BecomeLockGroupMember. + */ +void +BecomeLockGroupLeader(void) +{ + LWLock *leader_lwlock; + + /* If we already did it, we don't need to do it again. */ + if (MyProc->lockGroupLeader == MyProc) + return; + + /* We had better not be a follower. */ + Assert(MyProc->lockGroupLeader == NULL); + + /* Create single-member group, containing only ourselves. */ + leader_lwlock = LockHashPartitionLockByProc(MyProc); + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + MyProc->lockGroupLeader = MyProc; + dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink); + LWLockRelease(leader_lwlock); +} + +/* + * BecomeLockGroupMember - designate process as lock group member + * + * This is pretty straightforward except for the possibility that the leader + * whose group we're trying to join might exit before we manage to do so; + * and the PGPROC might get recycled for an unrelated process. To avoid + * that, we require the caller to pass the PID of the intended PGPROC as + * an interlock. Returns true if we successfully join the intended lock + * group, and false if not. + */ +bool +BecomeLockGroupMember(PGPROC *leader, int pid) +{ + LWLock *leader_lwlock; + bool ok = false; + + /* Group leader can't become member of group */ + Assert(MyProc != leader); + + /* Can't already be a member of a group */ + Assert(MyProc->lockGroupLeader == NULL); + + /* PID must be valid. */ + Assert(pid != 0); + + /* + * Get lock protecting the group fields. Note LockHashPartitionLockByProc + * accesses leader->pgprocno in a PGPROC that might be free. This is safe + * because all PGPROCs' pgprocno fields are set during shared memory + * initialization and never change thereafter; so we will acquire the + * correct lock even if the leader PGPROC is in process of being recycled. + */ + leader_lwlock = LockHashPartitionLockByProc(leader); + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + + /* Is this the leader we're looking for? */ + if (leader->pid == pid && leader->lockGroupLeader == leader) + { + /* OK, join the group */ + ok = true; + MyProc->lockGroupLeader = leader; + dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink); + } + LWLockRelease(leader_lwlock); + + return ok; +} diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c new file mode 100644 index 0000000..2dc2d67 --- /dev/null +++ b/src/backend/storage/lmgr/s_lock.c @@ -0,0 +1,377 @@ +/*------------------------------------------------------------------------- + * + * s_lock.c + * Hardware-dependent implementation of spinlocks. + * + * When waiting for a contended spinlock we loop tightly for awhile, then + * delay using pg_usleep() and try again. Preferably, "awhile" should be a + * small multiple of the maximum time we expect a spinlock to be held. 100 + * iterations seems about right as an initial guess. However, on a + * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario + * it's usually better to spin a bit longer than to call the kernel, so we try + * to adapt the spin loop count depending on whether we seem to be in a + * uniprocessor or multiprocessor. + * + * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd + * be wrong; there are platforms where that can result in a "stuck + * spinlock" failure. This has been seen particularly on Alphas; it seems + * that the first TAS after returning from kernel space will always fail + * on that hardware. + * + * Once we do decide to block, we use randomly increasing pg_usleep() + * delays. The first delay is 1 msec, then the delay randomly increases to + * about one second, after which we reset to 1 msec and start again. The + * idea here is that in the presence of heavy contention we need to + * increase the delay, else the spinlock holder may never get to run and + * release the lock. (Consider situation where spinlock holder has been + * nice'd down in priority by the scheduler --- it will not get scheduled + * until all would-be acquirers are sleeping, so if we always use a 1-msec + * sleep, there is a real possibility of starvation.) But we can't just + * clamp the delay to an upper bound, else it would take a long time to + * make a reasonable number of tries. + * + * We time out and declare error after NUM_DELAYS delays (thus, exactly + * that many tries). With the given settings, this will usually take 2 or + * so minutes. It seems better to fix the total number of tries (and thus + * the probability of unintended failure) than to fix the total time + * spent. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/s_lock.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <time.h> +#include <unistd.h> + +#include "port/atomics.h" +#include "storage/s_lock.h" + +#define MIN_SPINS_PER_DELAY 10 +#define MAX_SPINS_PER_DELAY 1000 +#define NUM_DELAYS 1000 +#define MIN_DELAY_USEC 1000L +#define MAX_DELAY_USEC 1000000L + + +slock_t dummy_spinlock; + +static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; + + +/* + * s_lock_stuck() - complain about a stuck spinlock + */ +static void +s_lock_stuck(const char *file, int line, const char *func) +{ + if (!func) + func = "(unknown)"; +#if defined(S_LOCK_TEST) + fprintf(stderr, + "\nStuck spinlock detected at %s, %s:%d.\n", + func, file, line); + exit(1); +#else + elog(PANIC, "stuck spinlock detected at %s, %s:%d", + func, file, line); +#endif +} + +/* + * s_lock(lock) - platform-independent portion of waiting for a spinlock. + */ +int +s_lock(volatile slock_t *lock, const char *file, int line, const char *func) +{ + SpinDelayStatus delayStatus; + + init_spin_delay(&delayStatus, file, line, func); + + while (TAS_SPIN(lock)) + { + perform_spin_delay(&delayStatus); + } + + finish_spin_delay(&delayStatus); + + return delayStatus.delays; +} + +#ifdef USE_DEFAULT_S_UNLOCK +void +s_unlock(volatile slock_t *lock) +{ +#ifdef TAS_ACTIVE_WORD + /* HP's PA-RISC */ + *TAS_ACTIVE_WORD(lock) = -1; +#else + *lock = 0; +#endif +} +#endif + +/* + * Wait while spinning on a contended spinlock. + */ +void +perform_spin_delay(SpinDelayStatus *status) +{ + /* CPU-specific delay each time through the loop */ + SPIN_DELAY(); + + /* Block the process every spins_per_delay tries */ + if (++(status->spins) >= spins_per_delay) + { + if (++(status->delays) > NUM_DELAYS) + s_lock_stuck(status->file, status->line, status->func); + + if (status->cur_delay == 0) /* first time to delay? */ + status->cur_delay = MIN_DELAY_USEC; + + pg_usleep(status->cur_delay); + +#if defined(S_LOCK_TEST) + fprintf(stdout, "*"); + fflush(stdout); +#endif + + /* increase delay by a random fraction between 1X and 2X */ + status->cur_delay += (int) (status->cur_delay * + ((double) random() / (double) MAX_RANDOM_VALUE) + 0.5); + /* wrap back to minimum delay when max is exceeded */ + if (status->cur_delay > MAX_DELAY_USEC) + status->cur_delay = MIN_DELAY_USEC; + + status->spins = 0; + } +} + +/* + * After acquiring a spinlock, update estimates about how long to loop. + * + * If we were able to acquire the lock without delaying, it's a good + * indication we are in a multiprocessor. If we had to delay, it's a sign + * (but not a sure thing) that we are in a uniprocessor. Hence, we + * decrement spins_per_delay slowly when we had to delay, and increase it + * rapidly when we didn't. It's expected that spins_per_delay will + * converge to the minimum value on a uniprocessor and to the maximum + * value on a multiprocessor. + * + * Note: spins_per_delay is local within our current process. We want to + * average these observations across multiple backends, since it's + * relatively rare for this function to even get entered, and so a single + * backend might not live long enough to converge on a good value. That + * is handled by the two routines below. + */ +void +finish_spin_delay(SpinDelayStatus *status) +{ + if (status->cur_delay == 0) + { + /* we never had to delay */ + if (spins_per_delay < MAX_SPINS_PER_DELAY) + spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY); + } + else + { + if (spins_per_delay > MIN_SPINS_PER_DELAY) + spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY); + } +} + +/* + * Set local copy of spins_per_delay during backend startup. + * + * NB: this has to be pretty fast as it is called while holding a spinlock + */ +void +set_spins_per_delay(int shared_spins_per_delay) +{ + spins_per_delay = shared_spins_per_delay; +} + +/* + * Update shared estimate of spins_per_delay during backend exit. + * + * NB: this has to be pretty fast as it is called while holding a spinlock + */ +int +update_spins_per_delay(int shared_spins_per_delay) +{ + /* + * We use an exponential moving average with a relatively slow adaption + * rate, so that noise in any one backend's result won't affect the shared + * value too much. As long as both inputs are within the allowed range, + * the result must be too, so we need not worry about clamping the result. + * + * We deliberately truncate rather than rounding; this is so that single + * adjustments inside a backend can affect the shared estimate (see the + * asymmetric adjustment rules above). + */ + return (shared_spins_per_delay * 15 + spins_per_delay) / 16; +} + + +/* + * Various TAS implementations that cannot live in s_lock.h as no inline + * definition exists (yet). + * In the future, get rid of tas.[cso] and fold it into this file. + * + * If you change something here, you will likely need to modify s_lock.h too, + * because the definitions for these are split between this file and s_lock.h. + */ + + +#ifdef HAVE_SPINLOCKS /* skip spinlocks if requested */ + + +#if defined(__GNUC__) + +/* + * All the gcc flavors that are not inlined + */ + + +/* + * Note: all the if-tests here probably ought to be testing gcc version + * rather than platform, but I don't have adequate info to know what to + * write. Ideally we'd flush all this in favor of the inline version. + */ +#if defined(__m68k__) && !defined(__linux__) +/* really means: extern int tas(slock_t* **lock); */ +static void +tas_dummy() +{ + __asm__ __volatile__( +#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(__ELF__) +/* no underscore for label and % for registers */ + "\ +.global tas \n\ +tas: \n\ + movel %sp@(0x4),%a0 \n\ + tas %a0@ \n\ + beq _success \n\ + moveq #-128,%d0 \n\ + rts \n\ +_success: \n\ + moveq #0,%d0 \n\ + rts \n" +#else + "\ +.global _tas \n\ +_tas: \n\ + movel sp@(0x4),a0 \n\ + tas a0@ \n\ + beq _success \n\ + moveq #-128,d0 \n\ + rts \n\ +_success: \n\ + moveq #0,d0 \n\ + rts \n" +#endif /* (__NetBSD__ || __OpenBSD__) && __ELF__ */ + ); +} +#endif /* __m68k__ && !__linux__ */ +#endif /* not __GNUC__ */ +#endif /* HAVE_SPINLOCKS */ + + + +/*****************************************************************************/ +#if defined(S_LOCK_TEST) + +/* + * test program for verifying a port's spinlock support. + */ + +struct test_lock_struct +{ + char pad1; + slock_t lock; + char pad2; +}; + +volatile struct test_lock_struct test_lock; + +int +main() +{ + srandom((unsigned int) time(NULL)); + + test_lock.pad1 = test_lock.pad2 = 0x44; + + S_INIT_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (!S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not initialized\n"); + return 1; + } + + S_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not locked\n"); + return 1; + } + + S_UNLOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (!S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not unlocked\n"); + return 1; + } + + S_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not re-locked\n"); + return 1; + } + + printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS); + printf(" exit with a 'stuck spinlock' message\n"); + printf(" if S_LOCK() and TAS() are working.\n"); + fflush(stdout); + + s_lock(&test_lock.lock, __FILE__, __LINE__); + + printf("S_LOCK_TEST: failed, lock not locked\n"); + return 1; +} + +#endif /* S_LOCK_TEST */ diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c new file mode 100644 index 0000000..557672c --- /dev/null +++ b/src/backend/storage/lmgr/spin.c @@ -0,0 +1,180 @@ +/*------------------------------------------------------------------------- + * + * spin.c + * Hardware-independent implementation of spinlocks. + * + * + * For machines that have test-and-set (TAS) instructions, s_lock.h/.c + * define the spinlock implementation. This file contains only a stub + * implementation for spinlocks using PGSemaphores. Unless semaphores + * are implemented in a way that doesn't involve a kernel call, this + * is too slow to be very useful :-( + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/spin.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/pg_sema.h" +#include "storage/shmem.h" +#include "storage/spin.h" + + +#ifndef HAVE_SPINLOCKS + +/* + * No TAS, so spinlocks are implemented as PGSemaphores. + */ + +#ifndef HAVE_ATOMICS +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES) +#else +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES) +#endif /* DISABLE_ATOMICS */ + +PGSemaphore *SpinlockSemaArray; + +#else /* !HAVE_SPINLOCKS */ + +#define NUM_EMULATION_SEMAPHORES 0 + +#endif /* HAVE_SPINLOCKS */ + +/* + * Report the amount of shared memory needed to store semaphores for spinlock + * support. + */ +Size +SpinlockSemaSize(void) +{ + return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore); +} + +/* + * Report number of semaphores needed to support spinlocks. + */ +int +SpinlockSemas(void) +{ + return NUM_EMULATION_SEMAPHORES; +} + +#ifndef HAVE_SPINLOCKS + +/* + * Initialize spinlock emulation. + * + * This must be called after PGReserveSemaphores(). + */ +void +SpinlockSemaInit(void) +{ + PGSemaphore *spinsemas; + int nsemas = SpinlockSemas(); + int i; + + /* + * We must use ShmemAllocUnlocked(), since the spinlock protecting + * ShmemAlloc() obviously can't be ready yet. + */ + spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize()); + for (i = 0; i < nsemas; ++i) + spinsemas[i] = PGSemaphoreCreate(); + SpinlockSemaArray = spinsemas; +} + +/* + * s_lock.h hardware-spinlock emulation using semaphores + * + * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to + * map multiple spinlocks onto one semaphore because no process should ever + * hold more than one at a time. We just need enough semaphores so that we + * aren't adding too much extra contention from that. + * + * There is one exception to the restriction of only holding one spinlock at a + * time, which is that it's ok if emulated atomic operations are nested inside + * spinlocks. To avoid the danger of spinlocks and atomic using the same sema, + * we make sure "normal" spinlocks and atomics backed by spinlocks use + * distinct semaphores (see the nested argument to s_init_lock_sema). + * + * slock_t is just an int for this implementation; it holds the spinlock + * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0 + * is not a valid value, so that testing with this code can help find + * failures to initialize spinlocks. + */ + +static inline void +s_check_valid(int lockndx) +{ + if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES)) + elog(ERROR, "invalid spinlock number: %d", lockndx); +} + +void +s_init_lock_sema(volatile slock_t *lock, bool nested) +{ + static uint32 counter = 0; + uint32 offset; + uint32 sema_total; + uint32 idx; + + if (nested) + { + /* + * To allow nesting atomics inside spinlocked sections, use a + * different spinlock. See comment above. + */ + offset = 1 + NUM_SPINLOCK_SEMAPHORES; + sema_total = NUM_ATOMICS_SEMAPHORES; + } + else + { + offset = 1; + sema_total = NUM_SPINLOCK_SEMAPHORES; + } + + idx = (counter++ % sema_total) + offset; + + /* double check we did things correctly */ + s_check_valid(idx); + + *lock = idx; +} + +void +s_unlock_sema(volatile slock_t *lock) +{ + int lockndx = *lock; + + s_check_valid(lockndx); + + PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]); +} + +bool +s_lock_free_sema(volatile slock_t *lock) +{ + /* We don't currently use S_LOCK_FREE anyway */ + elog(ERROR, "spin.c does not support S_LOCK_FREE()"); + return false; +} + +int +tas_sema(volatile slock_t *lock) +{ + int lockndx = *lock; + + s_check_valid(lockndx); + + /* Note that TAS macros return 0 if *success* */ + return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]); +} + +#endif /* !HAVE_SPINLOCKS */ |