From 293913568e6a7a86fd1479e1cff8e2ecb58d6568 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 13 Apr 2024 15:44:03 +0200 Subject: Adding upstream version 16.2. Signed-off-by: Daniel Baumann --- src/backend/storage/Makefile | 13 + src/backend/storage/buffer/Makefile | 22 + src/backend/storage/buffer/README | 276 ++ src/backend/storage/buffer/buf_init.c | 186 + src/backend/storage/buffer/buf_table.c | 162 + src/backend/storage/buffer/bufmgr.c | 5594 ++++++++++++++++++++++ src/backend/storage/buffer/freelist.c | 774 +++ src/backend/storage/buffer/localbuf.c | 821 ++++ src/backend/storage/buffer/meson.build | 9 + src/backend/storage/file/Makefile | 23 + src/backend/storage/file/buffile.c | 1039 ++++ src/backend/storage/file/copydir.c | 216 + src/backend/storage/file/fd.c | 3976 +++++++++++++++ src/backend/storage/file/fileset.c | 205 + src/backend/storage/file/meson.build | 10 + src/backend/storage/file/reinit.c | 422 ++ src/backend/storage/file/sharedfileset.c | 120 + src/backend/storage/freespace/Makefile | 20 + src/backend/storage/freespace/README | 196 + src/backend/storage/freespace/freespace.c | 865 ++++ src/backend/storage/freespace/fsmpage.c | 374 ++ src/backend/storage/freespace/indexfsm.c | 74 + src/backend/storage/freespace/meson.build | 7 + src/backend/storage/ipc/Makefile | 29 + src/backend/storage/ipc/barrier.c | 333 ++ src/backend/storage/ipc/dsm.c | 1257 +++++ src/backend/storage/ipc/dsm_impl.c | 1053 ++++ src/backend/storage/ipc/ipc.c | 439 ++ src/backend/storage/ipc/ipci.c | 354 ++ src/backend/storage/ipc/latch.c | 2268 +++++++++ src/backend/storage/ipc/meson.build | 21 + src/backend/storage/ipc/pmsignal.c | 462 ++ src/backend/storage/ipc/procarray.c | 5224 ++++++++++++++++++++ src/backend/storage/ipc/procsignal.c | 688 +++ src/backend/storage/ipc/shm_mq.c | 1329 +++++ src/backend/storage/ipc/shm_toc.c | 272 ++ src/backend/storage/ipc/shmem.c | 584 +++ src/backend/storage/ipc/signalfuncs.c | 318 ++ src/backend/storage/ipc/sinval.c | 205 + src/backend/storage/ipc/sinvaladt.c | 791 +++ src/backend/storage/ipc/standby.c | 1518 ++++++ src/backend/storage/large_object/Makefile | 18 + src/backend/storage/large_object/inv_api.c | 954 ++++ src/backend/storage/large_object/meson.build | 5 + src/backend/storage/lmgr/.gitignore | 3 + src/backend/storage/lmgr/Makefile | 52 + src/backend/storage/lmgr/README | 731 +++ src/backend/storage/lmgr/README-SSI | 646 +++ src/backend/storage/lmgr/README.barrier | 197 + src/backend/storage/lmgr/condition_variable.c | 360 ++ src/backend/storage/lmgr/deadlock.c | 1159 +++++ src/backend/storage/lmgr/generate-lwlocknames.pl | 77 + src/backend/storage/lmgr/lmgr.c | 1270 +++++ src/backend/storage/lmgr/lock.c | 4651 ++++++++++++++++++ src/backend/storage/lmgr/lwlock.c | 1973 ++++++++ src/backend/storage/lmgr/lwlocknames.c | 52 + src/backend/storage/lmgr/lwlocknames.h | 50 + src/backend/storage/lmgr/lwlocknames.txt | 55 + src/backend/storage/lmgr/meson.build | 15 + src/backend/storage/lmgr/predicate.c | 4997 +++++++++++++++++++ src/backend/storage/lmgr/proc.c | 1897 ++++++++ src/backend/storage/lmgr/s_lock.c | 324 ++ src/backend/storage/lmgr/spin.c | 180 + src/backend/storage/meson.build | 11 + src/backend/storage/page/Makefile | 23 + src/backend/storage/page/README | 64 + src/backend/storage/page/bufpage.c | 1549 ++++++ src/backend/storage/page/checksum.c | 22 + src/backend/storage/page/itemptr.c | 131 + src/backend/storage/page/meson.build | 7 + src/backend/storage/smgr/Makefile | 19 + src/backend/storage/smgr/README | 52 + 
src/backend/storage/smgr/md.c | 1623 +++++++ src/backend/storage/smgr/meson.build | 6 + src/backend/storage/smgr/smgr.c | 767 +++ src/backend/storage/sync/Makefile | 18 + src/backend/storage/sync/meson.build | 6 + src/backend/storage/sync/sync.c | 624 +++ 78 files changed, 57137 insertions(+) create mode 100644 src/backend/storage/Makefile create mode 100644 src/backend/storage/buffer/Makefile create mode 100644 src/backend/storage/buffer/README create mode 100644 src/backend/storage/buffer/buf_init.c create mode 100644 src/backend/storage/buffer/buf_table.c create mode 100644 src/backend/storage/buffer/bufmgr.c create mode 100644 src/backend/storage/buffer/freelist.c create mode 100644 src/backend/storage/buffer/localbuf.c create mode 100644 src/backend/storage/buffer/meson.build create mode 100644 src/backend/storage/file/Makefile create mode 100644 src/backend/storage/file/buffile.c create mode 100644 src/backend/storage/file/copydir.c create mode 100644 src/backend/storage/file/fd.c create mode 100644 src/backend/storage/file/fileset.c create mode 100644 src/backend/storage/file/meson.build create mode 100644 src/backend/storage/file/reinit.c create mode 100644 src/backend/storage/file/sharedfileset.c create mode 100644 src/backend/storage/freespace/Makefile create mode 100644 src/backend/storage/freespace/README create mode 100644 src/backend/storage/freespace/freespace.c create mode 100644 src/backend/storage/freespace/fsmpage.c create mode 100644 src/backend/storage/freespace/indexfsm.c create mode 100644 src/backend/storage/freespace/meson.build create mode 100644 src/backend/storage/ipc/Makefile create mode 100644 src/backend/storage/ipc/barrier.c create mode 100644 src/backend/storage/ipc/dsm.c create mode 100644 src/backend/storage/ipc/dsm_impl.c create mode 100644 src/backend/storage/ipc/ipc.c create mode 100644 src/backend/storage/ipc/ipci.c create mode 100644 src/backend/storage/ipc/latch.c create mode 100644 src/backend/storage/ipc/meson.build create mode 100644 src/backend/storage/ipc/pmsignal.c create mode 100644 src/backend/storage/ipc/procarray.c create mode 100644 src/backend/storage/ipc/procsignal.c create mode 100644 src/backend/storage/ipc/shm_mq.c create mode 100644 src/backend/storage/ipc/shm_toc.c create mode 100644 src/backend/storage/ipc/shmem.c create mode 100644 src/backend/storage/ipc/signalfuncs.c create mode 100644 src/backend/storage/ipc/sinval.c create mode 100644 src/backend/storage/ipc/sinvaladt.c create mode 100644 src/backend/storage/ipc/standby.c create mode 100644 src/backend/storage/large_object/Makefile create mode 100644 src/backend/storage/large_object/inv_api.c create mode 100644 src/backend/storage/large_object/meson.build create mode 100644 src/backend/storage/lmgr/.gitignore create mode 100644 src/backend/storage/lmgr/Makefile create mode 100644 src/backend/storage/lmgr/README create mode 100644 src/backend/storage/lmgr/README-SSI create mode 100644 src/backend/storage/lmgr/README.barrier create mode 100644 src/backend/storage/lmgr/condition_variable.c create mode 100644 src/backend/storage/lmgr/deadlock.c create mode 100644 src/backend/storage/lmgr/generate-lwlocknames.pl create mode 100644 src/backend/storage/lmgr/lmgr.c create mode 100644 src/backend/storage/lmgr/lock.c create mode 100644 src/backend/storage/lmgr/lwlock.c create mode 100644 src/backend/storage/lmgr/lwlocknames.c create mode 100644 src/backend/storage/lmgr/lwlocknames.h create mode 100644 src/backend/storage/lmgr/lwlocknames.txt create mode 100644 
src/backend/storage/lmgr/meson.build create mode 100644 src/backend/storage/lmgr/predicate.c create mode 100644 src/backend/storage/lmgr/proc.c create mode 100644 src/backend/storage/lmgr/s_lock.c create mode 100644 src/backend/storage/lmgr/spin.c create mode 100644 src/backend/storage/meson.build create mode 100644 src/backend/storage/page/Makefile create mode 100644 src/backend/storage/page/README create mode 100644 src/backend/storage/page/bufpage.c create mode 100644 src/backend/storage/page/checksum.c create mode 100644 src/backend/storage/page/itemptr.c create mode 100644 src/backend/storage/page/meson.build create mode 100644 src/backend/storage/smgr/Makefile create mode 100644 src/backend/storage/smgr/README create mode 100644 src/backend/storage/smgr/md.c create mode 100644 src/backend/storage/smgr/meson.build create mode 100644 src/backend/storage/smgr/smgr.c create mode 100644 src/backend/storage/sync/Makefile create mode 100644 src/backend/storage/sync/meson.build create mode 100644 src/backend/storage/sync/sync.c (limited to 'src/backend/storage') diff --git a/src/backend/storage/Makefile b/src/backend/storage/Makefile new file mode 100644 index 0000000..8376cdf --- /dev/null +++ b/src/backend/storage/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the storage manager subsystem +# +# src/backend/storage/Makefile +# + +subdir = src/backend/storage +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global + +SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile new file mode 100644 index 0000000..fd7c40d --- /dev/null +++ b/src/backend/storage/buffer/Makefile @@ -0,0 +1,22 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/buffer +# +# IDENTIFICATION +# src/backend/storage/buffer/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/buffer +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + buf_init.o \ + buf_table.o \ + bufmgr.o \ + freelist.o \ + localbuf.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README new file mode 100644 index 0000000..011af7a --- /dev/null +++ b/src/backend/storage/buffer/README @@ -0,0 +1,276 @@ +src/backend/storage/buffer/README + +Notes About Shared Buffer Access Rules +====================================== + +There are two separate access control mechanisms for shared disk buffers: +reference counts (a/k/a pin counts) and buffer content locks. (Actually, +there's a third level of access control: one must hold the appropriate kind +of lock on a relation before one can legally access any page belonging to +the relation. Relation-level locks are not discussed here.) + +Pins: one must "hold a pin on" a buffer (increment its reference count) +before being allowed to do anything at all with it. An unpinned buffer is +subject to being reclaimed and reused for a different page at any instant, +so touching it is unsafe. Normally a pin is acquired via ReadBuffer and +released via ReleaseBuffer. It is OK and indeed common for a single +backend to pin a page more than once concurrently; the buffer manager +handles this efficiently. 
It is considered OK to hold a pin for long +intervals --- for example, sequential scans hold a pin on the current page +until done processing all the tuples on the page, which could be quite a +while if the scan is the outer scan of a join. Similarly, a btree index +scan may hold a pin on the current index page. This is OK because normal +operations never wait for a page's pin count to drop to zero. (Anything +that might need to do such a wait is instead handled by waiting to obtain +the relation-level lock, which is why you'd better hold one first.) Pins +may not be held across transaction boundaries, however. + +Buffer content locks: there are two kinds of buffer lock, shared and exclusive, +which act just as you'd expect: multiple backends can hold shared locks on +the same buffer, but an exclusive lock prevents anyone else from holding +either shared or exclusive lock. (These can alternatively be called READ +and WRITE locks.) These locks are intended to be short-term: they should not +be held for long. Buffer locks are acquired and released by LockBuffer(). +It will *not* work for a single backend to try to acquire multiple locks on +the same buffer. One must pin a buffer before trying to lock it. + +Buffer access rules: + +1. To scan a page for tuples, one must hold a pin and either shared or +exclusive content lock. To examine the commit status (XIDs and status bits) +of a tuple in a shared buffer, one must likewise hold a pin and either shared +or exclusive lock. + +2. Once one has determined that a tuple is interesting (visible to the +current transaction) one may drop the content lock, yet continue to access +the tuple's data for as long as one holds the buffer pin. This is what is +typically done by heap scans, since the tuple returned by heap_fetch +contains a pointer to tuple data in the shared buffer. Therefore the +tuple cannot go away while the pin is held (see rule #5). Its state could +change, but that is assumed not to matter after the initial determination +of visibility is made. + +3. To add a tuple or change the xmin/xmax fields of an existing tuple, +one must hold a pin and an exclusive content lock on the containing buffer. +This ensures that no one else might see a partially-updated state of the +tuple while they are doing visibility checks. + +4. It is considered OK to update tuple commit status bits (ie, OR the +values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or +HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and +pin on a buffer. This is OK because another backend looking at the tuple +at about the same time would OR the same bits into the field, so there +is little or no risk of conflicting update; what's more, if there did +manage to be a conflict it would merely mean that one bit-update would +be lost and need to be done again later. These four bits are only hints +(they cache the results of transaction status lookups in pg_xact), so no +great harm is done if they get reset to zero by conflicting updates. +Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID +and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires +an exclusive buffer lock (and it must also be WAL-logged). + +5. To physically remove a tuple or compact free space on a page, one +must hold a pin and an exclusive lock, *and* observe while holding the +exclusive lock that the buffer's shared reference count is one (ie, +no other backend holds a pin). 
If these conditions are met then no other +backend can perform a page scan until the exclusive lock is dropped, and +no other backend can be holding a reference to an existing tuple that it +might expect to examine again. Note that another backend might pin the +buffer (increment the refcount) while one is performing the cleanup, but +it won't be able to actually examine the page until it acquires shared +or exclusive content lock. + + +Obtaining the lock needed under rule #5 is done by the bufmgr routines +LockBufferForCleanup() or ConditionalLockBufferForCleanup(). They first get +an exclusive lock and then check to see if the shared pin count is currently +1. If not, ConditionalLockBufferForCleanup() releases the exclusive lock and +then returns false, while LockBufferForCleanup() releases the exclusive lock +(but not the caller's pin) and waits until signaled by another backend, +whereupon it tries again. The signal will occur when UnpinBuffer decrements +the shared pin count to 1. As indicated above, this operation might have to +wait a good while before it acquires the lock, but that shouldn't matter much +for concurrent VACUUM. The current implementation only supports a single +waiter for pin-count-1 on any particular shared buffer. This is enough for +VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single +relation anyway. Anyone wishing to obtain a cleanup lock outside of recovery +or a VACUUM must use the conditional variant of the function. + + +Buffer Manager's Internal Locking +--------------------------------- + +Before PostgreSQL 8.1, all operations of the shared buffer manager itself +were protected by a single system-wide lock, the BufMgrLock, which +unsurprisingly proved to be a source of contention. The new locking scheme +avoids grabbing system-wide exclusive locks in common code paths. It works +like this: + +* There is a system-wide LWLock, the BufMappingLock, that notionally +protects the mapping from buffer tags (page identifiers) to buffers. +(Physically, it can be thought of as protecting the hash table maintained +by buf_table.c.) To look up whether a buffer exists for a tag, it is +sufficient to obtain share lock on the BufMappingLock. Note that one +must pin the found buffer, if any, before releasing the BufMappingLock. +To alter the page assignment of any buffer, one must hold exclusive lock +on the BufMappingLock. This lock must be held across adjusting the buffer's +header fields and changing the buf_table hash table. The only common +operation that needs exclusive lock is reading in a page that was not +in shared buffers already, which will require at least a kernel call +and usually a wait for I/O, so it will be slow anyway. + +* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS +separate locks, each guarding a portion of the buffer tag space. This allows +further reduction of contention in the normal code paths. The partition +that a particular buffer tag belongs to is determined from the low-order +bits of the tag's hash value. The rules stated above apply to each partition +independently. If it is necessary to lock more than one partition at a time, +they must be locked in partition-number order to avoid risk of deadlock. + +* A separate system-wide spinlock, buffer_strategy_lock, provides mutual +exclusion for operations that access the buffer free list or select +buffers for replacement. 
A spinlock is used here rather than a lightweight +lock for efficiency; no other locks of any sort should be acquired while +buffer_strategy_lock is held. This is essential to allow buffer replacement +to happen in multiple backends with reasonable concurrency. + +* Each buffer header contains a spinlock that must be taken when examining +or changing fields of that buffer header. This allows operations such as +ReleaseBuffer to make local state changes without taking any system-wide +lock. We use a spinlock, not an LWLock, since there are no cases where +the lock needs to be held for more than a few instructions. + +Note that a buffer header's spinlock does not control access to the data +held within the buffer. Each buffer header also contains an LWLock, the +"buffer content lock", that *does* represent the right to access the data +in the buffer. It is used per the rules above. + +* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a +buffer to complete (and in releases before 14, it was accompanied by a +per-buffer LWLock). The process doing a read or write sets the flag for the +duration, and processes that need to wait for it to be cleared sleep on a +condition variable. + + +Normal Buffer Replacement Strategy +---------------------------------- + +There is a "free list" of buffers that are prime candidates for replacement. +In particular, buffers that are completely free (contain no valid page) are +always in this list. We could also throw buffers into this list if we +consider their pages unlikely to be needed soon; however, the current +algorithm never does that. The list is singly-linked using fields in the +buffer headers; we maintain head and tail pointers in global variables. +(Note: although the list links are in the buffer headers, they are +considered to be protected by the buffer_strategy_lock, not the buffer-header +spinlocks.) To choose a victim buffer to recycle when there are no free +buffers available, we use a simple clock-sweep algorithm, which avoids the +need to take system-wide locks during common operations. It works like +this: + +Each buffer header contains a usage counter, which is incremented (up to a +small limit value) whenever the buffer is pinned. (This requires only the +buffer header spinlock, which would have to be taken anyway to increment the +buffer reference count, so it's nearly free.) + +The "clock hand" is a buffer index, nextVictimBuffer, that moves circularly +through all the available buffers. nextVictimBuffer is protected by the +buffer_strategy_lock. + +The algorithm for a process that needs to obtain a victim buffer is: + +1. Obtain buffer_strategy_lock. + +2. If buffer free list is nonempty, remove its head buffer. Release +buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count, +it cannot be used; ignore it go back to step 1. Otherwise, pin the buffer, +and return it. + +3. Otherwise, the buffer free list is empty. Select the buffer pointed to by +nextVictimBuffer, and circularly advance nextVictimBuffer for next time. +Release buffer_strategy_lock. + +4. If the selected buffer is pinned or has a nonzero usage count, it cannot +be used. Decrement its usage count (if nonzero), reacquire +buffer_strategy_lock, and return to step 3 to examine the next buffer. + +5. Pin the selected buffer, and return. + +(Note that if the selected buffer is dirty, we will have to write it out +before we can recycle it; if someone else pins the buffer meanwhile we will +have to give up and try another buffer. 
This however is not a concern +of the basic select-a-victim-buffer algorithm.) + + +Buffer Ring Replacement Strategy +--------------------------------- + +When running a query that needs to access a large number of pages just once, +such as VACUUM or a large sequential scan, a different strategy is used. +A page that has been touched only by such a scan is unlikely to be needed +again soon, so instead of running the normal clock sweep algorithm and +blowing out the entire buffer cache, a small ring of buffers is allocated +using the normal clock sweep algorithm and those buffers are reused for the +whole scan. This also implies that much of the write traffic caused by such +a statement will be done by the backend itself and not pushed off onto other +processes. + +For sequential scans, a 256KB ring is used. That's small enough to fit in L2 +cache, which makes transferring pages from OS cache to shared buffer cache +efficient. Even less would often be enough, but the ring must be big enough +to accommodate all pages in the scan that are pinned concurrently. 256KB +should also be enough to leave a small cache trail for other backends to +join in a synchronized seq scan. If a ring buffer is dirtied and its LSN +updated, we would normally have to write and flush WAL before we could +re-use the buffer; in this case we instead discard the buffer from the ring +and (later) choose a replacement using the normal clock-sweep algorithm. +Hence this strategy works best for scans that are read-only (or at worst +update hint bits). In a scan that modifies every page in the scan, like a +bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and +the ring strategy effectively degrades to the normal strategy. + +VACUUM uses a ring like sequential scans, however, the size of this ring is +controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed +from the ring. Instead, WAL is flushed if needed to allow reuse of the +buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers +were sent to the freelist, which was effectively a buffer ring of 1 buffer, +resulting in excessive WAL flushing. + +Bulk writes work similarly to VACUUM. Currently this applies only to +COPY IN and CREATE TABLE AS SELECT. (Might it be interesting to make +seqscan UPDATE and DELETE use the bulkwrite strategy?) For bulk writes +we use a ring size of 16MB (but not more than 1/8th of shared_buffers). +Smaller sizes have been shown to result in the COPY blocking too often +for WAL flushes. While it's okay for a background vacuum to be slowed by +doing its own WAL flushing, we'd prefer that COPY not be subject to that, +so we let it use up a bit more of the buffer arena. + + +Background Writer's Processing +------------------------------ + +The background writer is designed to write out pages that are likely to be +recycled soon, thereby offloading the writing work from active backends. +To do this, it scans forward circularly from the current position of +nextVictimBuffer (which it does not change!), looking for buffers that are +dirty and not pinned nor marked with a positive usage count. It pins, +writes, and releases any such buffer. + +If we can assume that reading nextVictimBuffer is an atomic action, then +the writer doesn't even need to take buffer_strategy_lock in order to look +for buffers to write; it needs only to spinlock each buffer header for long +enough to check the dirtybit. 
Even without that assumption, the writer +only needs to take the lock long enough to read the variable value, not +while scanning the buffers. (This is a very substantial improvement in +the contention cost of the writer compared to PG 8.0.) + +The background writer takes shared content lock on a buffer while writing it +out (and anyone else who flushes buffer contents to disk must do so too). +This ensures that the page image transferred to disk is reasonably consistent. +We might miss a hint-bit update or two but that isn't a problem, for the same +reasons mentioned under buffer access rules. + +As of 8.4, background writer starts during recovery mode when there is +some form of potentially extended recovery to perform. It performs an +identical service to normal processing, except that checkpoints it +writes are technically restartpoints. diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c new file mode 100644 index 0000000..0057443 --- /dev/null +++ b/src/backend/storage/buffer/buf_init.c @@ -0,0 +1,186 @@ +/*------------------------------------------------------------------------- + * + * buf_init.c + * buffer manager initialization routines + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/buf_init.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" + +BufferDescPadded *BufferDescriptors; +char *BufferBlocks; +ConditionVariableMinimallyPadded *BufferIOCVArray; +WritebackContext BackendWritebackContext; +CkptSortItem *CkptBufferIds; + + +/* + * Data Structures: + * buffers live in a freelist and a lookup data structure. + * + * + * Buffer Lookup: + * Two important notes. First, the buffer has to be + * available for lookup BEFORE an IO begins. Otherwise + * a second process trying to read the buffer will + * allocate its own copy and the buffer pool will + * become inconsistent. + * + * Buffer Replacement: + * see freelist.c. A buffer cannot be replaced while in + * use either by data manager or during IO. + * + * + * Synchronization/Locking: + * + * IO_IN_PROGRESS -- this is a flag in the buffer descriptor. + * It must be set when an IO is initiated and cleared at + * the end of the IO. It is there to make sure that one + * process doesn't start to use a buffer while another is + * faulting it in. see WaitIO and related routines. + * + * refcount -- Counts the number of processes holding pins on a buffer. + * A buffer is pinned during IO and immediately after a BufferAlloc(). + * Pins must be released before end of transaction. For efficiency the + * shared refcount isn't increased if an individual backend pins a buffer + * multiple times. Check the PrivateRefCount infrastructure in bufmgr.c. + */ + + +/* + * Initialize shared buffer pool + * + * This is called once during shared-memory initialization (either in the + * postmaster, or in a standalone backend). + */ +void +InitBufferPool(void) +{ + bool foundBufs, + foundDescs, + foundIOCV, + foundBufCkpt; + + /* Align descriptors to a cacheline boundary. */ + BufferDescriptors = (BufferDescPadded *) + ShmemInitStruct("Buffer Descriptors", + NBuffers * sizeof(BufferDescPadded), + &foundDescs); + + /* Align buffer pool on IO page size boundary. 
*/ + BufferBlocks = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + ShmemInitStruct("Buffer Blocks", + NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE, + &foundBufs)); + + /* Align condition variables to cacheline boundary. */ + BufferIOCVArray = (ConditionVariableMinimallyPadded *) + ShmemInitStruct("Buffer IO Condition Variables", + NBuffers * sizeof(ConditionVariableMinimallyPadded), + &foundIOCV); + + /* + * The array used to sort to-be-checkpointed buffer ids is located in + * shared memory, to avoid having to allocate significant amounts of + * memory at runtime. As that'd be in the middle of a checkpoint, or when + * the checkpointer is restarted, memory allocation failures would be + * painful. + */ + CkptBufferIds = (CkptSortItem *) + ShmemInitStruct("Checkpoint BufferIds", + NBuffers * sizeof(CkptSortItem), &foundBufCkpt); + + if (foundDescs || foundBufs || foundIOCV || foundBufCkpt) + { + /* should find all of these, or none of them */ + Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt); + /* note: this path is only taken in EXEC_BACKEND case */ + } + else + { + int i; + + /* + * Initialize all the buffer headers. + */ + for (i = 0; i < NBuffers; i++) + { + BufferDesc *buf = GetBufferDescriptor(i); + + ClearBufferTag(&buf->tag); + + pg_atomic_init_u32(&buf->state, 0); + buf->wait_backend_pgprocno = INVALID_PGPROCNO; + + buf->buf_id = i; + + /* + * Initially link all the buffers together as unused. Subsequent + * management of this list is done by freelist.c. + */ + buf->freeNext = i + 1; + + LWLockInitialize(BufferDescriptorGetContentLock(buf), + LWTRANCHE_BUFFER_CONTENT); + + ConditionVariableInit(BufferDescriptorGetIOCV(buf)); + } + + /* Correct last entry of linked list */ + GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST; + } + + /* Init other shared buffer-management stuff */ + StrategyInitialize(!foundDescs); + + /* Initialize per-backend file flush context */ + WritebackContextInit(&BackendWritebackContext, + &backend_flush_after); +} + +/* + * BufferShmemSize + * + * compute the size of shared memory for the buffer pool including + * data pages, buffer descriptors, hash tables, etc. + */ +Size +BufferShmemSize(void) +{ + Size size = 0; + + /* size of buffer descriptors */ + size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded))); + /* to allow aligning buffer descriptors */ + size = add_size(size, PG_CACHE_LINE_SIZE); + + /* size of data pages, plus alignment padding */ + size = add_size(size, PG_IO_ALIGN_SIZE); + size = add_size(size, mul_size(NBuffers, BLCKSZ)); + + /* size of stuff controlled by freelist.c */ + size = add_size(size, StrategyShmemSize()); + + /* size of I/O condition variables */ + size = add_size(size, mul_size(NBuffers, + sizeof(ConditionVariableMinimallyPadded))); + /* to allow aligning the above */ + size = add_size(size, PG_CACHE_LINE_SIZE); + + /* size of checkpoint sort array in bufmgr.c */ + size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem))); + + return size; +} diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c new file mode 100644 index 0000000..2b96639 --- /dev/null +++ b/src/backend/storage/buffer/buf_table.c @@ -0,0 +1,162 @@ +/*------------------------------------------------------------------------- + * + * buf_table.c + * routines for mapping BufferTags to buffer indexes. + * + * Note: the routines in this file do no locking of their own. The caller + * must hold a suitable lock on the appropriate BufMappingLock, as specified + * in the comments. 
We can't do the locking inside these functions because + * in most cases the caller needs to adjust the buffer header contents + * before the lock is released (see notes in README). + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/buf_table.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" + +/* entry for buffer lookup hashtable */ +typedef struct +{ + BufferTag key; /* Tag of a disk page */ + int id; /* Associated buffer ID */ +} BufferLookupEnt; + +static HTAB *SharedBufHash; + + +/* + * Estimate space needed for mapping hashtable + * size is the desired hash table size (possibly more than NBuffers) + */ +Size +BufTableShmemSize(int size) +{ + return hash_estimate_size(size, sizeof(BufferLookupEnt)); +} + +/* + * Initialize shmem hash table for mapping buffers + * size is the desired hash table size (possibly more than NBuffers) + */ +void +InitBufTable(int size) +{ + HASHCTL info; + + /* assume no locking is needed yet */ + + /* BufferTag maps to Buffer */ + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(BufferLookupEnt); + info.num_partitions = NUM_BUFFER_PARTITIONS; + + SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", + size, size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); +} + +/* + * BufTableHashCode + * Compute the hash code associated with a BufferTag + * + * This must be passed to the lookup/insert/delete routines along with the + * tag. We do it like this because the callers need to know the hash code + * in order to determine which buffer partition to lock, and we don't want + * to do the hash computation twice (hash_any is a bit slow). + */ +uint32 +BufTableHashCode(BufferTag *tagPtr) +{ + return get_hash_value(SharedBufHash, (void *) tagPtr); +} + +/* + * BufTableLookup + * Lookup the given BufferTag; return buffer ID, or -1 if not found + * + * Caller must hold at least share lock on BufMappingLock for tag's partition + */ +int +BufTableLookup(BufferTag *tagPtr, uint32 hashcode) +{ + BufferLookupEnt *result; + + result = (BufferLookupEnt *) + hash_search_with_hash_value(SharedBufHash, + tagPtr, + hashcode, + HASH_FIND, + NULL); + + if (!result) + return -1; + + return result->id; +} + +/* + * BufTableInsert + * Insert a hashtable entry for given tag and buffer ID, + * unless an entry already exists for that tag + * + * Returns -1 on successful insertion. If a conflicting entry exists + * already, returns the buffer ID in that entry. 
+ * + * Caller must hold exclusive lock on BufMappingLock for tag's partition + */ +int +BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) +{ + BufferLookupEnt *result; + bool found; + + Assert(buf_id >= 0); /* -1 is reserved for not-in-table */ + Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ + + result = (BufferLookupEnt *) + hash_search_with_hash_value(SharedBufHash, + tagPtr, + hashcode, + HASH_ENTER, + &found); + + if (found) /* found something already in the table */ + return result->id; + + result->id = buf_id; + + return -1; +} + +/* + * BufTableDelete + * Delete the hashtable entry for given tag (which must exist) + * + * Caller must hold exclusive lock on BufMappingLock for tag's partition + */ +void +BufTableDelete(BufferTag *tagPtr, uint32 hashcode) +{ + BufferLookupEnt *result; + + result = (BufferLookupEnt *) + hash_search_with_hash_value(SharedBufHash, + tagPtr, + hashcode, + HASH_REMOVE, + NULL); + + if (!result) /* shouldn't happen */ + elog(ERROR, "shared buffer hash table corrupted"); +} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c new file mode 100644 index 0000000..e066a3f --- /dev/null +++ b/src/backend/storage/buffer/bufmgr.c @@ -0,0 +1,5594 @@ +/*------------------------------------------------------------------------- + * + * bufmgr.c + * buffer manager interface routines + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/bufmgr.c + * + *------------------------------------------------------------------------- + */ +/* + * Principal entry points: + * + * ReadBuffer() -- find or create a buffer holding the requested page, + * and pin it so that no one can destroy it while this process + * is using it. + * + * ReleaseBuffer() -- unpin a buffer + * + * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty". + * The disk write is delayed until buffer replacement or checkpoint. + * + * See also these files: + * freelist.c -- chooses victim for buffer replacement + * buf_table.c -- manages the buffer lookup table + */ +#include "postgres.h" + +#include +#include + +#include "access/tableam.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "catalog/storage.h" +#include "catalog/storage_xlog.h" +#include "executor/instrument.h" +#include "lib/binaryheap.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/smgr.h" +#include "storage/standby.h" +#include "utils/memdebug.h" +#include "utils/ps_status.h" +#include "utils/rel.h" +#include "utils/resowner_private.h" +#include "utils/timestamp.h" + + +/* Note: these two macros only work on shared buffers, not local ones! */ +#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) +#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr))) + +/* Note: this macro only works on local buffers, not shared ones! 
*/ +#define LocalBufHdrGetBlock(bufHdr) \ + LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] + +/* Bits in SyncOneBuffer's return value */ +#define BUF_WRITTEN 0x01 +#define BUF_REUSABLE 0x02 + +#define RELS_BSEARCH_THRESHOLD 20 + +/* + * This is the size (in the number of blocks) above which we scan the + * entire buffer pool to remove the buffers for all the pages of relation + * being dropped. For the relations with size below this threshold, we find + * the buffers by doing lookups in BufMapping table. + */ +#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32) + +typedef struct PrivateRefCountEntry +{ + Buffer buffer; + int32 refcount; +} PrivateRefCountEntry; + +/* 64 bytes, about the size of a cache line on common systems */ +#define REFCOUNT_ARRAY_ENTRIES 8 + +/* + * Status of buffers to checkpoint for a particular tablespace, used + * internally in BufferSync. + */ +typedef struct CkptTsStatus +{ + /* oid of the tablespace */ + Oid tsId; + + /* + * Checkpoint progress for this tablespace. To make progress comparable + * between tablespaces the progress is, for each tablespace, measured as a + * number between 0 and the total number of to-be-checkpointed pages. Each + * page checkpointed in this tablespace increments this space's progress + * by progress_slice. + */ + float8 progress; + float8 progress_slice; + + /* number of to-be checkpointed pages in this tablespace */ + int num_to_scan; + /* already processed pages in this tablespace */ + int num_scanned; + + /* current offset in CkptBufferIds for this tablespace */ + int index; +} CkptTsStatus; + +/* + * Type for array used to sort SMgrRelations + * + * FlushRelationsAllBuffers shares the same comparator function with + * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be + * compatible. + */ +typedef struct SMgrSortArray +{ + RelFileLocator rlocator; /* This must be the first member */ + SMgrRelation srel; +} SMgrSortArray; + +/* GUC variables */ +bool zero_damaged_pages = false; +int bgwriter_lru_maxpages = 100; +double bgwriter_lru_multiplier = 2.0; +bool track_io_timing = false; + +/* + * How many buffers PrefetchBuffer callers should try to stay ahead of their + * ReadBuffer calls by. Zero means "never prefetch". This value is only used + * for buffers not belonging to tablespaces that have their + * effective_io_concurrency parameter set. + */ +int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY; + +/* + * Like effective_io_concurrency, but used by maintenance code paths that might + * benefit from a higher setting because they work on behalf of many sessions. + * Overridden by the tablespace setting of the same name. + */ +int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY; + +/* + * GUC variables about triggering kernel writeback for buffers written; OS + * dependent defaults are set via the GUC mechanism. + */ +int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER; +int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER; +int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER; + +/* local state for LockBufferForCleanup */ +static BufferDesc *PinCountWaitBuf = NULL; + +/* + * Backend-Private refcount management: + * + * Each buffer also has a private refcount that keeps track of the number of + * times the buffer is pinned in the current process. This is so that the + * shared refcount needs to be modified only once if a buffer is pinned more + * than once by an individual backend. 
It's also used to check that no buffers + * are still pinned at the end of transactions and when exiting. + * + * + * To avoid - as we used to - requiring an array with NBuffers entries to keep + * track of local buffers, we use a small sequentially searched array + * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to + * keep track of backend local pins. + * + * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all + * refcounts are kept track of in the array; after that, new array entries + * displace old ones into the hash table. That way a frequently used entry + * can't get "stuck" in the hashtable while infrequent ones clog the array. + * + * Note that in most scenarios the number of pinned buffers will not exceed + * REFCOUNT_ARRAY_ENTRIES. + * + * + * To enter a buffer into the refcount tracking mechanism first reserve a free + * entry using ReservePrivateRefCountEntry() and then later, if necessary, + * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing + * memory allocations in NewPrivateRefCountEntry() which can be important + * because in some scenarios it's called with a spinlock held... + */ +static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]; +static HTAB *PrivateRefCountHash = NULL; +static int32 PrivateRefCountOverflowed = 0; +static uint32 PrivateRefCountClock = 0; +static PrivateRefCountEntry *ReservedRefCountEntry = NULL; + +static void ReservePrivateRefCountEntry(void); +static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer); +static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move); +static inline int32 GetPrivateRefCount(Buffer buffer); +static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); + +/* + * Ensure that the PrivateRefCountArray has sufficient space to store one more + * entry. This has to be called before using NewPrivateRefCountEntry() to fill + * a new entry - but it's perfectly fine to not use a reserved entry. + */ +static void +ReservePrivateRefCountEntry(void) +{ + /* Already reserved (or freed), nothing to do */ + if (ReservedRefCountEntry != NULL) + return; + + /* + * First search for a free entry the array, that'll be sufficient in the + * majority of cases. + */ + { + int i; + + for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + PrivateRefCountEntry *res; + + res = &PrivateRefCountArray[i]; + + if (res->buffer == InvalidBuffer) + { + ReservedRefCountEntry = res; + return; + } + } + } + + /* + * No luck. All array entries are full. Move one array entry into the hash + * table. + */ + { + /* + * Move entry from the current clock position in the array into the + * hashtable. Use that slot. + */ + PrivateRefCountEntry *hashent; + bool found; + + /* select victim slot */ + ReservedRefCountEntry = + &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES]; + + /* Better be used, otherwise we shouldn't get here. */ + Assert(ReservedRefCountEntry->buffer != InvalidBuffer); + + /* enter victim array entry into hashtable */ + hashent = hash_search(PrivateRefCountHash, + &(ReservedRefCountEntry->buffer), + HASH_ENTER, + &found); + Assert(!found); + hashent->refcount = ReservedRefCountEntry->refcount; + + /* clear the now free array slot */ + ReservedRefCountEntry->buffer = InvalidBuffer; + ReservedRefCountEntry->refcount = 0; + + PrivateRefCountOverflowed++; + } +} + +/* + * Fill a previously reserved refcount entry. 
+ */ +static PrivateRefCountEntry * +NewPrivateRefCountEntry(Buffer buffer) +{ + PrivateRefCountEntry *res; + + /* only allowed to be called when a reservation has been made */ + Assert(ReservedRefCountEntry != NULL); + + /* use up the reserved entry */ + res = ReservedRefCountEntry; + ReservedRefCountEntry = NULL; + + /* and fill it */ + res->buffer = buffer; + res->refcount = 0; + + return res; +} + +/* + * Return the PrivateRefCount entry for the passed buffer. + * + * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if + * do_move is true, and the entry resides in the hashtable the entry is + * optimized for frequent access by moving it to the array. + */ +static PrivateRefCountEntry * +GetPrivateRefCountEntry(Buffer buffer, bool do_move) +{ + PrivateRefCountEntry *res; + int i; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); + + /* + * First search for references in the array, that'll be sufficient in the + * majority of cases. + */ + for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + res = &PrivateRefCountArray[i]; + + if (res->buffer == buffer) + return res; + } + + /* + * By here we know that the buffer, if already pinned, isn't residing in + * the array. + * + * Only look up the buffer in the hashtable if we've previously overflowed + * into it. + */ + if (PrivateRefCountOverflowed == 0) + return NULL; + + res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL); + + if (res == NULL) + return NULL; + else if (!do_move) + { + /* caller doesn't want us to move the hash entry into the array */ + return res; + } + else + { + /* move buffer from hashtable into the free array slot */ + bool found; + PrivateRefCountEntry *free; + + /* Ensure there's a free array slot */ + ReservePrivateRefCountEntry(); + + /* Use up the reserved slot */ + Assert(ReservedRefCountEntry != NULL); + free = ReservedRefCountEntry; + ReservedRefCountEntry = NULL; + Assert(free->buffer == InvalidBuffer); + + /* and fill it */ + free->buffer = buffer; + free->refcount = res->refcount; + + /* delete from hashtable */ + hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); + Assert(found); + Assert(PrivateRefCountOverflowed > 0); + PrivateRefCountOverflowed--; + + return free; + } +} + +/* + * Returns how many times the passed buffer is pinned by this backend. + * + * Only works for shared memory buffers! + */ +static inline int32 +GetPrivateRefCount(Buffer buffer) +{ + PrivateRefCountEntry *ref; + + Assert(BufferIsValid(buffer)); + Assert(!BufferIsLocal(buffer)); + + /* + * Not moving the entry - that's ok for the current users, but we might + * want to change this one day. + */ + ref = GetPrivateRefCountEntry(buffer, false); + + if (ref == NULL) + return 0; + return ref->refcount; +} + +/* + * Release resources used to track the reference count of a buffer which we no + * longer have pinned and don't want to pin again immediately. + */ +static void +ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) +{ + Assert(ref->refcount == 0); + + if (ref >= &PrivateRefCountArray[0] && + ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES]) + { + ref->buffer = InvalidBuffer; + + /* + * Mark the just used entry as reserved - in many scenarios that + * allows us to avoid ever having to search the array/hash for free + * entries. 
+ */ + ReservedRefCountEntry = ref; + } + else + { + bool found; + Buffer buffer = ref->buffer; + + hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found); + Assert(found); + Assert(PrivateRefCountOverflowed > 0); + PrivateRefCountOverflowed--; + } +} + +/* + * BufferIsPinned + * True iff the buffer is pinned (also checks for valid buffer number). + * + * NOTE: what we check here is that *this* backend holds a pin on + * the buffer. We do not care whether some other backend does. + */ +#define BufferIsPinned(bufnum) \ +( \ + !BufferIsValid(bufnum) ? \ + false \ + : \ + BufferIsLocal(bufnum) ? \ + (LocalRefCount[-(bufnum) - 1] > 0) \ + : \ + (GetPrivateRefCount(bufnum) > 0) \ +) + + +static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence, + ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, BufferAccessStrategy strategy, + bool *hit); +static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by); +static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by); +static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); +static void PinBuffer_Locked(BufferDesc *buf); +static void UnpinBuffer(BufferDesc *buf); +static void BufferSync(int flags); +static uint32 WaitBufHdrUnlocked(BufferDesc *buf); +static int SyncOneBuffer(int buf_id, bool skip_recently_used, + WritebackContext *wb_context); +static void WaitIO(BufferDesc *buf); +static bool StartBufferIO(BufferDesc *buf, bool forInput); +static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, + uint32 set_flag_bits); +static void shared_buffer_write_error_callback(void *arg); +static void local_buffer_write_error_callback(void *arg); +static BufferDesc *BufferAlloc(SMgrRelation smgr, + char relpersistence, + ForkNumber forkNum, + BlockNumber blockNum, + BufferAccessStrategy strategy, + bool *foundPtr, IOContext io_context); +static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context); +static void FlushBuffer(BufferDesc *buf, SMgrRelation reln, + IOObject io_object, IOContext io_context); +static void FindAndDropRelationBuffers(RelFileLocator rlocator, + ForkNumber forkNum, + BlockNumber nForkBlock, + BlockNumber firstDelBlock); +static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator, + RelFileLocator dstlocator, + ForkNumber forkNum, bool permanent); +static void AtProcExit_Buffers(int code, Datum arg); +static void CheckForBufferLeaks(void); +static int rlocator_comparator(const void *p1, const void *p2); +static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb); +static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b); +static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg); + + +/* + * Implementation of PrefetchBuffer() for shared buffers. 
+ */ +PrefetchBufferResult +PrefetchSharedBuffer(SMgrRelation smgr_reln, + ForkNumber forkNum, + BlockNumber blockNum) +{ + PrefetchBufferResult result = {InvalidBuffer, false}; + BufferTag newTag; /* identity of requested block */ + uint32 newHash; /* hash value for newTag */ + LWLock *newPartitionLock; /* buffer partition lock for it */ + int buf_id; + + Assert(BlockNumberIsValid(blockNum)); + + /* create a tag so we can lookup the buffer */ + InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator, + forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + newHash = BufTableHashCode(&newTag); + newPartitionLock = BufMappingPartitionLock(newHash); + + /* see if the block is in the buffer pool already */ + LWLockAcquire(newPartitionLock, LW_SHARED); + buf_id = BufTableLookup(&newTag, newHash); + LWLockRelease(newPartitionLock); + + /* If not in buffers, initiate prefetch */ + if (buf_id < 0) + { +#ifdef USE_PREFETCH + /* + * Try to initiate an asynchronous read. This returns false in + * recovery if the relation file doesn't exist. + */ + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr_reln, forkNum, blockNum)) + { + result.initiated_io = true; + } +#endif /* USE_PREFETCH */ + } + else + { + /* + * Report the buffer it was in at that time. The caller may be able + * to avoid a buffer table lookup, but it's not pinned and it must be + * rechecked! + */ + result.recent_buffer = buf_id + 1; + } + + /* + * If the block *is* in buffers, we do nothing. This is not really ideal: + * the block might be just about to be evicted, which would be stupid + * since we know we are going to need it soon. But the only easy answer + * is to bump the usage_count, which does not seem like a great solution: + * when the caller does ultimately touch the block, usage_count would get + * bumped again, resulting in too much favoritism for blocks that are + * involved in a prefetch sequence. A real fix would involve some + * additional per-buffer state, and it's not clear that there's enough of + * a problem to justify that. + */ + + return result; +} + +/* + * PrefetchBuffer -- initiate asynchronous read of a block of a relation + * + * This is named by analogy to ReadBuffer but doesn't actually allocate a + * buffer. Instead it tries to ensure that a future ReadBuffer for the given + * block will not be delayed by the I/O. Prefetching is optional. + * + * There are three possible outcomes: + * + * 1. If the block is already cached, the result includes a valid buffer that + * could be used by the caller to avoid the need for a later buffer lookup, but + * it's not pinned, so the caller must recheck it. + * + * 2. If the kernel has been asked to initiate I/O, the initiated_io member is + * true. Currently there is no way to know if the data was already cached by + * the kernel and therefore didn't really initiate I/O, and no way to know when + * the I/O completes other than using synchronous ReadBuffer(). + * + * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and + * USE_PREFETCH is not defined (this build doesn't support prefetching due to + * lack of a kernel facility), direct I/O is enabled, or the underlying + * relation file wasn't found and we are in recovery. (If the relation file + * wasn't found and we are not in recovery, an error is raised). 
+ */ +PrefetchBufferResult +PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) +{ + Assert(RelationIsValid(reln)); + Assert(BlockNumberIsValid(blockNum)); + + if (RelationUsesLocalBuffers(reln)) + { + /* see comments in ReadBufferExtended */ + if (RELATION_IS_OTHER_TEMP(reln)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* pass it off to localbuf.c */ + return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum); + } + else + { + /* pass it to the shared buffer version */ + return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum); + } +} + +/* + * ReadRecentBuffer -- try to pin a block in a recently observed buffer + * + * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's + * successful. Return true if the buffer is valid and still has the expected + * tag. In that case, the buffer is pinned and the usage count is bumped. + */ +bool +ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum, + Buffer recent_buffer) +{ + BufferDesc *bufHdr; + BufferTag tag; + uint32 buf_state; + bool have_private_ref; + + Assert(BufferIsValid(recent_buffer)); + + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + ReservePrivateRefCountEntry(); + InitBufferTag(&tag, &rlocator, forkNum, blockNum); + + if (BufferIsLocal(recent_buffer)) + { + int b = -recent_buffer - 1; + + bufHdr = GetLocalBufferDescriptor(b); + buf_state = pg_atomic_read_u32(&bufHdr->state); + + /* Is it still valid and holding the right tag? */ + if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) + { + PinLocalBuffer(bufHdr, true); + + pgBufferUsage.local_blks_hit++; + + return true; + } + } + else + { + bufHdr = GetBufferDescriptor(recent_buffer - 1); + have_private_ref = GetPrivateRefCount(recent_buffer) > 0; + + /* + * Do we already have this buffer pinned with a private reference? If + * so, it must be valid and it is safe to check the tag without + * locking. If not, we have to lock the header first and then check. + */ + if (have_private_ref) + buf_state = pg_atomic_read_u32(&bufHdr->state); + else + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag)) + { + /* + * It's now safe to pin the buffer. We can't pin first and ask + * questions later, because it might confuse code paths like + * InvalidateBuffer() if we pinned a random non-matching buffer. + */ + if (have_private_ref) + PinBuffer(bufHdr, NULL); /* bump pin count */ + else + PinBuffer_Locked(bufHdr); /* pin for first time */ + + pgBufferUsage.shared_blks_hit++; + + return true; + } + + /* If we locked the header above, now unlock. */ + if (!have_private_ref) + UnlockBufHdr(bufHdr, buf_state); + } + + return false; +} + +/* + * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main + * fork with RBM_NORMAL mode and default strategy. + */ +Buffer +ReadBuffer(Relation reln, BlockNumber blockNum) +{ + return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL); +} + +/* + * ReadBufferExtended -- returns a buffer containing the requested + * block of the requested relation. If the blknum + * requested is P_NEW, extend the relation file and + * allocate a new block. (Caller is responsible for + * ensuring that only one backend tries to extend a + * relation at the same time!) + * + * Returns: the buffer number for the buffer containing + * the block read. The returned buffer has been pinned. 
+ * Does not return on error --- elog's instead. + * + * Assume when this function is called, that reln has been opened already. + * + * In RBM_NORMAL mode, the page is read from disk, and the page header is + * validated. An error is thrown if the page header is not valid. (But + * note that an all-zero page is considered "valid"; see + * PageIsVerifiedExtended().) + * + * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not + * valid, the page is zeroed instead of throwing an error. This is intended + * for non-critical data, where the caller is prepared to repair errors. + * + * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's + * filled with zeros instead of reading it from disk. Useful when the caller + * is going to fill the page from scratch, since this saves I/O and avoids + * unnecessary failure if the page-on-disk has corrupt page headers. + * The page is returned locked to ensure that the caller has a chance to + * initialize the page before it's made visible to others. + * Caution: do not use this mode to read a page that is beyond the relation's + * current physical EOF; that is likely to cause problems in md.c when + * the page is modified and written out. P_NEW is OK, though. + * + * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires + * a cleanup-strength lock on the page. + * + * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here. + * + * If strategy is not NULL, a nondefault buffer access strategy is used. + * See buffer/README for details. + */ +Buffer +ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, + ReadBufferMode mode, BufferAccessStrategy strategy) +{ + bool hit; + Buffer buf; + + /* + * Reject attempts to read non-local temporary relations; we would be + * likely to get wrong data since we have no visibility into the owning + * session's local buffers. + */ + if (RELATION_IS_OTHER_TEMP(reln)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot access temporary tables of other sessions"))); + + /* + * Read the buffer, and update pgstat counters to reflect a cache hit or + * miss. + */ + pgstat_count_buffer_read(reln); + buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence, + forkNum, blockNum, mode, strategy, &hit); + if (hit) + pgstat_count_buffer_hit(reln); + return buf; +} + + +/* + * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require + * a relcache entry for the relation. + * + * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and + * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function + * cannot be used for temporary relations (and making that work might be + * difficult, unless we only want to read temporary relations for our own + * BackendId). + */ +Buffer +ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy, bool permanent) +{ + bool hit; + + SMgrRelation smgr = smgropen(rlocator, InvalidBackendId); + + return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT : + RELPERSISTENCE_UNLOGGED, forkNum, blockNum, + mode, strategy, &hit); +} + +/* + * Convenience wrapper around ExtendBufferedRelBy() extending by one block. 
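+ *
+ * Sketched from the caller's side (placeholder names only; real callers
+ * are more involved): a caller that just needs one new, empty, exclusively
+ * locked page at the end of the relation might do
+ *
+ *		Buffer		buf;
+ *		Page		page;
+ *
+ *		buf = ExtendBufferedRel(BMR_REL(rel), MAIN_FORKNUM, NULL,
+ *								EB_LOCK_FIRST);
+ *		page = BufferGetPage(buf);
+ *		PageInit(page, BufferGetPageSize(buf), 0);
+ *		MarkBufferDirty(buf);
+ *		... WAL-log the initialization if the relation requires it ...
+ *		UnlockReleaseBuffer(buf);
+ *
+ * The returned buffer is pinned, and EB_LOCK_FIRST additionally returns it
+ * exclusively locked, so the new page can be initialized before anyone
+ * else can see it.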
+ */ +Buffer +ExtendBufferedRel(BufferManagerRelation bmr, + ForkNumber forkNum, + BufferAccessStrategy strategy, + uint32 flags) +{ + Buffer buf; + uint32 extend_by = 1; + + ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by, + &buf, &extend_by); + + return buf; +} + +/* + * Extend relation by multiple blocks. + * + * Tries to extend the relation by extend_by blocks. Depending on the + * availability of resources the relation may end up being extended by a + * smaller number of pages (unless an error is thrown, always by at least one + * page). *extended_by is updated to the number of pages the relation has been + * extended to. + * + * buffers needs to be an array that is at least extend_by long. Upon + * completion, the first extend_by array elements will point to a pinned + * buffer. + * + * If EB_LOCK_FIRST is part of flags, the first returned buffer is + * locked. This is useful for callers that want a buffer that is guaranteed to + * be empty. + */ +BlockNumber +ExtendBufferedRelBy(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + Buffer *buffers, + uint32 *extended_by) +{ + Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); + Assert(bmr.smgr == NULL || bmr.relpersistence != 0); + Assert(extend_by > 0); + + if (bmr.smgr == NULL) + { + bmr.smgr = RelationGetSmgr(bmr.rel); + bmr.relpersistence = bmr.rel->rd_rel->relpersistence; + } + + return ExtendBufferedRelCommon(bmr, fork, strategy, flags, + extend_by, InvalidBlockNumber, + buffers, extended_by); +} + +/* + * Extend the relation so it is at least extend_to blocks large, return buffer + * (extend_to - 1). + * + * This is useful for callers that want to write a specific page, regardless + * of the current size of the relation (e.g. useful for visibilitymap and for + * crash recovery). + */ +Buffer +ExtendBufferedRelTo(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + BlockNumber extend_to, + ReadBufferMode mode) +{ + BlockNumber current_size; + uint32 extended_by = 0; + Buffer buffer = InvalidBuffer; + Buffer buffers[64]; + + Assert((bmr.rel != NULL) != (bmr.smgr != NULL)); + Assert(bmr.smgr == NULL || bmr.relpersistence != 0); + Assert(extend_to != InvalidBlockNumber && extend_to > 0); + + if (bmr.smgr == NULL) + { + bmr.smgr = RelationGetSmgr(bmr.rel); + bmr.relpersistence = bmr.rel->rd_rel->relpersistence; + } + + /* + * If desired, create the file if it doesn't exist. If + * smgr_cached_nblocks[fork] is positive then it must exist, no need for + * an smgrexists call. + */ + if ((flags & EB_CREATE_FORK_IF_NEEDED) && + (bmr.smgr->smgr_cached_nblocks[fork] == 0 || + bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) && + !smgrexists(bmr.smgr, fork)) + { + LockRelationForExtension(bmr.rel, ExclusiveLock); + + /* could have been closed while waiting for lock */ + if (bmr.rel) + bmr.smgr = RelationGetSmgr(bmr.rel); + + /* recheck, fork might have been created concurrently */ + if (!smgrexists(bmr.smgr, fork)) + smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY); + + UnlockRelationForExtension(bmr.rel, ExclusiveLock); + } + + /* + * If requested, invalidate size cache, so that smgrnblocks asks the + * kernel. + */ + if (flags & EB_CLEAR_SIZE_CACHE) + bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; + + /* + * Estimate how many pages we'll need to extend by. This avoids acquiring + * unnecessarily many victim buffers. 
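+ *
+ * For a sense of the loop below: with current_size = 10 and
+ * extend_to = 150, and ignoring concurrent extensions and pin limiting,
+ * the relation would be grown in three batches of 64, 64 and 12 pages,
+ * since at most lengthof(buffers) victim buffers are acquired per
+ * iteration.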
+ */ + current_size = smgrnblocks(bmr.smgr, fork); + + /* + * Since no-one else can be looking at the page contents yet, there is no + * difference between an exclusive lock and a cleanup-strength lock. Note + * that we pass the original mode to ReadBuffer_common() below, when + * falling back to reading the buffer to a concurrent relation extension. + */ + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + flags |= EB_LOCK_TARGET; + + while (current_size < extend_to) + { + uint32 num_pages = lengthof(buffers); + BlockNumber first_block; + + if ((uint64) current_size + num_pages > extend_to) + num_pages = extend_to - current_size; + + first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags, + num_pages, extend_to, + buffers, &extended_by); + + current_size = first_block + extended_by; + Assert(num_pages != 0 || current_size >= extend_to); + + for (int i = 0; i < extended_by; i++) + { + if (first_block + i != extend_to - 1) + ReleaseBuffer(buffers[i]); + else + buffer = buffers[i]; + } + } + + /* + * It's possible that another backend concurrently extended the relation. + * In that case read the buffer. + * + * XXX: Should we control this via a flag? + */ + if (buffer == InvalidBuffer) + { + bool hit; + + Assert(extended_by == 0); + buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence, + fork, extend_to - 1, mode, strategy, + &hit); + } + + return buffer; +} + +/* + * ReadBuffer_common -- common logic for all ReadBuffer variants + * + * *hit is set to true if the request was satisfied from shared buffer cache. + */ +static Buffer +ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, ReadBufferMode mode, + BufferAccessStrategy strategy, bool *hit) +{ + BufferDesc *bufHdr; + Block bufBlock; + bool found; + IOContext io_context; + IOObject io_object; + bool isLocalBuf = SmgrIsTemp(smgr); + + *hit = false; + + /* + * Backward compatibility path, most code should use ExtendBufferedRel() + * instead, as acquiring the extension lock inside ExtendBufferedRel() + * scales a lot better. + */ + if (unlikely(blockNum == P_NEW)) + { + uint32 flags = EB_SKIP_EXTENSION_LOCK; + + /* + * Since no-one else can be looking at the page contents yet, there is + * no difference between an exclusive lock and a cleanup-strength + * lock. + */ + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + flags |= EB_LOCK_FIRST; + + return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence), + forkNum, strategy, flags); + } + + /* Make sure we will have room to remember the buffer pin */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum, + smgr->smgr_rlocator.locator.spcOid, + smgr->smgr_rlocator.locator.dbOid, + smgr->smgr_rlocator.locator.relNumber, + smgr->smgr_rlocator.backend); + + if (isLocalBuf) + { + /* + * We do not use a BufferAccessStrategy for I/O of temporary tables. + * However, in some cases, the "strategy" may not be NULL, so we can't + * rely on IOContextForStrategy() to set the right IOContext for us. + * This may happen in cases like CREATE TEMPORARY TABLE AS... + */ + io_context = IOCONTEXT_NORMAL; + io_object = IOOBJECT_TEMP_RELATION; + bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found); + if (found) + pgBufferUsage.local_blks_hit++; + else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG || + mode == RBM_ZERO_ON_ERROR) + pgBufferUsage.local_blks_read++; + } + else + { + /* + * lookup the buffer. 
IO_IN_PROGRESS is set if the requested block is + * not currently in memory. + */ + io_context = IOContextForStrategy(strategy); + io_object = IOOBJECT_RELATION; + bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum, + strategy, &found, io_context); + if (found) + pgBufferUsage.shared_blks_hit++; + else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG || + mode == RBM_ZERO_ON_ERROR) + pgBufferUsage.shared_blks_read++; + } + + /* At this point we do NOT hold any locks. */ + + /* if it was already in the buffer pool, we're done */ + if (found) + { + /* Just need to update stats before we exit */ + *hit = true; + VacuumPageHit++; + pgstat_count_io_op(io_object, io_context, IOOP_HIT); + + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageHit; + + TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, + smgr->smgr_rlocator.locator.spcOid, + smgr->smgr_rlocator.locator.dbOid, + smgr->smgr_rlocator.locator.relNumber, + smgr->smgr_rlocator.backend, + found); + + /* + * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked + * on return. + */ + if (!isLocalBuf) + { + if (mode == RBM_ZERO_AND_LOCK) + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), + LW_EXCLUSIVE); + else if (mode == RBM_ZERO_AND_CLEANUP_LOCK) + LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr)); + } + + return BufferDescriptorGetBuffer(bufHdr); + } + + /* + * if we have gotten to this point, we have allocated a buffer for the + * page but its contents are not yet valid. IO_IN_PROGRESS is set for it, + * if it's a shared buffer. + */ + Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */ + + bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr); + + /* + * Read in the page, unless the caller intends to overwrite it and just + * wants us to allocate a buffer. + */ + if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) + MemSet((char *) bufBlock, 0, BLCKSZ); + else + { + instr_time io_start = pgstat_prepare_io_time(); + + smgrread(smgr, forkNum, blockNum, bufBlock); + + pgstat_count_io_op_time(io_object, io_context, + IOOP_READ, io_start, 1); + + /* check for garbage data */ + if (!PageIsVerifiedExtended((Page) bufBlock, blockNum, + PIV_LOG_WARNING | PIV_REPORT_STAT)) + { + if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages) + { + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s; zeroing out page", + blockNum, + relpath(smgr->smgr_rlocator, forkNum)))); + MemSet((char *) bufBlock, 0, BLCKSZ); + } + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("invalid page in block %u of relation %s", + blockNum, + relpath(smgr->smgr_rlocator, forkNum)))); + } + } + + /* + * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer + * content lock before marking the page as valid, to make sure that no + * other backend sees the zeroed page before the caller has had a chance + * to initialize it. + * + * Since no-one else can be looking at the page contents yet, there is no + * difference between an exclusive lock and a cleanup-strength lock. (Note + * that we cannot use LockBuffer() or LockBufferForCleanup() here, because + * they assert that the buffer is already valid.) 
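+ *
+ * Seen from the caller's side (an illustrative sketch with placeholder
+ * names; per the comments above ReadBufferExtended(), the block must not
+ * lie beyond the relation's current physical EOF):
+ *
+ *		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+ *								 RBM_ZERO_AND_LOCK, NULL);
+ *		page = BufferGetPage(buf);		(returned zero-filled and locked)
+ *		PageInit(page, BufferGetPageSize(buf), 0);
+ *		MarkBufferDirty(buf);
+ *		UnlockReleaseBuffer(buf);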
+ */ + if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) && + !isLocalBuf) + { + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE); + } + + if (isLocalBuf) + { + /* Only need to adjust flags */ + uint32 buf_state = pg_atomic_read_u32(&bufHdr->state); + + buf_state |= BM_VALID; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + } + else + { + /* Set BM_VALID, terminate IO, and wake up any waiters */ + TerminateBufferIO(bufHdr, false, BM_VALID); + } + + VacuumPageMiss++; + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageMiss; + + TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum, + smgr->smgr_rlocator.locator.spcOid, + smgr->smgr_rlocator.locator.dbOid, + smgr->smgr_rlocator.locator.relNumber, + smgr->smgr_rlocator.backend, + found); + + return BufferDescriptorGetBuffer(bufHdr); +} + +/* + * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared + * buffer. If no buffer exists already, selects a replacement + * victim and evicts the old page, but does NOT read in new page. + * + * "strategy" can be a buffer replacement strategy object, or NULL for + * the default strategy. The selected buffer's usage_count is advanced when + * using the default strategy, but otherwise possibly not (see PinBuffer). + * + * The returned buffer is pinned and is already marked as holding the + * desired page. If it already did have the desired page, *foundPtr is + * set true. Otherwise, *foundPtr is set false and the buffer is marked + * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it. + * + * *foundPtr is actually redundant with the buffer's BM_VALID flag, but + * we keep it for simplicity in ReadBuffer. + * + * io_context is passed as an output parameter to avoid calling + * IOContextForStrategy() when there is a shared buffers hit and no IO + * statistics need be captured. + * + * No locks are held either at entry or exit. + */ +static BufferDesc * +BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, + BlockNumber blockNum, + BufferAccessStrategy strategy, + bool *foundPtr, IOContext io_context) +{ + BufferTag newTag; /* identity of requested block */ + uint32 newHash; /* hash value for newTag */ + LWLock *newPartitionLock; /* buffer partition lock for it */ + int existing_buf_id; + Buffer victim_buffer; + BufferDesc *victim_buf_hdr; + uint32 victim_buf_state; + + /* create a tag so we can lookup the buffer */ + InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + newHash = BufTableHashCode(&newTag); + newPartitionLock = BufMappingPartitionLock(newHash); + + /* see if the block is in the buffer pool already */ + LWLockAcquire(newPartitionLock, LW_SHARED); + existing_buf_id = BufTableLookup(&newTag, newHash); + if (existing_buf_id >= 0) + { + BufferDesc *buf; + bool valid; + + /* + * Found it. Now, pin the buffer so no one can steal it from the + * buffer pool, and check to see if the correct data has been loaded + * into the buffer. + */ + buf = GetBufferDescriptor(existing_buf_id); + + valid = PinBuffer(buf, strategy); + + /* Can release the mapping lock as soon as we've pinned it */ + LWLockRelease(newPartitionLock); + + *foundPtr = true; + + if (!valid) + { + /* + * We can only get here if (a) someone else is still reading in + * the page, or (b) a previous read attempt failed. We have to + * wait for any active read attempt to finish, and then set up our + * own read attempt if the page is still not BM_VALID. 
+ * StartBufferIO does it all. + */ + if (StartBufferIO(buf, true)) + { + /* + * If we get here, previous attempts to read the buffer must + * have failed ... but we shall bravely try again. + */ + *foundPtr = false; + } + } + + return buf; + } + + /* + * Didn't find it in the buffer pool. We'll have to initialize a new + * buffer. Remember to unlock the mapping lock while doing the work. + */ + LWLockRelease(newPartitionLock); + + /* + * Acquire a victim buffer. Somebody else might try to do the same, we + * don't hold any conflicting locks. If so we'll have to undo our work + * later. + */ + victim_buffer = GetVictimBuffer(strategy, io_context); + victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1); + + /* + * Try to make a hashtable entry for the buffer under its new tag. If + * somebody else inserted another buffer for the tag, we'll release the + * victim buffer we acquired and use the already inserted one. + */ + LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); + existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id); + if (existing_buf_id >= 0) + { + BufferDesc *existing_buf_hdr; + bool valid; + + /* + * Got a collision. Someone has already done what we were about to do. + * We'll just handle this as if it were found in the buffer pool in + * the first place. First, give up the buffer we were planning to + * use. + * + * We could do this after releasing the partition lock, but then we'd + * have to call ResourceOwnerEnlargeBuffers() & + * ReservePrivateRefCountEntry() before acquiring the lock, for the + * rare case of such a collision. + */ + UnpinBuffer(victim_buf_hdr); + + /* + * The victim buffer we acquired previously is clean and unused, let + * it be found again quickly + */ + StrategyFreeBuffer(victim_buf_hdr); + + /* remaining code should match code at top of routine */ + + existing_buf_hdr = GetBufferDescriptor(existing_buf_id); + + valid = PinBuffer(existing_buf_hdr, strategy); + + /* Can release the mapping lock as soon as we've pinned it */ + LWLockRelease(newPartitionLock); + + *foundPtr = true; + + if (!valid) + { + /* + * We can only get here if (a) someone else is still reading in + * the page, or (b) a previous read attempt failed. We have to + * wait for any active read attempt to finish, and then set up our + * own read attempt if the page is still not BM_VALID. + * StartBufferIO does it all. + */ + if (StartBufferIO(existing_buf_hdr, true)) + { + /* + * If we get here, previous attempts to read the buffer must + * have failed ... but we shall bravely try again. + */ + *foundPtr = false; + } + } + + return existing_buf_hdr; + } + + /* + * Need to lock the buffer header too in order to change its tag. + */ + victim_buf_state = LockBufHdr(victim_buf_hdr); + + /* some sanity checks while we hold the buffer header lock */ + Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1); + Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS))); + + victim_buf_hdr->tag = newTag; + + /* + * Make sure BM_PERMANENT is set for buffers that must be written at every + * checkpoint. Unlogged buffers only need to be written at shutdown + * checkpoints, except for their "init" forks, which need to be treated + * just like permanent relations. 
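+ *
+ * For example, the init fork of an unlogged relation gets BM_PERMANENT
+ * here even though its relpersistence is RELPERSISTENCE_UNLOGGED, while
+ * the main fork of the same relation does not.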
+ */ + victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM) + victim_buf_state |= BM_PERMANENT; + + UnlockBufHdr(victim_buf_hdr, victim_buf_state); + + LWLockRelease(newPartitionLock); + + /* + * Buffer contents are currently invalid. Try to obtain the right to + * start I/O. If StartBufferIO returns false, then someone else managed + * to read it before we did, so there's nothing left for BufferAlloc() to + * do. + */ + if (StartBufferIO(victim_buf_hdr, true)) + *foundPtr = false; + else + *foundPtr = true; + + return victim_buf_hdr; +} + +/* + * InvalidateBuffer -- mark a shared buffer invalid and return it to the + * freelist. + * + * The buffer header spinlock must be held at entry. We drop it before + * returning. (This is sane because the caller must have locked the + * buffer in order to be sure it should be dropped.) + * + * This is used only in contexts such as dropping a relation. We assume + * that no other backend could possibly be interested in using the page, + * so the only reason the buffer might be pinned is if someone else is + * trying to write it out. We have to let them finish before we can + * reclaim the buffer. + * + * The buffer could get reclaimed by someone else while we are waiting + * to acquire the necessary locks; if so, don't mess it up. + */ +static void +InvalidateBuffer(BufferDesc *buf) +{ + BufferTag oldTag; + uint32 oldHash; /* hash value for oldTag */ + LWLock *oldPartitionLock; /* buffer partition lock for it */ + uint32 oldFlags; + uint32 buf_state; + + /* Save the original buffer tag before dropping the spinlock */ + oldTag = buf->tag; + + buf_state = pg_atomic_read_u32(&buf->state); + Assert(buf_state & BM_LOCKED); + UnlockBufHdr(buf, buf_state); + + /* + * Need to compute the old tag's hashcode and partition lock ID. XXX is it + * worth storing the hashcode in BufferDesc so we need not recompute it + * here? Probably not. + */ + oldHash = BufTableHashCode(&oldTag); + oldPartitionLock = BufMappingPartitionLock(oldHash); + +retry: + + /* + * Acquire exclusive mapping lock in preparation for changing the buffer's + * association. + */ + LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); + + /* Re-lock the buffer header */ + buf_state = LockBufHdr(buf); + + /* If it's changed while we were waiting for lock, do nothing */ + if (!BufferTagsEqual(&buf->tag, &oldTag)) + { + UnlockBufHdr(buf, buf_state); + LWLockRelease(oldPartitionLock); + return; + } + + /* + * We assume the only reason for it to be pinned is that someone else is + * flushing the page out. Wait for them to finish. (This could be an + * infinite loop if the refcount is messed up... it would be nice to time + * out after awhile, but there seems no way to be sure how many loops may + * be needed. Note that if the other guy has pinned the buffer but not + * yet done StartBufferIO, WaitIO will fall through and we'll effectively + * be busy-looping here.) + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) != 0) + { + UnlockBufHdr(buf, buf_state); + LWLockRelease(oldPartitionLock); + /* safety check: should definitely not be our *own* pin */ + if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0) + elog(ERROR, "buffer is pinned in InvalidateBuffer"); + WaitIO(buf); + goto retry; + } + + /* + * Clear out the buffer's tag and flags. We must do this to ensure that + * linear scans of the buffer array don't think the buffer is valid. 
+ */ + oldFlags = buf_state & BUF_FLAG_MASK; + ClearBufferTag(&buf->tag); + buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); + UnlockBufHdr(buf, buf_state); + + /* + * Remove the buffer from the lookup hashtable, if it was in there. + */ + if (oldFlags & BM_TAG_VALID) + BufTableDelete(&oldTag, oldHash); + + /* + * Done with mapping lock. + */ + LWLockRelease(oldPartitionLock); + + /* + * Insert the buffer at the head of the list of free buffers. + */ + StrategyFreeBuffer(buf); +} + +/* + * Helper routine for GetVictimBuffer() + * + * Needs to be called on a buffer with a valid tag, pinned, but without the + * buffer header spinlock held. + * + * Returns true if the buffer can be reused, in which case the buffer is only + * pinned by this backend and marked as invalid, false otherwise. + */ +static bool +InvalidateVictimBuffer(BufferDesc *buf_hdr) +{ + uint32 buf_state; + uint32 hash; + LWLock *partition_lock; + BufferTag tag; + + Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1); + + /* have buffer pinned, so it's safe to read tag without lock */ + tag = buf_hdr->tag; + + hash = BufTableHashCode(&tag); + partition_lock = BufMappingPartitionLock(hash); + + LWLockAcquire(partition_lock, LW_EXCLUSIVE); + + /* lock the buffer header */ + buf_state = LockBufHdr(buf_hdr); + + /* + * We have the buffer pinned nobody else should have been able to unset + * this concurrently. + */ + Assert(buf_state & BM_TAG_VALID); + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + Assert(BufferTagsEqual(&buf_hdr->tag, &tag)); + + /* + * If somebody else pinned the buffer since, or even worse, dirtied it, + * give up on this buffer: It's clearly in use. + */ + if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY)) + { + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + + UnlockBufHdr(buf_hdr, buf_state); + LWLockRelease(partition_lock); + + return false; + } + + /* + * Clear out the buffer's tag and flags and usagecount. This is not + * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before + * doing anything with the buffer. But currently it's beneficial, as the + * cheaper pre-check for several linear scans of shared buffers use the + * tag (see e.g. FlushDatabaseBuffers()). + */ + ClearBufferTag(&buf_hdr->tag); + buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); + UnlockBufHdr(buf_hdr, buf_state); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + + /* finally delete buffer from the buffer mapping table */ + BufTableDelete(&tag, hash); + + LWLockRelease(partition_lock); + + Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID))); + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0); + + return true; +} + +static Buffer +GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context) +{ + BufferDesc *buf_hdr; + Buffer buf; + uint32 buf_state; + bool from_ring; + + /* + * Ensure, while the spinlock's not yet held, that there's a free refcount + * entry. + */ + ReservePrivateRefCountEntry(); + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* we return here if a prospective victim buffer gets used concurrently */ +again: + + /* + * Select a victim buffer. The buffer is returned with its header + * spinlock still held! 
+ */ + buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring); + buf = BufferDescriptorGetBuffer(buf_hdr); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0); + + /* Pin the buffer and then release the buffer spinlock */ + PinBuffer_Locked(buf_hdr); + + /* + * We shouldn't have any other pins for this buffer. + */ + CheckBufferIsPinnedOnce(buf); + + /* + * If the buffer was dirty, try to write it out. There is a race + * condition here, in that someone might dirty it after we released the + * buffer header lock above, or even while we are writing it out (since + * our share-lock won't prevent hint-bit updates). We will recheck the + * dirty bit after re-locking the buffer header. + */ + if (buf_state & BM_DIRTY) + { + LWLock *content_lock; + + Assert(buf_state & BM_TAG_VALID); + Assert(buf_state & BM_VALID); + + /* + * We need a share-lock on the buffer contents to write it out (else + * we might write invalid data, eg because someone else is compacting + * the page contents while we write). We must use a conditional lock + * acquisition here to avoid deadlock. Even though the buffer was not + * pinned (and therefore surely not locked) when StrategyGetBuffer + * returned it, someone else could have pinned and exclusive-locked it + * by the time we get here. If we try to get the lock unconditionally, + * we'd block waiting for them; if they later block waiting for us, + * deadlock ensues. (This has been observed to happen when two + * backends are both trying to split btree index pages, and the second + * one just happens to be trying to split the page the first one got + * from StrategyGetBuffer.) + */ + content_lock = BufferDescriptorGetContentLock(buf_hdr); + if (!LWLockConditionalAcquire(content_lock, LW_SHARED)) + { + /* + * Someone else has locked the buffer, so give it up and loop back + * to get another one. + */ + UnpinBuffer(buf_hdr); + goto again; + } + + /* + * If using a nondefault strategy, and writing the buffer would + * require a WAL flush, let the strategy decide whether to go ahead + * and write/reuse the buffer or to choose another victim. We need a + * lock to inspect the page LSN, so this can't be done inside + * StrategyGetBuffer. + */ + if (strategy != NULL) + { + XLogRecPtr lsn; + + /* Read the LSN while holding buffer header lock */ + buf_state = LockBufHdr(buf_hdr); + lsn = BufferGetLSN(buf_hdr); + UnlockBufHdr(buf_hdr, buf_state); + + if (XLogNeedsFlush(lsn) + && StrategyRejectBuffer(strategy, buf_hdr, from_ring)) + { + LWLockRelease(content_lock); + UnpinBuffer(buf_hdr); + goto again; + } + } + + /* OK, do the I/O */ + FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context); + LWLockRelease(content_lock); + + ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context, + &buf_hdr->tag); + } + + + if (buf_state & BM_VALID) + { + /* + * When a BufferAccessStrategy is in use, blocks evicted from shared + * buffers are counted as IOOP_EVICT in the corresponding context + * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a + * strategy in two cases: 1) while initially claiming buffers for the + * strategy ring 2) to replace an existing strategy ring buffer + * because it is pinned or in use and cannot be reused. + * + * Blocks evicted from buffers already in the strategy ring are + * counted as IOOP_REUSE in the corresponding strategy context. + * + * At this point, we can accurately count evictions and reuses, + * because we have successfully claimed the valid buffer. 
Previously, + * we may have been forced to release the buffer due to concurrent + * pinners or erroring out. + */ + pgstat_count_io_op(IOOBJECT_RELATION, io_context, + from_ring ? IOOP_REUSE : IOOP_EVICT); + } + + /* + * If the buffer has an entry in the buffer mapping table, delete it. This + * can fail because another backend could have pinned or dirtied the + * buffer. + */ + if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr)) + { + UnpinBuffer(buf_hdr); + goto again; + } + + /* a final set of sanity checks */ +#ifdef USE_ASSERT_CHECKING + buf_state = pg_atomic_read_u32(&buf_hdr->state); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1); + Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY))); + + CheckBufferIsPinnedOnce(buf); +#endif + + return buf; +} + +/* + * Limit the number of pins a batch operation may additionally acquire, to + * avoid running out of pinnable buffers. + * + * One additional pin is always allowed, as otherwise the operation likely + * cannot be performed at all. + * + * The number of allowed pins for a backend is computed based on + * shared_buffers and the maximum number of connections possible. That's very + * pessimistic, but outside of toy-sized shared_buffers it should allow + * sufficient pins. + */ +static void +LimitAdditionalPins(uint32 *additional_pins) +{ + uint32 max_backends; + int max_proportional_pins; + + if (*additional_pins <= 1) + return; + + max_backends = MaxBackends + NUM_AUXILIARY_PROCS; + max_proportional_pins = NBuffers / max_backends; + + /* + * Subtract the approximate number of buffers already pinned by this + * backend. We get the number of "overflowed" pins for free, but don't + * know the number of pins in PrivateRefCountArray. The cost of + * calculating that exactly doesn't seem worth it, so just assume the max. + */ + max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES; + + if (max_proportional_pins <= 0) + max_proportional_pins = 1; + + if (*additional_pins > max_proportional_pins) + *additional_pins = max_proportional_pins; +} + +/* + * Logic shared between ExtendBufferedRelBy(), ExtendBufferedRelTo(). Just to + * avoid duplicating the tracing and relpersistence related logic. + */ +static BlockNumber +ExtendBufferedRelCommon(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by) +{ + BlockNumber first_block; + + TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork, + bmr.smgr->smgr_rlocator.locator.spcOid, + bmr.smgr->smgr_rlocator.locator.dbOid, + bmr.smgr->smgr_rlocator.locator.relNumber, + bmr.smgr->smgr_rlocator.backend, + extend_by); + + if (bmr.relpersistence == RELPERSISTENCE_TEMP) + first_block = ExtendBufferedRelLocal(bmr, fork, flags, + extend_by, extend_upto, + buffers, &extend_by); + else + first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags, + extend_by, extend_upto, + buffers, &extend_by); + *extended_by = extend_by; + + TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork, + bmr.smgr->smgr_rlocator.locator.spcOid, + bmr.smgr->smgr_rlocator.locator.dbOid, + bmr.smgr->smgr_rlocator.locator.relNumber, + bmr.smgr->smgr_rlocator.backend, + *extended_by, + first_block); + + return first_block; +} + +/* + * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for + * shared buffers. 
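+ *
+ * From the caller's perspective (a simplified sketch, not lifted from a
+ * real caller; "rel" and "bufs" are placeholders), bulk extension through
+ * the public interface looks roughly like
+ *
+ *		Buffer		bufs[16];
+ *		uint32		extended_by = 0;
+ *		BlockNumber first_block;
+ *
+ *		first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
+ *										  EB_LOCK_FIRST, lengthof(bufs),
+ *										  bufs, &extended_by);
+ *
+ * afterwards bufs[0 .. extended_by - 1] hold pins on blocks first_block
+ * onwards (possibly fewer than requested), with only the first buffer
+ * exclusively locked; the caller must eventually unlock and release them.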
+ */ +static BlockNumber +ExtendBufferedRelShared(BufferManagerRelation bmr, + ForkNumber fork, + BufferAccessStrategy strategy, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by) +{ + BlockNumber first_block; + IOContext io_context = IOContextForStrategy(strategy); + instr_time io_start; + + LimitAdditionalPins(&extend_by); + + /* + * Acquire victim buffers for extension without holding extension lock. + * Writing out victim buffers is the most expensive part of extending the + * relation, particularly when doing so requires WAL flushes. Zeroing out + * the buffers is also quite expensive, so do that before holding the + * extension lock as well. + * + * These pages are pinned by us and not valid. While we hold the pin they + * can't be acquired as victim buffers by another backend. + */ + for (uint32 i = 0; i < extend_by; i++) + { + Block buf_block; + + buffers[i] = GetVictimBuffer(strategy, io_context); + buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1)); + + /* new buffers are zero-filled */ + MemSet((char *) buf_block, 0, BLCKSZ); + } + + /* in case we need to pin an existing buffer below */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* + * Lock relation against concurrent extensions, unless requested not to. + * + * We use the same extension lock for all forks. That's unnecessarily + * restrictive, but currently extensions for forks don't happen often + * enough to make it worth locking more granularly. + * + * Note that another backend might have extended the relation by the time + * we get the lock. + */ + if (!(flags & EB_SKIP_EXTENSION_LOCK)) + { + LockRelationForExtension(bmr.rel, ExclusiveLock); + if (bmr.rel) + bmr.smgr = RelationGetSmgr(bmr.rel); + } + + /* + * If requested, invalidate size cache, so that smgrnblocks asks the + * kernel. + */ + if (flags & EB_CLEAR_SIZE_CACHE) + bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber; + + first_block = smgrnblocks(bmr.smgr, fork); + + /* + * Now that we have the accurate relation size, check if the caller wants + * us to extend to only up to a specific size. If there were concurrent + * extensions, we might have acquired too many buffers and need to release + * them. + */ + if (extend_upto != InvalidBlockNumber) + { + uint32 orig_extend_by = extend_by; + + if (first_block > extend_upto) + extend_by = 0; + else if ((uint64) first_block + extend_by > extend_upto) + extend_by = extend_upto - first_block; + + for (uint32 i = extend_by; i < orig_extend_by; i++) + { + BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1); + + /* + * The victim buffer we acquired previously is clean and unused, + * let it be found again quickly + */ + StrategyFreeBuffer(buf_hdr); + UnpinBuffer(buf_hdr); + } + + if (extend_by == 0) + { + if (!(flags & EB_SKIP_EXTENSION_LOCK)) + UnlockRelationForExtension(bmr.rel, ExclusiveLock); + *extended_by = extend_by; + return first_block; + } + } + + /* Fail if relation is already at maximum possible length */ + if ((uint64) first_block + extend_by >= MaxBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend relation %s beyond %u blocks", + relpath(bmr.smgr->smgr_rlocator, fork), + MaxBlockNumber))); + + /* + * Insert buffers into buffer table, mark as IO_IN_PROGRESS. + * + * This needs to happen before we extend the relation, because as soon as + * we do, other backends can start to read in those pages. 
+ */ + for (int i = 0; i < extend_by; i++) + { + Buffer victim_buf = buffers[i]; + BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1); + BufferTag tag; + uint32 hash; + LWLock *partition_lock; + int existing_id; + + InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i); + hash = BufTableHashCode(&tag); + partition_lock = BufMappingPartitionLock(hash); + + LWLockAcquire(partition_lock, LW_EXCLUSIVE); + + existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id); + + /* + * We get here only in the corner case where we are trying to extend + * the relation but we found a pre-existing buffer. This can happen + * because a prior attempt at extending the relation failed, and + * because mdread doesn't complain about reads beyond EOF (when + * zero_damaged_pages is ON) and so a previous attempt to read a block + * beyond EOF could have left a "valid" zero-filled buffer. + * Unfortunately, we have also seen this case occurring because of + * buggy Linux kernels that sometimes return an lseek(SEEK_END) result + * that doesn't account for a recent write. In that situation, the + * pre-existing buffer would contain valid data that we don't want to + * overwrite. Since the legitimate cases should always have left a + * zero-filled buffer, complain if not PageIsNew. + */ + if (existing_id >= 0) + { + BufferDesc *existing_hdr = GetBufferDescriptor(existing_id); + Block buf_block; + bool valid; + + /* + * Pin the existing buffer before releasing the partition lock, + * preventing it from being evicted. + */ + valid = PinBuffer(existing_hdr, strategy); + + LWLockRelease(partition_lock); + + /* + * The victim buffer we acquired previously is clean and unused, + * let it be found again quickly + */ + StrategyFreeBuffer(victim_buf_hdr); + UnpinBuffer(victim_buf_hdr); + + buffers[i] = BufferDescriptorGetBuffer(existing_hdr); + buf_block = BufHdrGetBlock(existing_hdr); + + if (valid && !PageIsNew((Page) buf_block)) + ereport(ERROR, + (errmsg("unexpected data beyond EOF in block %u of relation %s", + existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)), + errhint("This has been seen to occur with buggy kernels; consider updating your system."))); + + /* + * We *must* do smgr[zero]extend before succeeding, else the page + * will not be reserved by the kernel, and the next P_NEW call + * will decide to return the same page. Clear the BM_VALID bit, + * do StartBufferIO() and proceed. + * + * Loop to handle the very small possibility that someone re-sets + * BM_VALID between our clearing it and StartBufferIO inspecting + * it. 
+ */ + do + { + uint32 buf_state = LockBufHdr(existing_hdr); + + buf_state &= ~BM_VALID; + UnlockBufHdr(existing_hdr, buf_state); + } while (!StartBufferIO(existing_hdr, true)); + } + else + { + uint32 buf_state; + + buf_state = LockBufHdr(victim_buf_hdr); + + /* some sanity checks while we hold the buffer header lock */ + Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED))); + Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1); + + victim_buf_hdr->tag = tag; + + buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM) + buf_state |= BM_PERMANENT; + + UnlockBufHdr(victim_buf_hdr, buf_state); + + LWLockRelease(partition_lock); + + /* XXX: could combine the locked operations in it with the above */ + StartBufferIO(victim_buf_hdr, true); + } + } + + io_start = pgstat_prepare_io_time(); + + /* + * Note: if smgrzeroextend fails, we will end up with buffers that are + * allocated but not marked BM_VALID. The next relation extension will + * still select the same block number (because the relation didn't get any + * longer on disk) and so future attempts to extend the relation will find + * the same buffers (if they have not been recycled) but come right back + * here to try smgrzeroextend again. + * + * We don't need to set checksum for all-zero pages. + */ + smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); + + /* + * Release the file-extension lock; it's now OK for someone else to extend + * the relation some more. + * + * We remove IO_IN_PROGRESS after this, as waking up waiting backends can + * take noticeable time. + */ + if (!(flags & EB_SKIP_EXTENSION_LOCK)) + UnlockRelationForExtension(bmr.rel, ExclusiveLock); + + pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND, + io_start, extend_by); + + /* Set BM_VALID, terminate IO, and wake up any waiters */ + for (int i = 0; i < extend_by; i++) + { + Buffer buf = buffers[i]; + BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1); + bool lock = false; + + if (flags & EB_LOCK_FIRST && i == 0) + lock = true; + else if (flags & EB_LOCK_TARGET) + { + Assert(extend_upto != InvalidBlockNumber); + if (first_block + i + 1 == extend_upto) + lock = true; + } + + if (lock) + LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE); + + TerminateBufferIO(buf_hdr, false, BM_VALID); + } + + pgBufferUsage.shared_blks_written += extend_by; + + *extended_by = extend_by; + + return first_block; +} + +/* + * MarkBufferDirty + * + * Marks buffer contents as dirty (actual write happens later). + * + * Buffer must be pinned and exclusive-locked. (If caller does not hold + * exclusive lock, then somebody could be in process of writing the buffer, + * leading to risk of bad data written to disk.) 
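+ *
+ * The usual calling pattern, sketched with placeholder names and the WAL
+ * details elided (see access/transam/README for the full recipe), is
+ *
+ *		buf = ReadBuffer(rel, blkno);
+ *		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ *
+ *		START_CRIT_SECTION();
+ *		... modify the page returned by BufferGetPage(buf) ...
+ *		MarkBufferDirty(buf);
+ *		... XLogInsert() the change and PageSetLSN() the page ...
+ *		END_CRIT_SECTION();
+ *
+ *		UnlockReleaseBuffer(buf);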
+ */ +void +MarkBufferDirty(Buffer buffer) +{ + BufferDesc *bufHdr; + uint32 buf_state; + uint32 old_buf_state; + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + bufHdr = GetBufferDescriptor(buffer - 1); + + Assert(BufferIsPinned(buffer)); + Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), + LW_EXCLUSIVE)); + + old_buf_state = pg_atomic_read_u32(&bufHdr->state); + for (;;) + { + if (old_buf_state & BM_LOCKED) + old_buf_state = WaitBufHdrUnlocked(bufHdr); + + buf_state = old_buf_state; + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + buf_state |= BM_DIRTY | BM_JUST_DIRTIED; + + if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state, + buf_state)) + break; + } + + /* + * If the buffer was not dirty already, do vacuum accounting. + */ + if (!(old_buf_state & BM_DIRTY)) + { + VacuumPageDirty++; + pgBufferUsage.shared_blks_dirtied++; + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageDirty; + } +} + +/* + * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer() + * + * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock + * compared to calling the two routines separately. Now it's mainly just + * a convenience function. However, if the passed buffer is valid and + * already contains the desired block, we just return it as-is; and that + * does save considerable work compared to a full release and reacquire. + * + * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old + * buffer actually needs to be released. This case is the same as ReadBuffer, + * but can save some tests in the caller. + */ +Buffer +ReleaseAndReadBuffer(Buffer buffer, + Relation relation, + BlockNumber blockNum) +{ + ForkNumber forkNum = MAIN_FORKNUM; + BufferDesc *bufHdr; + + if (BufferIsValid(buffer)) + { + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + { + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return buffer; + UnpinLocalBuffer(buffer); + } + else + { + bufHdr = GetBufferDescriptor(buffer - 1); + /* we have pin, so it's ok to examine tag without spinlock */ + if (bufHdr->tag.blockNum == blockNum && + BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum) + return buffer; + UnpinBuffer(bufHdr); + } + } + + return ReadBuffer(relation, blockNum); +} + +/* + * PinBuffer -- make buffer unavailable for replacement. + * + * For the default access strategy, the buffer's usage_count is incremented + * when we first pin it; for other strategies we just make sure the usage_count + * isn't zero. (The idea of the latter is that we don't want synchronized + * heap scans to inflate the count, but we need it to not be zero to discourage + * other backends from stealing buffers from our ring. As long as we cycle + * through the ring faster than the global clock-sweep cycles, buffers in + * our ring won't be chosen as victims for replacement by other backends.) + * + * This should be applied only to shared buffers, never local ones. + * + * Since buffers are pinned/unpinned very frequently, pin buffers without + * taking the buffer header lock; instead update the state variable in loop of + * CAS operations. Hopefully it's just a single CAS. 
+ * + * Note that ResourceOwnerEnlargeBuffers must have been done already. + * + * Returns true if buffer is BM_VALID, else false. This provision allows + * some callers to avoid an extra spinlock cycle. + */ +static bool +PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy) +{ + Buffer b = BufferDescriptorGetBuffer(buf); + bool result; + PrivateRefCountEntry *ref; + + Assert(!BufferIsLocal(b)); + + ref = GetPrivateRefCountEntry(b, true); + + if (ref == NULL) + { + uint32 buf_state; + uint32 old_buf_state; + + ReservePrivateRefCountEntry(); + ref = NewPrivateRefCountEntry(b); + + old_buf_state = pg_atomic_read_u32(&buf->state); + for (;;) + { + if (old_buf_state & BM_LOCKED) + old_buf_state = WaitBufHdrUnlocked(buf); + + buf_state = old_buf_state; + + /* increase refcount */ + buf_state += BUF_REFCOUNT_ONE; + + if (strategy == NULL) + { + /* Default case: increase usagecount unless already max. */ + if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT) + buf_state += BUF_USAGECOUNT_ONE; + } + else + { + /* + * Ring buffers shouldn't evict others from pool. Thus we + * don't make usagecount more than 1. + */ + if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0) + buf_state += BUF_USAGECOUNT_ONE; + } + + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + buf_state)) + { + result = (buf_state & BM_VALID) != 0; + + /* + * Assume that we acquired a buffer pin for the purposes of + * Valgrind buffer client checks (even in !result case) to + * keep things simple. Buffers that are unsafe to access are + * not generally guaranteed to be marked undefined or + * non-accessible in any case. + */ + VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + break; + } + } + } + else + { + /* + * If we previously pinned the buffer, it must surely be valid. + * + * Note: We deliberately avoid a Valgrind client request here. + * Individual access methods can optionally superimpose buffer page + * client requests on top of our client requests to enforce that + * buffers are only accessed while locked (and pinned). It's possible + * that the buffer page is legitimately non-accessible here. We + * cannot meddle with that. + */ + result = true; + } + + ref->refcount++; + Assert(ref->refcount > 0); + ResourceOwnerRememberBuffer(CurrentResourceOwner, b); + return result; +} + +/* + * PinBuffer_Locked -- as above, but caller already locked the buffer header. + * The spinlock is released before return. + * + * As this function is called with the spinlock held, the caller has to + * previously call ReservePrivateRefCountEntry(). + * + * Currently, no callers of this function want to modify the buffer's + * usage_count at all, so there's no need for a strategy parameter. + * Also we don't bother with a BM_VALID test (the caller could check that for + * itself). + * + * Also all callers only ever use this function when it's known that the + * buffer can't have a preexisting pin by this backend. That allows us to skip + * searching the private refcount array & hash, which is a boon, because the + * spinlock is still held. + * + * Note: use of this routine is frequently mandatory, not just an optimization + * to save a spin lock/unlock cycle, because we need to pin a buffer before + * its state can change under us. + */ +static void +PinBuffer_Locked(BufferDesc *buf) +{ + Buffer b; + PrivateRefCountEntry *ref; + uint32 buf_state; + + /* + * As explained, We don't expect any preexisting pins. 
That allows us to + * manipulate the PrivateRefCount after releasing the spinlock + */ + Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL); + + /* + * Buffer can't have a preexisting pin, so mark its page as defined to + * Valgrind (this is similar to the PinBuffer() case where the backend + * doesn't already have a buffer pin) + */ + VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ); + + /* + * Since we hold the buffer spinlock, we can update the buffer state and + * release the lock in one operation. + */ + buf_state = pg_atomic_read_u32(&buf->state); + Assert(buf_state & BM_LOCKED); + buf_state += BUF_REFCOUNT_ONE; + UnlockBufHdr(buf, buf_state); + + b = BufferDescriptorGetBuffer(buf); + + ref = NewPrivateRefCountEntry(b); + ref->refcount++; + + ResourceOwnerRememberBuffer(CurrentResourceOwner, b); +} + +/* + * UnpinBuffer -- make buffer available for replacement. + * + * This should be applied only to shared buffers, never local ones. This + * always adjusts CurrentResourceOwner. + */ +static void +UnpinBuffer(BufferDesc *buf) +{ + PrivateRefCountEntry *ref; + Buffer b = BufferDescriptorGetBuffer(buf); + + Assert(!BufferIsLocal(b)); + + /* not moving as we're likely deleting it soon anyway */ + ref = GetPrivateRefCountEntry(b, false); + Assert(ref != NULL); + + ResourceOwnerForgetBuffer(CurrentResourceOwner, b); + + Assert(ref->refcount > 0); + ref->refcount--; + if (ref->refcount == 0) + { + uint32 buf_state; + uint32 old_buf_state; + + /* + * Mark buffer non-accessible to Valgrind. + * + * Note that the buffer may have already been marked non-accessible + * within access method code that enforces that buffers are only + * accessed while a buffer lock is held. + */ + VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ); + + /* I'd better not still hold the buffer content lock */ + Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf))); + + /* + * Decrement the shared reference count. + * + * Since buffer spinlock holder can update status using just write, + * it's not safe to use atomic decrement here; thus use a CAS loop. + */ + old_buf_state = pg_atomic_read_u32(&buf->state); + for (;;) + { + if (old_buf_state & BM_LOCKED) + old_buf_state = WaitBufHdrUnlocked(buf); + + buf_state = old_buf_state; + + buf_state -= BUF_REFCOUNT_ONE; + + if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state, + buf_state)) + break; + } + + /* Support LockBufferForCleanup() */ + if (buf_state & BM_PIN_COUNT_WAITER) + { + /* + * Acquire the buffer header lock, re-check that there's a waiter. + * Another backend could have unpinned this buffer, and already + * woken up the waiter. There's no danger of the buffer being + * replaced after we unpinned it above, as it's pinned by the + * waiter. + */ + buf_state = LockBufHdr(buf); + + if ((buf_state & BM_PIN_COUNT_WAITER) && + BUF_STATE_GET_REFCOUNT(buf_state) == 1) + { + /* we just released the last pin other than the waiter's */ + int wait_backend_pgprocno = buf->wait_backend_pgprocno; + + buf_state &= ~BM_PIN_COUNT_WAITER; + UnlockBufHdr(buf, buf_state); + ProcSendSignal(wait_backend_pgprocno); + } + else + UnlockBufHdr(buf, buf_state); + } + ForgetPrivateRefCountEntry(ref); + } +} + +#define ST_SORT sort_checkpoint_bufferids +#define ST_ELEMENT_TYPE CkptSortItem +#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b) +#define ST_SCOPE static +#define ST_DEFINE +#include + +/* + * BufferSync -- Write out all dirty buffers in the pool. 
+ * + * This is called at checkpoint time to write out all dirty shared buffers. + * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE + * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN, + * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even + * unlogged buffers, which are otherwise skipped. The remaining flags + * currently have no effect here. + */ +static void +BufferSync(int flags) +{ + uint32 buf_state; + int buf_id; + int num_to_scan; + int num_spaces; + int num_processed; + int num_written; + CkptTsStatus *per_ts_stat = NULL; + Oid last_tsid; + binaryheap *ts_heap; + int i; + int mask = BM_DIRTY; + WritebackContext wb_context; + + /* Make sure we can handle the pin inside SyncOneBuffer */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* + * Unless this is a shutdown checkpoint or we have been explicitly told, + * we write only permanent, dirty buffers. But at shutdown or end of + * recovery, we write all dirty buffers. + */ + if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY | + CHECKPOINT_FLUSH_ALL)))) + mask |= BM_PERMANENT; + + /* + * Loop over all buffers, and mark the ones that need to be written with + * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we + * can estimate how much work needs to be done. + * + * This allows us to write only those pages that were dirty when the + * checkpoint began, and not those that get dirtied while it proceeds. + * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us + * later in this function, or by normal backends or the bgwriter cleaning + * scan, the flag is cleared. Any buffer dirtied after this point won't + * have the flag set. + * + * Note that if we fail to write some buffer, we may leave buffers with + * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would + * certainly need to be written for the next checkpoint attempt, too. + */ + num_to_scan = 0; + for (buf_id = 0; buf_id < NBuffers; buf_id++) + { + BufferDesc *bufHdr = GetBufferDescriptor(buf_id); + + /* + * Header spinlock is enough to examine BM_DIRTY, see comment in + * SyncOneBuffer. + */ + buf_state = LockBufHdr(bufHdr); + + if ((buf_state & mask) == mask) + { + CkptSortItem *item; + + buf_state |= BM_CHECKPOINT_NEEDED; + + item = &CkptBufferIds[num_to_scan++]; + item->buf_id = buf_id; + item->tsId = bufHdr->tag.spcOid; + item->relNumber = BufTagGetRelNumber(&bufHdr->tag); + item->forkNum = BufTagGetForkNum(&bufHdr->tag); + item->blockNum = bufHdr->tag.blockNum; + } + + UnlockBufHdr(bufHdr, buf_state); + + /* Check for barrier events in case NBuffers is large. */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + } + + if (num_to_scan == 0) + return; /* nothing to do */ + + WritebackContextInit(&wb_context, &checkpoint_flush_after); + + TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan); + + /* + * Sort buffers that need to be written to reduce the likelihood of random + * IO. The sorting is also important for the implementation of balancing + * writes between tablespaces. Without balancing writes we'd potentially + * end up writing to the tablespaces one-by-one; possibly overloading the + * underlying system. + */ + sort_checkpoint_bufferids(CkptBufferIds, num_to_scan); + + num_spaces = 0; + + /* + * Allocate progress status for each tablespace with buffers that need to + * be flushed. This requires the to-be-flushed array to be sorted. 
+ */ + last_tsid = InvalidOid; + for (i = 0; i < num_to_scan; i++) + { + CkptTsStatus *s; + Oid cur_tsid; + + cur_tsid = CkptBufferIds[i].tsId; + + /* + * Grow array of per-tablespace status structs, every time a new + * tablespace is found. + */ + if (last_tsid == InvalidOid || last_tsid != cur_tsid) + { + Size sz; + + num_spaces++; + + /* + * Not worth adding grow-by-power-of-2 logic here - even with a + * few hundred tablespaces this should be fine. + */ + sz = sizeof(CkptTsStatus) * num_spaces; + + if (per_ts_stat == NULL) + per_ts_stat = (CkptTsStatus *) palloc(sz); + else + per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz); + + s = &per_ts_stat[num_spaces - 1]; + memset(s, 0, sizeof(*s)); + s->tsId = cur_tsid; + + /* + * The first buffer in this tablespace. As CkptBufferIds is sorted + * by tablespace all (s->num_to_scan) buffers in this tablespace + * will follow afterwards. + */ + s->index = i; + + /* + * progress_slice will be determined once we know how many buffers + * are in each tablespace, i.e. after this loop. + */ + + last_tsid = cur_tsid; + } + else + { + s = &per_ts_stat[num_spaces - 1]; + } + + s->num_to_scan++; + + /* Check for barrier events. */ + if (ProcSignalBarrierPending) + ProcessProcSignalBarrier(); + } + + Assert(num_spaces > 0); + + /* + * Build a min-heap over the write-progress in the individual tablespaces, + * and compute how large a portion of the total progress a single + * processed buffer is. + */ + ts_heap = binaryheap_allocate(num_spaces, + ts_ckpt_progress_comparator, + NULL); + + for (i = 0; i < num_spaces; i++) + { + CkptTsStatus *ts_stat = &per_ts_stat[i]; + + ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan; + + binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat)); + } + + binaryheap_build(ts_heap); + + /* + * Iterate through to-be-checkpointed buffers and write the ones (still) + * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between + * tablespaces; otherwise the sorting would lead to only one tablespace + * receiving writes at a time, making inefficient use of the hardware. + */ + num_processed = 0; + num_written = 0; + while (!binaryheap_empty(ts_heap)) + { + BufferDesc *bufHdr = NULL; + CkptTsStatus *ts_stat = (CkptTsStatus *) + DatumGetPointer(binaryheap_first(ts_heap)); + + buf_id = CkptBufferIds[ts_stat->index].buf_id; + Assert(buf_id != -1); + + bufHdr = GetBufferDescriptor(buf_id); + + num_processed++; + + /* + * We don't need to acquire the lock here, because we're only looking + * at a single bit. It's possible that someone else writes the buffer + * and clears the flag right after we check, but that doesn't matter + * since SyncOneBuffer will then do nothing. However, there is a + * further race condition: it's conceivable that between the time we + * examine the bit here and the time SyncOneBuffer acquires the lock, + * someone else not only wrote the buffer but replaced it with another + * page and dirtied it. In that improbable case, SyncOneBuffer will + * write the buffer though we didn't need to. It doesn't seem worth + * guarding against this, though. + */ + if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED) + { + if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN) + { + TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); + PendingCheckpointerStats.buf_written_checkpoints++; + num_written++; + } + } + + /* + * Measure progress independent of actually having to flush the buffer + * - otherwise writing become unbalanced. 
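+ *
+ * As a concrete illustration (numbers invented for the example): with
+ * num_to_scan = 1000 buffers split 900/100 between tablespaces A and B,
+ * progress_slice is about 1.11 for A and 10 for B, so each write to B
+ * advances B's progress roughly nine times as far as a write to A advances
+ * A's. The min-heap therefore interleaves about nine A-writes per B-write,
+ * and both tablespaces finish at roughly the same time.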
+ */ + ts_stat->progress += ts_stat->progress_slice; + ts_stat->num_scanned++; + ts_stat->index++; + + /* Have all the buffers from the tablespace been processed? */ + if (ts_stat->num_scanned == ts_stat->num_to_scan) + { + binaryheap_remove_first(ts_heap); + } + else + { + /* update heap with the new progress */ + binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat)); + } + + /* + * Sleep to throttle our I/O rate. + * + * (This will check for barrier events even if it doesn't sleep.) + */ + CheckpointWriteDelay(flags, (double) num_processed / num_to_scan); + } + + /* + * Issue all pending flushes. Only checkpointer calls BufferSync(), so + * IOContext will always be IOCONTEXT_NORMAL. + */ + IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL); + + pfree(per_ts_stat); + per_ts_stat = NULL; + binaryheap_free(ts_heap); + + /* + * Update checkpoint statistics. As noted above, this doesn't include + * buffers written by other backends or bgwriter scan. + */ + CheckpointStats.ckpt_bufs_written += num_written; + + TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan); +} + +/* + * BgBufferSync -- Write out some dirty buffers in the pool. + * + * This is called periodically by the background writer process. + * + * Returns true if it's appropriate for the bgwriter process to go into + * low-power hibernation mode. (This happens if the strategy clock sweep + * has been "lapped" and no buffer allocations have occurred recently, + * or if the bgwriter has been effectively disabled by setting + * bgwriter_lru_maxpages to 0.) + */ +bool +BgBufferSync(WritebackContext *wb_context) +{ + /* info obtained from freelist.c */ + int strategy_buf_id; + uint32 strategy_passes; + uint32 recent_alloc; + + /* + * Information saved between calls so we can determine the strategy + * point's advance rate and avoid scanning already-cleaned buffers. + */ + static bool saved_info_valid = false; + static int prev_strategy_buf_id; + static uint32 prev_strategy_passes; + static int next_to_clean; + static uint32 next_passes; + + /* Moving averages of allocation rate and clean-buffer density */ + static float smoothed_alloc = 0; + static float smoothed_density = 10.0; + + /* Potentially these could be tunables, but for now, not */ + float smoothing_samples = 16; + float scan_whole_pool_milliseconds = 120000.0; + + /* Used to compute how far we scan ahead */ + long strategy_delta; + int bufs_to_lap; + int bufs_ahead; + float scans_per_alloc; + int reusable_buffers_est; + int upcoming_alloc_est; + int min_scan_buffers; + + /* Variables for the scanning loop proper */ + int num_to_scan; + int num_written; + int reusable_buffers; + + /* Variables for final smoothed_density update */ + long new_strategy_delta; + uint32 new_recent_alloc; + + /* + * Find out where the freelist clock sweep currently is, and how many + * buffer allocations have happened since our last call. + */ + strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc); + + /* Report buffer alloc counts to pgstat */ + PendingBgWriterStats.buf_alloc += recent_alloc; + + /* + * If we're not running the LRU scan, just stop after doing the stats + * stuff. We mark the saved state invalid so that we can recover sanely + * if LRU scan is turned back on later. + */ + if (bgwriter_lru_maxpages <= 0) + { + saved_info_valid = false; + return true; + } + + /* + * Compute strategy_delta = how many buffers have been scanned by the + * clock sweep since last time. If first time through, assume none. 
Then + * see if we are still ahead of the clock sweep, and if so, how many + * buffers we could scan before we'd catch up with it and "lap" it. Note: + * weird-looking coding of xxx_passes comparisons are to avoid bogus + * behavior when the passes counts wrap around. + */ + if (saved_info_valid) + { + int32 passes_delta = strategy_passes - prev_strategy_passes; + + strategy_delta = strategy_buf_id - prev_strategy_buf_id; + strategy_delta += (long) passes_delta * NBuffers; + + Assert(strategy_delta >= 0); + + if ((int32) (next_passes - strategy_passes) > 0) + { + /* we're one pass ahead of the strategy point */ + bufs_to_lap = strategy_buf_id - next_to_clean; +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d", + next_passes, next_to_clean, + strategy_passes, strategy_buf_id, + strategy_delta, bufs_to_lap); +#endif + } + else if (next_passes == strategy_passes && + next_to_clean >= strategy_buf_id) + { + /* on same pass, but ahead or at least not behind */ + bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id); +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d", + next_passes, next_to_clean, + strategy_passes, strategy_buf_id, + strategy_delta, bufs_to_lap); +#endif + } + else + { + /* + * We're behind, so skip forward to the strategy point and start + * cleaning from there. + */ +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld", + next_passes, next_to_clean, + strategy_passes, strategy_buf_id, + strategy_delta); +#endif + next_to_clean = strategy_buf_id; + next_passes = strategy_passes; + bufs_to_lap = NBuffers; + } + } + else + { + /* + * Initializing at startup or after LRU scanning had been off. Always + * start at the strategy point. + */ +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter initializing: strategy %u-%u", + strategy_passes, strategy_buf_id); +#endif + strategy_delta = 0; + next_to_clean = strategy_buf_id; + next_passes = strategy_passes; + bufs_to_lap = NBuffers; + } + + /* Update saved info for next time */ + prev_strategy_buf_id = strategy_buf_id; + prev_strategy_passes = strategy_passes; + saved_info_valid = true; + + /* + * Compute how many buffers had to be scanned for each new allocation, ie, + * 1/density of reusable buffers, and track a moving average of that. + * + * If the strategy point didn't move, we don't update the density estimate + */ + if (strategy_delta > 0 && recent_alloc > 0) + { + scans_per_alloc = (float) strategy_delta / (float) recent_alloc; + smoothed_density += (scans_per_alloc - smoothed_density) / + smoothing_samples; + } + + /* + * Estimate how many reusable buffers there are between the current + * strategy point and where we've scanned ahead to, based on the smoothed + * density estimate. + */ + bufs_ahead = NBuffers - bufs_to_lap; + reusable_buffers_est = (float) bufs_ahead / smoothed_density; + + /* + * Track a moving average of recent buffer allocations. Here, rather than + * a true average we want a fast-attack, slow-decline behavior: we + * immediately follow any increase. + */ + if (smoothed_alloc <= (float) recent_alloc) + smoothed_alloc = recent_alloc; + else + smoothed_alloc += ((float) recent_alloc - smoothed_alloc) / + smoothing_samples; + + /* Scale the estimate by a GUC to allow more aggressive tuning. 
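+	 * For example, with smoothed_alloc at 100 buffers per cycle and
+	 * bgwriter_lru_multiplier at its default of 2.0, we aim to have about
+	 * 200 reusable buffers clean ahead of the strategy point.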
*/ + upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier); + + /* + * If recent_alloc remains at zero for many cycles, smoothed_alloc will + * eventually underflow to zero, and the underflows produce annoying + * kernel warnings on some platforms. Once upcoming_alloc_est has gone to + * zero, there's no point in tracking smaller and smaller values of + * smoothed_alloc, so just reset it to exactly zero to avoid this + * syndrome. It will pop back up as soon as recent_alloc increases. + */ + if (upcoming_alloc_est == 0) + smoothed_alloc = 0; + + /* + * Even in cases where there's been little or no buffer allocation + * activity, we want to make a small amount of progress through the buffer + * cache so that as many reusable buffers as possible are clean after an + * idle period. + * + * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times + * the BGW will be called during the scan_whole_pool time; slice the + * buffer pool into that many sections. + */ + min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay)); + + if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est)) + { +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d", + upcoming_alloc_est, min_scan_buffers, reusable_buffers_est); +#endif + upcoming_alloc_est = min_scan_buffers + reusable_buffers_est; + } + + /* + * Now write out dirty reusable buffers, working forward from the + * next_to_clean point, until we have lapped the strategy scan, or cleaned + * enough buffers to match our estimate of the next cycle's allocation + * requirements, or hit the bgwriter_lru_maxpages limit. + */ + + /* Make sure we can handle the pin inside SyncOneBuffer */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + num_to_scan = bufs_to_lap; + num_written = 0; + reusable_buffers = reusable_buffers_est; + + /* Execute the LRU scan */ + while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) + { + int sync_state = SyncOneBuffer(next_to_clean, true, + wb_context); + + if (++next_to_clean >= NBuffers) + { + next_to_clean = 0; + next_passes++; + } + num_to_scan--; + + if (sync_state & BUF_WRITTEN) + { + reusable_buffers++; + if (++num_written >= bgwriter_lru_maxpages) + { + PendingBgWriterStats.maxwritten_clean++; + break; + } + } + else if (sync_state & BUF_REUSABLE) + reusable_buffers++; + } + + PendingBgWriterStats.buf_written_clean += num_written; + +#ifdef BGW_DEBUG + elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d", + recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead, + smoothed_density, reusable_buffers_est, upcoming_alloc_est, + bufs_to_lap - num_to_scan, + num_written, + reusable_buffers - reusable_buffers_est); +#endif + + /* + * Consider the above scan as being like a new allocation scan. + * Characterize its density and update the smoothed one based on it. This + * effectively halves the moving average period in cases where both the + * strategy and the background writer are doing some useful scanning, + * which is helpful because a long memory isn't as desirable on the + * density estimates. 
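+	 * With smoothing_samples = 16, each observation moves smoothed_density
+	 * one sixteenth of the way toward the latest scans-per-allocation
+	 * ratio, so feeding in the bgwriter's own scan as an extra sample
+	 * roughly doubles how quickly the average adapts.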
+ */ + new_strategy_delta = bufs_to_lap - num_to_scan; + new_recent_alloc = reusable_buffers - reusable_buffers_est; + if (new_strategy_delta > 0 && new_recent_alloc > 0) + { + scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc; + smoothed_density += (scans_per_alloc - smoothed_density) / + smoothing_samples; + +#ifdef BGW_DEBUG + elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f", + new_recent_alloc, new_strategy_delta, + scans_per_alloc, smoothed_density); +#endif + } + + /* Return true if OK to hibernate */ + return (bufs_to_lap == 0 && recent_alloc == 0); +} + +/* + * SyncOneBuffer -- process a single buffer during syncing. + * + * If skip_recently_used is true, we don't write currently-pinned buffers, nor + * buffers marked recently used, as these are not replacement candidates. + * + * Returns a bitmask containing the following flag bits: + * BUF_WRITTEN: we wrote the buffer. + * BUF_REUSABLE: buffer is available for replacement, ie, it has + * pin count 0 and usage count 0. + * + * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean + * after locking it, but we don't care all that much.) + * + * Note: caller must have done ResourceOwnerEnlargeBuffers. + */ +static int +SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) +{ + BufferDesc *bufHdr = GetBufferDescriptor(buf_id); + int result = 0; + uint32 buf_state; + BufferTag tag; + + ReservePrivateRefCountEntry(); + + /* + * Check whether buffer needs writing. + * + * We can make this check without taking the buffer content lock so long + * as we mark pages dirty in access methods *before* logging changes with + * XLogInsert(): if someone marks the buffer dirty just after our check we + * don't worry because our checkpoint.redo points before log record for + * upcoming changes and so we are not required to write such dirty buffer. + */ + buf_state = LockBufHdr(bufHdr); + + if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 && + BUF_STATE_GET_USAGECOUNT(buf_state) == 0) + { + result |= BUF_REUSABLE; + } + else if (skip_recently_used) + { + /* Caller told us not to write recently-used buffers */ + UnlockBufHdr(bufHdr, buf_state); + return result; + } + + if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY)) + { + /* It's clean, so nothing to do */ + UnlockBufHdr(bufHdr, buf_state); + return result; + } + + /* + * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the + * buffer is clean by the time we've locked it.) + */ + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + + FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + + tag = bufHdr->tag; + + UnpinBuffer(bufHdr); + + /* + * SyncOneBuffer() is only called by checkpointer and bgwriter, so + * IOContext will always be IOCONTEXT_NORMAL. + */ + ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag); + + return result | BUF_WRITTEN; +} + +/* + * AtEOXact_Buffers - clean up at end of transaction. + * + * As of PostgreSQL 8.0, buffer pins should get released by the + * ResourceOwner mechanism. This routine is just a debugging + * cross-check that no pins remain. 
+ */ +void +AtEOXact_Buffers(bool isCommit) +{ + CheckForBufferLeaks(); + + AtEOXact_LocalBuffers(isCommit); + + Assert(PrivateRefCountOverflowed == 0); +} + +/* + * Initialize access to shared buffer pool + * + * This is called during backend startup (whether standalone or under the + * postmaster). It sets up for this backend's access to the already-existing + * buffer pool. + */ +void +InitBufferPoolAccess(void) +{ + HASHCTL hash_ctl; + + memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray)); + + hash_ctl.keysize = sizeof(int32); + hash_ctl.entrysize = sizeof(PrivateRefCountEntry); + + PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * AtProcExit_Buffers needs LWLock access, and thereby has to be called at + * the corresponding phase of backend shutdown. + */ + Assert(MyProc != NULL); + on_shmem_exit(AtProcExit_Buffers, 0); +} + +/* + * During backend exit, ensure that we released all shared-buffer locks and + * assert that we have no remaining pins. + */ +static void +AtProcExit_Buffers(int code, Datum arg) +{ + UnlockBuffers(); + + CheckForBufferLeaks(); + + /* localbuf.c needs a chance too */ + AtProcExit_LocalBuffers(); +} + +/* + * CheckForBufferLeaks - ensure this backend holds no buffer pins + * + * As of PostgreSQL 8.0, buffer pins should get released by the + * ResourceOwner mechanism. This routine is just a debugging + * cross-check that no pins remain. + */ +static void +CheckForBufferLeaks(void) +{ +#ifdef USE_ASSERT_CHECKING + int RefCountErrors = 0; + PrivateRefCountEntry *res; + int i; + + /* check the array */ + for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++) + { + res = &PrivateRefCountArray[i]; + + if (res->buffer != InvalidBuffer) + { + PrintBufferLeakWarning(res->buffer); + RefCountErrors++; + } + } + + /* if necessary search the hash */ + if (PrivateRefCountOverflowed) + { + HASH_SEQ_STATUS hstat; + + hash_seq_init(&hstat, PrivateRefCountHash); + while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL) + { + PrintBufferLeakWarning(res->buffer); + RefCountErrors++; + } + } + + Assert(RefCountErrors == 0); +#endif +} + +/* + * Helper routine to issue warnings when a buffer is unexpectedly pinned + */ +void +PrintBufferLeakWarning(Buffer buffer) +{ + BufferDesc *buf; + int32 loccount; + char *path; + BackendId backend; + uint32 buf_state; + + Assert(BufferIsValid(buffer)); + if (BufferIsLocal(buffer)) + { + buf = GetLocalBufferDescriptor(-buffer - 1); + loccount = LocalRefCount[-buffer - 1]; + backend = MyBackendId; + } + else + { + buf = GetBufferDescriptor(buffer - 1); + loccount = GetPrivateRefCount(buffer); + backend = InvalidBackendId; + } + + /* theoretically we should lock the bufhdr here */ + path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend, + BufTagGetForkNum(&buf->tag)); + buf_state = pg_atomic_read_u32(&buf->state); + elog(WARNING, + "buffer refcount leak: [%03d] " + "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)", + buffer, path, + buf->tag.blockNum, buf_state & BUF_FLAG_MASK, + BUF_STATE_GET_REFCOUNT(buf_state), loccount); + pfree(path); +} + +/* + * CheckPointBuffers + * + * Flush all dirty blocks in buffer pool to disk at checkpoint time. + * + * Note: temporary relations do not participate in checkpoints, so they don't + * need to be flushed. + */ +void +CheckPointBuffers(int flags) +{ + BufferSync(flags); +} + +/* + * BufferGetBlockNumber + * Returns the block number associated with a buffer. 
+ * + * Note: + * Assumes that the buffer is valid and pinned, else the + * value may be obsolete immediately... + */ +BlockNumber +BufferGetBlockNumber(Buffer buffer) +{ + BufferDesc *bufHdr; + + Assert(BufferIsPinned(buffer)); + + if (BufferIsLocal(buffer)) + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + else + bufHdr = GetBufferDescriptor(buffer - 1); + + /* pinned, so OK to read tag without spinlock */ + return bufHdr->tag.blockNum; +} + +/* + * BufferGetTag + * Returns the relfilelocator, fork number and block number associated with + * a buffer. + */ +void +BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum, + BlockNumber *blknum) +{ + BufferDesc *bufHdr; + + /* Do the same checks as BufferGetBlockNumber. */ + Assert(BufferIsPinned(buffer)); + + if (BufferIsLocal(buffer)) + bufHdr = GetLocalBufferDescriptor(-buffer - 1); + else + bufHdr = GetBufferDescriptor(buffer - 1); + + /* pinned, so OK to read tag without spinlock */ + *rlocator = BufTagGetRelFileLocator(&bufHdr->tag); + *forknum = BufTagGetForkNum(&bufHdr->tag); + *blknum = bufHdr->tag.blockNum; +} + +/* + * FlushBuffer + * Physically write out a shared buffer. + * + * NOTE: this actually just passes the buffer contents to the kernel; the + * real write to disk won't happen until the kernel feels like it. This + * is okay from our point of view since we can redo the changes from WAL. + * However, we will need to force the changes to disk via fsync before + * we can checkpoint WAL. + * + * The caller must hold a pin on the buffer and have share-locked the + * buffer contents. (Note: a share-lock does not prevent updates of + * hint bits in the buffer, so the page could change while the write + * is in progress, but we assume that that will not invalidate the data + * written.) + * + * If the caller has an smgr reference for the buffer's relation, pass it + * as the second parameter. If not, pass NULL. + */ +static void +FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object, + IOContext io_context) +{ + XLogRecPtr recptr; + ErrorContextCallback errcallback; + instr_time io_start; + Block bufBlock; + char *bufToWrite; + uint32 buf_state; + + /* + * Try to start an I/O operation. If StartBufferIO returns false, then + * someone else flushed the buffer before we could, so we need not do + * anything. + */ + if (!StartBufferIO(buf, false)) + return; + + /* Setup error traceback support for ereport() */ + errcallback.callback = shared_buffer_write_error_callback; + errcallback.arg = (void *) buf; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + /* Find smgr relation for buffer */ + if (reln == NULL) + reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId); + + TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag), + buf->tag.blockNum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber); + + buf_state = LockBufHdr(buf); + + /* + * Run PageGetLSN while holding header lock, since we don't have the + * buffer locked exclusively in all cases. + */ + recptr = BufferGetLSN(buf); + + /* To check if block content changes while flushing. - vadim 01/17/97 */ + buf_state &= ~BM_JUST_DIRTIED; + UnlockBufHdr(buf, buf_state); + + /* + * Force XLOG flush up to buffer's LSN. This implements the basic WAL + * rule that log updates must hit disk before any of the data-file changes + * they describe do. 
+ * + * However, this rule does not apply to unlogged relations, which will be + * lost after a crash anyway. Most unlogged relation pages do not bear + * LSNs since we never emit WAL records for them, and therefore flushing + * up through the buffer LSN would be useless, but harmless. However, + * GiST indexes use LSNs internally to track page-splits, and therefore + * unlogged GiST pages bear "fake" LSNs generated by + * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake + * LSN counter could advance past the WAL insertion point; and if it did + * happen, attempting to flush WAL through that location would fail, with + * disastrous system-wide consequences. To make sure that can't happen, + * skip the flush if the buffer isn't permanent. + */ + if (buf_state & BM_PERMANENT) + XLogFlush(recptr); + + /* + * Now it's safe to write buffer to disk. Note that no one else should + * have been able to write it while we were busy with log flushing because + * only one process at a time can set the BM_IO_IN_PROGRESS bit. + */ + bufBlock = BufHdrGetBlock(buf); + + /* + * Update page checksum if desired. Since we have only shared lock on the + * buffer, other processes might be updating hint bits in it, so we must + * copy the page to private storage if we do checksumming. + */ + bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum); + + io_start = pgstat_prepare_io_time(); + + /* + * bufToWrite is either the shared buffer or a copy, as appropriate. + */ + smgrwrite(reln, + BufTagGetForkNum(&buf->tag), + buf->tag.blockNum, + bufToWrite, + false); + + /* + * When a strategy is in use, only flushes of dirty buffers already in the + * strategy ring are counted as strategy writes (IOCONTEXT + * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO + * statistics tracking. + * + * If a shared buffer initially added to the ring must be flushed before + * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE. + * + * If a shared buffer which was added to the ring later because the + * current strategy buffer is pinned or in use or because all strategy + * buffers were dirty and rejected (for BAS_BULKREAD operations only) + * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE + * (from_ring will be false). + * + * When a strategy is not in use, the write can only be a "regular" write + * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE). + */ + pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, + IOOP_WRITE, io_start, 1); + + pgBufferUsage.shared_blks_written++; + + /* + * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and + * end the BM_IO_IN_PROGRESS state. + */ + TerminateBufferIO(buf, true, 0); + + TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag), + buf->tag.blockNum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber); + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; +} + +/* + * RelationGetNumberOfBlocksInFork + * Determines the current number of pages in the specified relation fork. + * + * Note that the accuracy of the result will depend on the details of the + * relation's storage. For builtin AMs it'll be accurate, but for external AMs + * it might not be. + */ +BlockNumber +RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum) +{ + if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind)) + { + /* + * Not every table AM uses BLCKSZ wide fixed size blocks. 
Therefore + * tableam returns the size in bytes - but for the purpose of this + * routine, we want the number of blocks. Therefore divide, rounding + * up. + */ + uint64 szbytes; + + szbytes = table_relation_size(relation, forkNum); + + return (szbytes + (BLCKSZ - 1)) / BLCKSZ; + } + else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind)) + { + return smgrnblocks(RelationGetSmgr(relation), forkNum); + } + else + Assert(false); + + return 0; /* keep compiler quiet */ +} + +/* + * BufferIsPermanent + * Determines whether a buffer will potentially still be around after + * a crash. Caller must hold a buffer pin. + */ +bool +BufferIsPermanent(Buffer buffer) +{ + BufferDesc *bufHdr; + + /* Local buffers are used only for temp relations. */ + if (BufferIsLocal(buffer)) + return false; + + /* Make sure we've got a real buffer, and that we hold a pin on it. */ + Assert(BufferIsValid(buffer)); + Assert(BufferIsPinned(buffer)); + + /* + * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we + * need not bother with the buffer header spinlock. Even if someone else + * changes the buffer header state while we're doing this, the state is + * changed atomically, so we'll read the old value or the new value, but + * not random garbage. + */ + bufHdr = GetBufferDescriptor(buffer - 1); + return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0; +} + +/* + * BufferGetLSNAtomic + * Retrieves the LSN of the buffer atomically using a buffer header lock. + * This is necessary for some callers who may not have an exclusive lock + * on the buffer. + */ +XLogRecPtr +BufferGetLSNAtomic(Buffer buffer) +{ + BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1); + char *page = BufferGetPage(buffer); + XLogRecPtr lsn; + uint32 buf_state; + + /* + * If we don't need locking for correctness, fastpath out. + */ + if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer)) + return PageGetLSN(page); + + /* Make sure we've got a real buffer, and that we hold a pin on it. */ + Assert(BufferIsValid(buffer)); + Assert(BufferIsPinned(buffer)); + + buf_state = LockBufHdr(bufHdr); + lsn = PageGetLSN(page); + UnlockBufHdr(bufHdr, buf_state); + + return lsn; +} + +/* --------------------------------------------------------------------- + * DropRelationBuffers + * + * This function removes from the buffer pool all the pages of the + * specified relation forks that have block numbers >= firstDelBlock. + * (In particular, with firstDelBlock = 0, all pages are removed.) + * Dirty pages are simply dropped, without bothering to write them + * out first. Therefore, this is NOT rollback-able, and so should be + * used only with extreme caution! + * + * Currently, this is called only from smgr.c when the underlying file + * is about to be deleted or truncated (firstDelBlock is needed for + * the truncation case). The data in the affected pages would therefore + * be deleted momentarily anyway, and there is no point in writing it. + * It is the responsibility of higher-level code to ensure that the + * deletion or truncation does not lose any data that could be needed + * later. It is also the responsibility of higher-level code to ensure + * that no other process could be trying to load more pages of the + * relation into buffers. 
+ * -------------------------------------------------------------------- + */ +void +DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum, + int nforks, BlockNumber *firstDelBlock) +{ + int i; + int j; + RelFileLocatorBackend rlocator; + BlockNumber nForkBlock[MAX_FORKNUM]; + uint64 nBlocksToInvalidate = 0; + + rlocator = smgr_reln->smgr_rlocator; + + /* If it's a local relation, it's localbuf.c's problem. */ + if (RelFileLocatorBackendIsTemp(rlocator)) + { + if (rlocator.backend == MyBackendId) + { + for (j = 0; j < nforks; j++) + DropRelationLocalBuffers(rlocator.locator, forkNum[j], + firstDelBlock[j]); + } + return; + } + + /* + * To remove all the pages of the specified relation forks from the buffer + * pool, we need to scan the entire buffer pool but we can optimize it by + * finding the buffers from BufMapping table provided we know the exact + * size of each fork of the relation. The exact size is required to ensure + * that we don't leave any buffer for the relation being dropped as + * otherwise the background writer or checkpointer can lead to a PANIC + * error while flushing buffers corresponding to files that don't exist. + * + * To know the exact size, we rely on the size cached for each fork by us + * during recovery which limits the optimization to recovery and on + * standbys but we can easily extend it once we have shared cache for + * relation size. + * + * In recovery, we cache the value returned by the first lseek(SEEK_END) + * and the future writes keeps the cached value up-to-date. See + * smgrextend. It is possible that the value of the first lseek is smaller + * than the actual number of existing blocks in the file due to buggy + * Linux kernels that might not have accounted for the recent write. But + * that should be fine because there must not be any buffers after that + * file size. + */ + for (i = 0; i < nforks; i++) + { + /* Get the number of blocks for a relation's fork */ + nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]); + + if (nForkBlock[i] == InvalidBlockNumber) + { + nBlocksToInvalidate = InvalidBlockNumber; + break; + } + + /* calculate the number of blocks to be invalidated */ + nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]); + } + + /* + * We apply the optimization iff the total number of blocks to invalidate + * is below the BUF_DROP_FULL_SCAN_THRESHOLD. + */ + if (BlockNumberIsValid(nBlocksToInvalidate) && + nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD) + { + for (j = 0; j < nforks; j++) + FindAndDropRelationBuffers(rlocator.locator, forkNum[j], + nForkBlock[j], firstDelBlock[j]); + return; + } + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * We can make this a tad faster by prechecking the buffer tag before + * we attempt to lock the buffer; this saves a lot of lock + * acquisitions in typical cases. It should be safe because the + * caller must have AccessExclusiveLock on the relation, or some other + * reason to be certain that no one is loading new pages of the rel + * into the buffer pool. (Otherwise we might well miss such pages + * entirely.) Therefore, while the tag might be changing while we + * look at it, it can't be changing *to* a value we care about, only + * *away* from such a value. So false negatives are impossible, and + * false positives are safe because we'll recheck after getting the + * buffer lock. + * + * We could check forkNum and blockNum as well as the rlocator, but + * the incremental win from doing so seems small. 
+ */ + if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator)) + continue; + + buf_state = LockBufHdr(bufHdr); + + for (j = 0; j < nforks; j++) + { + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum[j] && + bufHdr->tag.blockNum >= firstDelBlock[j]) + { + InvalidateBuffer(bufHdr); /* releases spinlock */ + break; + } + } + if (j >= nforks) + UnlockBufHdr(bufHdr, buf_state); + } +} + +/* --------------------------------------------------------------------- + * DropRelationsAllBuffers + * + * This function removes from the buffer pool all the pages of all + * forks of the specified relations. It's equivalent to calling + * DropRelationBuffers once per fork per relation with firstDelBlock = 0. + * -------------------------------------------------------------------- + */ +void +DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators) +{ + int i; + int n = 0; + SMgrRelation *rels; + BlockNumber (*block)[MAX_FORKNUM + 1]; + uint64 nBlocksToInvalidate = 0; + RelFileLocator *locators; + bool cached = true; + bool use_bsearch; + + if (nlocators == 0) + return; + + rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */ + + /* If it's a local relation, it's localbuf.c's problem. */ + for (i = 0; i < nlocators; i++) + { + if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator)) + { + if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId) + DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator); + } + else + rels[n++] = smgr_reln[i]; + } + + /* + * If there are no non-local relations, then we're done. Release the + * memory and return. + */ + if (n == 0) + { + pfree(rels); + return; + } + + /* + * This is used to remember the number of blocks for all the relations + * forks. + */ + block = (BlockNumber (*)[MAX_FORKNUM + 1]) + palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1)); + + /* + * We can avoid scanning the entire buffer pool if we know the exact size + * of each of the given relation forks. See DropRelationBuffers. + */ + for (i = 0; i < n && cached; i++) + { + for (int j = 0; j <= MAX_FORKNUM; j++) + { + /* Get the number of blocks for a relation's fork. */ + block[i][j] = smgrnblocks_cached(rels[i], j); + + /* We need to only consider the relation forks that exists. */ + if (block[i][j] == InvalidBlockNumber) + { + if (!smgrexists(rels[i], j)) + continue; + cached = false; + break; + } + + /* calculate the total number of blocks to be invalidated */ + nBlocksToInvalidate += block[i][j]; + } + } + + /* + * We apply the optimization iff the total number of blocks to invalidate + * is below the BUF_DROP_FULL_SCAN_THRESHOLD. + */ + if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD) + { + for (i = 0; i < n; i++) + { + for (int j = 0; j <= MAX_FORKNUM; j++) + { + /* ignore relation forks that doesn't exist */ + if (!BlockNumberIsValid(block[i][j])) + continue; + + /* drop all the buffers for a particular relation fork */ + FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator, + j, block[i][j], 0); + } + } + + pfree(block); + pfree(rels); + return; + } + + pfree(block); + locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */ + for (i = 0; i < n; i++) + locators[i] = rels[i]->smgr_rlocator.locator; + + /* + * For low number of relations to drop just use a simple walk through, to + * save the bsearch overhead. 
The threshold to use is rather a guess than + * an exactly determined value, as it depends on many factors (CPU and RAM + * speeds, amount of shared buffers etc.). + */ + use_bsearch = n > RELS_BSEARCH_THRESHOLD; + + /* sort the list of rlocators if necessary */ + if (use_bsearch) + pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator); + + for (i = 0; i < NBuffers; i++) + { + RelFileLocator *rlocator = NULL; + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelationBuffers, an unlocked precheck should be safe and + * saves some cycles. + */ + + if (!use_bsearch) + { + int j; + + for (j = 0; j < n; j++) + { + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j])) + { + rlocator = &locators[j]; + break; + } + } + } + else + { + RelFileLocator locator; + + locator = BufTagGetRelFileLocator(&bufHdr->tag); + rlocator = bsearch((const void *) &(locator), + locators, n, sizeof(RelFileLocator), + rlocator_comparator); + } + + /* buffer doesn't belong to any of the given relfilelocators; skip it */ + if (rlocator == NULL) + continue; + + buf_state = LockBufHdr(bufHdr); + if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator)) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); + } + + pfree(locators); + pfree(rels); +} + +/* --------------------------------------------------------------------- + * FindAndDropRelationBuffers + * + * This function performs look up in BufMapping table and removes from the + * buffer pool all the pages of the specified relation fork that has block + * number >= firstDelBlock. (In particular, with firstDelBlock = 0, all + * pages are removed.) + * -------------------------------------------------------------------- + */ +static void +FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum, + BlockNumber nForkBlock, + BlockNumber firstDelBlock) +{ + BlockNumber curBlock; + + for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++) + { + uint32 bufHash; /* hash value for tag */ + BufferTag bufTag; /* identity of requested block */ + LWLock *bufPartitionLock; /* buffer partition lock for it */ + int buf_id; + BufferDesc *bufHdr; + uint32 buf_state; + + /* create a tag so we can lookup the buffer */ + InitBufferTag(&bufTag, &rlocator, forkNum, curBlock); + + /* determine its hash code and partition lock ID */ + bufHash = BufTableHashCode(&bufTag); + bufPartitionLock = BufMappingPartitionLock(bufHash); + + /* Check that it is in the buffer pool. If not, do nothing. */ + LWLockAcquire(bufPartitionLock, LW_SHARED); + buf_id = BufTableLookup(&bufTag, bufHash); + LWLockRelease(bufPartitionLock); + + if (buf_id < 0) + continue; + + bufHdr = GetBufferDescriptor(buf_id); + + /* + * We need to lock the buffer header and recheck if the buffer is + * still associated with the same block because the buffer could be + * evicted by some other backend loading blocks for a different + * relation after we release lock on the BufMapping table. + */ + buf_state = LockBufHdr(bufHdr); + + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum && + bufHdr->tag.blockNum >= firstDelBlock) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); + } +} + +/* --------------------------------------------------------------------- + * DropDatabaseBuffers + * + * This function removes all the buffers in the buffer cache for a + * particular database. 
Dirty pages are simply dropped, without + * bothering to write them out first. This is used when we destroy a + * database, to avoid trying to flush data to disk when the directory + * tree no longer exists. Implementation is pretty similar to + * DropRelationBuffers() which is for destroying just one relation. + * -------------------------------------------------------------------- + */ +void +DropDatabaseBuffers(Oid dbid) +{ + int i; + + /* + * We needn't consider local buffers, since by assumption the target + * database isn't our own. + */ + + for (i = 0; i < NBuffers; i++) + { + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelationBuffers, an unlocked precheck should be safe and + * saves some cycles. + */ + if (bufHdr->tag.dbOid != dbid) + continue; + + buf_state = LockBufHdr(bufHdr); + if (bufHdr->tag.dbOid == dbid) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); + } +} + +/* ----------------------------------------------------------------- + * PrintBufferDescs + * + * this function prints all the buffer descriptors, for debugging + * use only. + * ----------------------------------------------------------------- + */ +#ifdef NOT_USED +void +PrintBufferDescs(void) +{ + int i; + + for (i = 0; i < NBuffers; ++i) + { + BufferDesc *buf = GetBufferDescriptor(i); + Buffer b = BufferDescriptorGetBuffer(buf); + + /* theoretically we should lock the bufhdr here */ + elog(LOG, + "[%02d] (freeNext=%d, rel=%s, " + "blockNum=%u, flags=0x%x, refcount=%u %d)", + i, buf->freeNext, + relpathbackend(BufTagGetRelFileLocator(&buf->tag), + InvalidBackendId, BufTagGetForkNum(&buf->tag)), + buf->tag.blockNum, buf->flags, + buf->refcount, GetPrivateRefCount(b)); + } +} +#endif + +#ifdef NOT_USED +void +PrintPinnedBufs(void) +{ + int i; + + for (i = 0; i < NBuffers; ++i) + { + BufferDesc *buf = GetBufferDescriptor(i); + Buffer b = BufferDescriptorGetBuffer(buf); + + if (GetPrivateRefCount(b) > 0) + { + /* theoretically we should lock the bufhdr here */ + elog(LOG, + "[%02d] (freeNext=%d, rel=%s, " + "blockNum=%u, flags=0x%x, refcount=%u %d)", + i, buf->freeNext, + relpathperm(BufTagGetRelFileLocator(&buf->tag), + BufTagGetForkNum(&buf->tag)), + buf->tag.blockNum, buf->flags, + buf->refcount, GetPrivateRefCount(b)); + } + } +} +#endif + +/* --------------------------------------------------------------------- + * FlushRelationBuffers + * + * This function writes all dirty pages of a relation out to disk + * (or more accurately, out to kernel disk buffers), ensuring that the + * kernel has an up-to-date view of the relation. + * + * Generally, the caller should be holding AccessExclusiveLock on the + * target relation to ensure that no other backend is busy dirtying + * more blocks of the relation; the effects can't be expected to last + * after the lock is released. + * + * XXX currently it sequentially searches the buffer pool, should be + * changed to more clever ways of searching. This routine is not + * used in any performance-critical code paths, so it's not worth + * adding additional overhead to normal paths to make it go faster. 
+ * -------------------------------------------------------------------- + */ +void +FlushRelationBuffers(Relation rel) +{ + int i; + BufferDesc *bufHdr; + + if (RelationUsesLocalBuffers(rel)) + { + for (i = 0; i < NLocBuffer; i++) + { + uint32 buf_state; + instr_time io_start; + + bufHdr = GetLocalBufferDescriptor(i); + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) && + ((buf_state = pg_atomic_read_u32(&bufHdr->state)) & + (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + ErrorContextCallback errcallback; + Page localpage; + + localpage = (char *) LocalBufHdrGetBlock(bufHdr); + + /* Setup error traceback support for ereport() */ + errcallback.callback = local_buffer_write_error_callback; + errcallback.arg = (void *) bufHdr; + errcallback.previous = error_context_stack; + error_context_stack = &errcallback; + + PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + + io_start = pgstat_prepare_io_time(); + + smgrwrite(RelationGetSmgr(rel), + BufTagGetForkNum(&bufHdr->tag), + bufHdr->tag.blockNum, + localpage, + false); + + pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, + IOCONTEXT_NORMAL, IOOP_WRITE, + io_start, 1); + + buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED); + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + + pgBufferUsage.local_blks_written++; + + /* Pop the error context stack */ + error_context_stack = errcallback.previous; + } + } + + return; + } + + /* Make sure we can handle the pin inside the loop */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + uint32 buf_state; + + bufHdr = GetBufferDescriptor(i); + + /* + * As in DropRelationBuffers, an unlocked precheck should be safe and + * saves some cycles. + */ + if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator)) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) && + (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } +} + +/* --------------------------------------------------------------------- + * FlushRelationsAllBuffers + * + * This function flushes out of the buffer pool all the pages of all + * forks of the specified smgr relations. It's equivalent to calling + * FlushRelationBuffers once per relation. The relations are assumed not + * to use local buffers. + * -------------------------------------------------------------------- + */ +void +FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels) +{ + int i; + SMgrSortArray *srels; + bool use_bsearch; + + if (nrels == 0) + return; + + /* fill-in array for qsort */ + srels = palloc(sizeof(SMgrSortArray) * nrels); + + for (i = 0; i < nrels; i++) + { + Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator)); + + srels[i].rlocator = smgrs[i]->smgr_rlocator.locator; + srels[i].srel = smgrs[i]; + } + + /* + * Save the bsearch overhead for low number of relations to sync. See + * DropRelationsAllBuffers for details. 
+ */ + use_bsearch = nrels > RELS_BSEARCH_THRESHOLD; + + /* sort the list of SMgrRelations if necessary */ + if (use_bsearch) + pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator); + + /* Make sure we can handle the pin inside the loop */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + SMgrSortArray *srelent = NULL; + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelationBuffers, an unlocked precheck should be safe and + * saves some cycles. + */ + + if (!use_bsearch) + { + int j; + + for (j = 0; j < nrels; j++) + { + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator)) + { + srelent = &srels[j]; + break; + } + } + } + else + { + RelFileLocator rlocator; + + rlocator = BufTagGetRelFileLocator(&bufHdr->tag); + srelent = bsearch((const void *) &(rlocator), + srels, nrels, sizeof(SMgrSortArray), + rlocator_comparator); + } + + /* buffer doesn't belong to any of the given relfilelocators; skip it */ + if (srelent == NULL) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) && + (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } + + pfree(srels); +} + +/* --------------------------------------------------------------------- + * RelationCopyStorageUsingBuffer + * + * Copy fork's data using bufmgr. Same as RelationCopyStorage but instead + * of using smgrread and smgrextend this will copy using bufmgr APIs. + * + * Refer comments atop CreateAndCopyRelationData() for details about + * 'permanent' parameter. + * -------------------------------------------------------------------- + */ +static void +RelationCopyStorageUsingBuffer(RelFileLocator srclocator, + RelFileLocator dstlocator, + ForkNumber forkNum, bool permanent) +{ + Buffer srcBuf; + Buffer dstBuf; + Page srcPage; + Page dstPage; + bool use_wal; + BlockNumber nblocks; + BlockNumber blkno; + PGIOAlignedBlock buf; + BufferAccessStrategy bstrategy_src; + BufferAccessStrategy bstrategy_dst; + + /* + * In general, we want to write WAL whenever wal_level > 'minimal', but we + * can skip it when copying any fork of an unlogged relation other than + * the init fork. + */ + use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM); + + /* Get number of blocks in the source relation. */ + nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId), + forkNum); + + /* Nothing to copy; just return. */ + if (nblocks == 0) + return; + + /* + * Bulk extend the destination relation of the same size as the source + * relation before starting to copy block by block. + */ + memset(buf.data, 0, BLCKSZ); + smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1, + buf.data, true); + + /* This is a bulk operation, so use buffer access strategies. */ + bstrategy_src = GetAccessStrategy(BAS_BULKREAD); + bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE); + + /* Iterate over each block of the source relation file. */ + for (blkno = 0; blkno < nblocks; blkno++) + { + CHECK_FOR_INTERRUPTS(); + + /* Read block from source relation. 
*/ + srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno, + RBM_NORMAL, bstrategy_src, + permanent); + LockBuffer(srcBuf, BUFFER_LOCK_SHARE); + srcPage = BufferGetPage(srcBuf); + + dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno, + RBM_ZERO_AND_LOCK, bstrategy_dst, + permanent); + dstPage = BufferGetPage(dstBuf); + + START_CRIT_SECTION(); + + /* Copy page data from the source to the destination. */ + memcpy(dstPage, srcPage, BLCKSZ); + MarkBufferDirty(dstBuf); + + /* WAL-log the copied page. */ + if (use_wal) + log_newpage_buffer(dstBuf, true); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(dstBuf); + UnlockReleaseBuffer(srcBuf); + } + + FreeAccessStrategy(bstrategy_src); + FreeAccessStrategy(bstrategy_dst); +} + +/* --------------------------------------------------------------------- + * CreateAndCopyRelationData + * + * Create destination relation storage and copy all forks from the + * source relation to the destination. + * + * Pass permanent as true for permanent relations and false for + * unlogged relations. Currently this API is not supported for + * temporary relations. + * -------------------------------------------------------------------- + */ +void +CreateAndCopyRelationData(RelFileLocator src_rlocator, + RelFileLocator dst_rlocator, bool permanent) +{ + RelFileLocatorBackend rlocator; + char relpersistence; + + /* Set the relpersistence. */ + relpersistence = permanent ? + RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED; + + /* + * Create and copy all forks of the relation. During create database we + * have a separate cleanup mechanism which deletes complete database + * directory. Therefore, each individual relation doesn't need to be + * registered for cleanup. + */ + RelationCreateStorage(dst_rlocator, relpersistence, false); + + /* copy main fork. */ + RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM, + permanent); + + /* copy those extra forks that exist */ + for (ForkNumber forkNum = MAIN_FORKNUM + 1; + forkNum <= MAX_FORKNUM; forkNum++) + { + if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum)) + { + smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false); + + /* + * WAL log creation if the relation is persistent, or this is the + * init fork of an unlogged relation. + */ + if (permanent || forkNum == INIT_FORKNUM) + log_smgrcreate(&dst_rlocator, forkNum); + + /* Copy a fork's data, block by block. */ + RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum, + permanent); + } + } + + /* close source and destination smgr if exists. */ + rlocator.backend = InvalidBackendId; + + rlocator.locator = src_rlocator; + smgrcloserellocator(rlocator); + + rlocator.locator = dst_rlocator; + smgrcloserellocator(rlocator); +} + +/* --------------------------------------------------------------------- + * FlushDatabaseBuffers + * + * This function writes all dirty pages of a database out to disk + * (or more accurately, out to kernel disk buffers), ensuring that the + * kernel has an up-to-date view of the database. + * + * Generally, the caller should be holding an appropriate lock to ensure + * no other backend is active in the target database; otherwise more + * pages could get dirtied. + * + * Note we don't worry about flushing any pages of temporary relations. + * It's assumed these wouldn't be interesting. 
+ * -------------------------------------------------------------------- + */ +void +FlushDatabaseBuffers(Oid dbid) +{ + int i; + BufferDesc *bufHdr; + + /* Make sure we can handle the pin inside the loop */ + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + for (i = 0; i < NBuffers; i++) + { + uint32 buf_state; + + bufHdr = GetBufferDescriptor(i); + + /* + * As in DropRelationBuffers, an unlocked precheck should be safe and + * saves some cycles. + */ + if (bufHdr->tag.dbOid != dbid) + continue; + + ReservePrivateRefCountEntry(); + + buf_state = LockBufHdr(bufHdr); + if (bufHdr->tag.dbOid == dbid && + (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY)) + { + PinBuffer_Locked(bufHdr); + LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED); + FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); + LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + UnpinBuffer(bufHdr); + } + else + UnlockBufHdr(bufHdr, buf_state); + } +} + +/* + * Flush a previously, shared or exclusively, locked and pinned buffer to the + * OS. + */ +void +FlushOneBuffer(Buffer buffer) +{ + BufferDesc *bufHdr; + + /* currently not needed, but no fundamental reason not to support */ + Assert(!BufferIsLocal(buffer)); + + Assert(BufferIsPinned(buffer)); + + bufHdr = GetBufferDescriptor(buffer - 1); + + Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + + FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL); +} + +/* + * ReleaseBuffer -- release the pin on a buffer + */ +void +ReleaseBuffer(Buffer buffer) +{ + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + UnpinLocalBuffer(buffer); + else + UnpinBuffer(GetBufferDescriptor(buffer - 1)); +} + +/* + * UnlockReleaseBuffer -- release the content lock and pin on a buffer + * + * This is just a shorthand for a common combination. + */ +void +UnlockReleaseBuffer(Buffer buffer) +{ + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); +} + +/* + * IncrBufferRefCount + * Increment the pin count on a buffer that we have *already* pinned + * at least once. + * + * This function cannot be used on a buffer we do not have pinned, + * because it doesn't change the shared buffer state. + */ +void +IncrBufferRefCount(Buffer buffer) +{ + Assert(BufferIsPinned(buffer)); + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + if (BufferIsLocal(buffer)) + LocalRefCount[-buffer - 1]++; + else + { + PrivateRefCountEntry *ref; + + ref = GetPrivateRefCountEntry(buffer, true); + Assert(ref != NULL); + ref->refcount++; + } + ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer); +} + +/* + * MarkBufferDirtyHint + * + * Mark a buffer dirty for non-critical changes. + * + * This is essentially the same as MarkBufferDirty, except: + * + * 1. The caller does not write WAL; so if checksums are enabled, we may need + * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages. + * 2. The caller might have only share-lock instead of exclusive-lock on the + * buffer's content lock. + * 3. This function does not guarantee that the buffer is always marked dirty + * (due to a race condition), so it cannot be used for important changes. 
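+ *
+ * Typical uses are "hint" updates whose loss is harmless, such as setting
+ * tuple visibility hint bits or marking index entries dead.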
+ */ +void +MarkBufferDirtyHint(Buffer buffer, bool buffer_std) +{ + BufferDesc *bufHdr; + Page page = BufferGetPage(buffer); + + if (!BufferIsValid(buffer)) + elog(ERROR, "bad buffer ID: %d", buffer); + + if (BufferIsLocal(buffer)) + { + MarkLocalBufferDirty(buffer); + return; + } + + bufHdr = GetBufferDescriptor(buffer - 1); + + Assert(GetPrivateRefCount(buffer) > 0); + /* here, either share or exclusive lock is OK */ + Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr))); + + /* + * This routine might get called many times on the same page, if we are + * making the first scan after commit of an xact that added/deleted many + * tuples. So, be as quick as we can if the buffer is already dirty. We + * do this by not acquiring spinlock if it looks like the status bits are + * already set. Since we make this test unlocked, there's a chance we + * might fail to notice that the flags have just been cleared, and failed + * to reset them, due to memory-ordering issues. But since this function + * is only intended to be used in cases where failing to write out the + * data would be harmless anyway, it doesn't really matter. + */ + if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) != + (BM_DIRTY | BM_JUST_DIRTIED)) + { + XLogRecPtr lsn = InvalidXLogRecPtr; + bool dirtied = false; + bool delayChkptFlags = false; + uint32 buf_state; + + /* + * If we need to protect hint bit updates from torn writes, WAL-log a + * full page image of the page. This full page image is only necessary + * if the hint bit update is the first change to the page since the + * last checkpoint. + * + * We don't check full_page_writes here because that logic is included + * when we call XLogInsert() since the value changes dynamically. + */ + if (XLogHintBitIsNeeded() && + (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT)) + { + /* + * If we must not write WAL, due to a relfilelocator-specific + * condition or being in recovery, don't dirty the page. We can + * set the hint, just not dirty the page as a result so the hint + * is lost when we evict the page or shutdown. + * + * See src/backend/storage/page/README for longer discussion. + */ + if (RecoveryInProgress() || + RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag))) + return; + + /* + * If the block is already dirty because we either made a change + * or set a hint already, then we don't need to write a full page + * image. Note that aggressive cleaning of blocks dirtied by hint + * bit setting would increase the call rate. Bulk setting of hint + * bits would reduce the call rate... + * + * We must issue the WAL record before we mark the buffer dirty. + * Otherwise we might write the page before we write the WAL. That + * causes a race condition, since a checkpoint might occur between + * writing the WAL record and marking the buffer dirty. We solve + * that with a kluge, but one that is already in use during + * transaction commit to prevent race conditions. Basically, we + * simply prevent the checkpoint WAL record from being written + * until we have marked the buffer dirty. We don't start the + * checkpoint flush until we have marked dirty, so our checkpoint + * must flush the change to disk successfully or the checkpoint + * never gets written, so crash recovery will fix. + * + * It's possible we may enter here without an xid, so it is + * essential that CreateCheckPoint waits for virtual transactions + * rather than full transactionids. 
+ */ + Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0); + MyProc->delayChkptFlags |= DELAY_CHKPT_START; + delayChkptFlags = true; + lsn = XLogSaveBufferForHint(buffer, buffer_std); + } + + buf_state = LockBufHdr(bufHdr); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + + if (!(buf_state & BM_DIRTY)) + { + dirtied = true; /* Means "will be dirtied by this action" */ + + /* + * Set the page LSN if we wrote a backup block. We aren't supposed + * to set this when only holding a share lock but as long as we + * serialise it somehow we're OK. We choose to set LSN while + * holding the buffer header lock, which causes any reader of an + * LSN who holds only a share lock to also obtain a buffer header + * lock before using PageGetLSN(), which is enforced in + * BufferGetLSNAtomic(). + * + * If checksums are enabled, you might think we should reset the + * checksum here. That will happen when the page is written + * sometime later in this checkpoint cycle. + */ + if (!XLogRecPtrIsInvalid(lsn)) + PageSetLSN(page, lsn); + } + + buf_state |= BM_DIRTY | BM_JUST_DIRTIED; + UnlockBufHdr(bufHdr, buf_state); + + if (delayChkptFlags) + MyProc->delayChkptFlags &= ~DELAY_CHKPT_START; + + if (dirtied) + { + VacuumPageDirty++; + pgBufferUsage.shared_blks_dirtied++; + if (VacuumCostActive) + VacuumCostBalance += VacuumCostPageDirty; + } + } +} + +/* + * Release buffer content locks for shared buffers. + * + * Used to clean up after errors. + * + * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care + * of releasing buffer content locks per se; the only thing we need to deal + * with here is clearing any PIN_COUNT request that was in progress. + */ +void +UnlockBuffers(void) +{ + BufferDesc *buf = PinCountWaitBuf; + + if (buf) + { + uint32 buf_state; + + buf_state = LockBufHdr(buf); + + /* + * Don't complain if flag bit not set; it could have been reset but we + * got a cancel/die interrupt before getting the signal. + */ + if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && + buf->wait_backend_pgprocno == MyProc->pgprocno) + buf_state &= ~BM_PIN_COUNT_WAITER; + + UnlockBufHdr(buf, buf_state); + + PinCountWaitBuf = NULL; + } +} + +/* + * Acquire or release the content_lock for the buffer. + */ +void +LockBuffer(Buffer buffer, int mode) +{ + BufferDesc *buf; + + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + return; /* local buffers need no lock */ + + buf = GetBufferDescriptor(buffer - 1); + + if (mode == BUFFER_LOCK_UNLOCK) + LWLockRelease(BufferDescriptorGetContentLock(buf)); + else if (mode == BUFFER_LOCK_SHARE) + LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED); + else if (mode == BUFFER_LOCK_EXCLUSIVE) + LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE); + else + elog(ERROR, "unrecognized buffer lock mode: %d", mode); +} + +/* + * Acquire the content_lock for the buffer, but only if we don't have to wait. + * + * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode. + */ +bool +ConditionalLockBuffer(Buffer buffer) +{ + BufferDesc *buf; + + Assert(BufferIsPinned(buffer)); + if (BufferIsLocal(buffer)) + return true; /* act as though we got it */ + + buf = GetBufferDescriptor(buffer - 1); + + return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf), + LW_EXCLUSIVE); +} + +/* + * Verify that this backend is pinning the buffer exactly once. + * + * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend + * holds a pin on the buffer. We do not care whether some other backend does. 
+ */ +void +CheckBufferIsPinnedOnce(Buffer buffer) +{ + if (BufferIsLocal(buffer)) + { + if (LocalRefCount[-buffer - 1] != 1) + elog(ERROR, "incorrect local pin count: %d", + LocalRefCount[-buffer - 1]); + } + else + { + if (GetPrivateRefCount(buffer) != 1) + elog(ERROR, "incorrect local pin count: %d", + GetPrivateRefCount(buffer)); + } +} + +/* + * LockBufferForCleanup - lock a buffer in preparation for deleting items + * + * Items may be deleted from a disk page only when the caller (a) holds an + * exclusive lock on the buffer and (b) has observed that no other backend + * holds a pin on the buffer. If there is a pin, then the other backend + * might have a pointer into the buffer (for example, a heapscan reference + * to an item --- see README for more details). It's OK if a pin is added + * after the cleanup starts, however; the newly-arrived backend will be + * unable to look at the page until we release the exclusive lock. + * + * To implement this protocol, a would-be deleter must pin the buffer and + * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to + * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until + * it has successfully observed pin count = 1. + */ +void +LockBufferForCleanup(Buffer buffer) +{ + BufferDesc *bufHdr; + TimestampTz waitStart = 0; + bool waiting = false; + bool logged_recovery_conflict = false; + + Assert(BufferIsPinned(buffer)); + Assert(PinCountWaitBuf == NULL); + + CheckBufferIsPinnedOnce(buffer); + + /* Nobody else to wait for */ + if (BufferIsLocal(buffer)) + return; + + bufHdr = GetBufferDescriptor(buffer - 1); + + for (;;) + { + uint32 buf_state; + + /* Try to acquire lock */ + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + buf_state = LockBufHdr(bufHdr); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) + { + /* Successfully acquired exclusive lock with pincount 1 */ + UnlockBufHdr(bufHdr, buf_state); + + /* + * Emit the log message if recovery conflict on buffer pin was + * resolved but the startup process waited longer than + * deadlock_timeout for it. + */ + if (logged_recovery_conflict) + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + waitStart, GetCurrentTimestamp(), + NULL, false); + + if (waiting) + { + /* reset ps display to remove the suffix if we added one */ + set_ps_display_remove_suffix(); + waiting = false; + } + return; + } + /* Failed, so mark myself as waiting for pincount 1 */ + if (buf_state & BM_PIN_COUNT_WAITER) + { + UnlockBufHdr(bufHdr, buf_state); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + elog(ERROR, "multiple backends attempting to wait for pincount 1"); + } + bufHdr->wait_backend_pgprocno = MyProc->pgprocno; + PinCountWaitBuf = bufHdr; + buf_state |= BM_PIN_COUNT_WAITER; + UnlockBufHdr(bufHdr, buf_state); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + + /* Wait to be signaled by UnpinBuffer() */ + if (InHotStandby) + { + if (!waiting) + { + /* adjust the process title to indicate that it's waiting */ + set_ps_display_suffix("waiting"); + waiting = true; + } + + /* + * Emit the log message if the startup process is waiting longer + * than deadlock_timeout for recovery conflict on buffer pin. + * + * Skip this if first time through because the startup process has + * not started waiting yet in this case. So, the wait start + * timestamp is set after this logic. 
+ */ + if (waitStart != 0 && !logged_recovery_conflict) + { + TimestampTz now = GetCurrentTimestamp(); + + if (TimestampDifferenceExceeds(waitStart, now, + DeadlockTimeout)) + { + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN, + waitStart, now, NULL, true); + logged_recovery_conflict = true; + } + } + + /* + * Set the wait start timestamp if logging is enabled and first + * time through. + */ + if (log_recovery_conflict_waits && waitStart == 0) + waitStart = GetCurrentTimestamp(); + + /* Publish the bufid that Startup process waits on */ + SetStartupBufferPinWaitBufId(buffer - 1); + /* Set alarm and then wait to be signaled by UnpinBuffer() */ + ResolveRecoveryConflictWithBufferPin(); + /* Reset the published bufid */ + SetStartupBufferPinWaitBufId(-1); + } + else + ProcWaitForSignal(PG_WAIT_BUFFER_PIN); + + /* + * Remove flag marking us as waiter. Normally this will not be set + * anymore, but ProcWaitForSignal() can return for other signals as + * well. We take care to only reset the flag if we're the waiter, as + * theoretically another backend could have started waiting. That's + * impossible with the current usages due to table level locking, but + * better be safe. + */ + buf_state = LockBufHdr(bufHdr); + if ((buf_state & BM_PIN_COUNT_WAITER) != 0 && + bufHdr->wait_backend_pgprocno == MyProc->pgprocno) + buf_state &= ~BM_PIN_COUNT_WAITER; + UnlockBufHdr(bufHdr, buf_state); + + PinCountWaitBuf = NULL; + /* Loop back and try again */ + } +} + +/* + * Check called from RecoveryConflictInterrupt handler when Startup + * process requests cancellation of all pin holders that are blocking it. + */ +bool +HoldingBufferPinThatDelaysRecovery(void) +{ + int bufid = GetStartupBufferPinWaitBufId(); + + /* + * If we get woken slowly then it's possible that the Startup process was + * already woken by other backends before we got here. Also possible that + * we get here by multiple interrupts or interrupts at inappropriate + * times, so make sure we do nothing if the bufid is not set. + */ + if (bufid < 0) + return false; + + if (GetPrivateRefCount(bufid + 1) > 0) + return true; + + return false; +} + +/* + * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock + * + * We won't loop, but just check once to see if the pin count is OK. If + * not, return false with no lock held. + */ +bool +ConditionalLockBufferForCleanup(Buffer buffer) +{ + BufferDesc *bufHdr; + uint32 buf_state, + refcount; + + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + { + refcount = LocalRefCount[-buffer - 1]; + /* There should be exactly one pin */ + Assert(refcount > 0); + if (refcount != 1) + return false; + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + refcount = GetPrivateRefCount(buffer); + Assert(refcount); + if (refcount != 1) + return false; + + /* Try to acquire lock */ + if (!ConditionalLockBuffer(buffer)) + return false; + + bufHdr = GetBufferDescriptor(buffer - 1); + buf_state = LockBufHdr(bufHdr); + refcount = BUF_STATE_GET_REFCOUNT(buf_state); + + Assert(refcount > 0); + if (refcount == 1) + { + /* Successfully acquired exclusive lock with pincount 1 */ + UnlockBufHdr(bufHdr, buf_state); + return true; + } + + /* Failed, so release the lock */ + UnlockBufHdr(bufHdr, buf_state); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + return false; +} + +/* + * IsBufferCleanupOK - as above, but we already have the lock + * + * Check whether it's OK to perform cleanup on a buffer we've already + * locked. 
If we observe that the pin count is 1, our exclusive lock + * happens to be a cleanup lock, and we can proceed with anything that + * would have been allowable had we sought a cleanup lock originally. + */ +bool +IsBufferCleanupOK(Buffer buffer) +{ + BufferDesc *bufHdr; + uint32 buf_state; + + Assert(BufferIsValid(buffer)); + + if (BufferIsLocal(buffer)) + { + /* There should be exactly one pin */ + if (LocalRefCount[-buffer - 1] != 1) + return false; + /* Nobody else to wait for */ + return true; + } + + /* There should be exactly one local pin */ + if (GetPrivateRefCount(buffer) != 1) + return false; + + bufHdr = GetBufferDescriptor(buffer - 1); + + /* caller must hold exclusive lock on buffer */ + Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr), + LW_EXCLUSIVE)); + + buf_state = LockBufHdr(bufHdr); + + Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0); + if (BUF_STATE_GET_REFCOUNT(buf_state) == 1) + { + /* pincount is OK. */ + UnlockBufHdr(bufHdr, buf_state); + return true; + } + + UnlockBufHdr(bufHdr, buf_state); + return false; +} + + +/* + * Functions for buffer I/O handling + * + * Note: We assume that nested buffer I/O never occurs. + * i.e at most one BM_IO_IN_PROGRESS bit is set per proc. + * + * Also note that these are used only for shared buffers, not local ones. + */ + +/* + * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared. + */ +static void +WaitIO(BufferDesc *buf) +{ + ConditionVariable *cv = BufferDescriptorGetIOCV(buf); + + ConditionVariablePrepareToSleep(cv); + for (;;) + { + uint32 buf_state; + + /* + * It may not be necessary to acquire the spinlock to check the flag + * here, but since this test is essential for correctness, we'd better + * play it safe. + */ + buf_state = LockBufHdr(buf); + UnlockBufHdr(buf, buf_state); + + if (!(buf_state & BM_IO_IN_PROGRESS)) + break; + ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO); + } + ConditionVariableCancelSleep(); +} + +/* + * StartBufferIO: begin I/O on this buffer + * (Assumptions) + * My process is executing no IO + * The buffer is Pinned + * + * In some scenarios there are race conditions in which multiple backends + * could attempt the same I/O operation concurrently. If someone else + * has already started I/O on this buffer then we will block on the + * I/O condition variable until he's done. + * + * Input operations are only attempted on buffers that are not BM_VALID, + * and output operations only on buffers that are BM_VALID and BM_DIRTY, + * so we can always tell if the work is already done. + * + * Returns true if we successfully marked the buffer as I/O busy, + * false if someone else already did the work. + */ +static bool +StartBufferIO(BufferDesc *buf, bool forInput) +{ + uint32 buf_state; + + ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner); + + for (;;) + { + buf_state = LockBufHdr(buf); + + if (!(buf_state & BM_IO_IN_PROGRESS)) + break; + UnlockBufHdr(buf, buf_state); + WaitIO(buf); + } + + /* Once we get here, there is definitely no I/O active on this buffer */ + + if (forInput ? 
(buf_state & BM_VALID) : !(buf_state & BM_DIRTY)) + { + /* someone else already did the I/O */ + UnlockBufHdr(buf, buf_state); + return false; + } + + buf_state |= BM_IO_IN_PROGRESS; + UnlockBufHdr(buf, buf_state); + + ResourceOwnerRememberBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); + + return true; +} + +/* + * TerminateBufferIO: release a buffer we were doing I/O on + * (Assumptions) + * My process is executing IO for the buffer + * BM_IO_IN_PROGRESS bit is set for the buffer + * The buffer is Pinned + * + * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the + * buffer's BM_DIRTY flag. This is appropriate when terminating a + * successful write. The check on BM_JUST_DIRTIED is necessary to avoid + * marking the buffer clean if it was re-dirtied while we were writing. + * + * set_flag_bits gets ORed into the buffer's flags. It must include + * BM_IO_ERROR in a failure case. For successful completion it could + * be 0, or BM_VALID if we just finished reading in the page. + */ +static void +TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits) +{ + uint32 buf_state; + + buf_state = LockBufHdr(buf); + + Assert(buf_state & BM_IO_IN_PROGRESS); + + buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR); + if (clear_dirty && !(buf_state & BM_JUST_DIRTIED)) + buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); + + buf_state |= set_flag_bits; + UnlockBufHdr(buf, buf_state); + + ResourceOwnerForgetBufferIO(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf)); + + ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf)); +} + +/* + * AbortBufferIO: Clean up active buffer I/O after an error. + * + * All LWLocks we might have held have been released, + * but we haven't yet released buffer pins, so the buffer is still pinned. + * + * If I/O was in progress, we always set BM_IO_ERROR, even though it's + * possible the error condition wasn't related to the I/O. + */ +void +AbortBufferIO(Buffer buffer) +{ + BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1); + uint32 buf_state; + + buf_state = LockBufHdr(buf_hdr); + Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID)); + + if (!(buf_state & BM_VALID)) + { + Assert(!(buf_state & BM_DIRTY)); + UnlockBufHdr(buf_hdr, buf_state); + } + else + { + Assert(buf_state & BM_DIRTY); + UnlockBufHdr(buf_hdr, buf_state); + + /* Issue notice if this is not the first failure... */ + if (buf_state & BM_IO_ERROR) + { + /* Buffer is pinned, so we can read tag without spinlock */ + char *path; + + path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag), + BufTagGetForkNum(&buf_hdr->tag)); + ereport(WARNING, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not write block %u of %s", + buf_hdr->tag.blockNum, path), + errdetail("Multiple failures --- write error might be permanent."))); + pfree(path); + } + } + + TerminateBufferIO(buf_hdr, false, BM_IO_ERROR); +} + +/* + * Error context callback for errors occurring during shared buffer writes. + */ +static void +shared_buffer_write_error_callback(void *arg) +{ + BufferDesc *bufHdr = (BufferDesc *) arg; + + /* Buffer is pinned, so we can read the tag without locking the spinlock */ + if (bufHdr != NULL) + { + char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag), + BufTagGetForkNum(&bufHdr->tag)); + + errcontext("writing block %u of relation %s", + bufHdr->tag.blockNum, path); + pfree(path); + } +} + +/* + * Error context callback for errors occurring during local buffer writes. 
+ */ +static void +local_buffer_write_error_callback(void *arg) +{ + BufferDesc *bufHdr = (BufferDesc *) arg; + + if (bufHdr != NULL) + { + char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), + MyBackendId, + BufTagGetForkNum(&bufHdr->tag)); + + errcontext("writing block %u of relation %s", + bufHdr->tag.blockNum, path); + pfree(path); + } +} + +/* + * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals. + */ +static int +rlocator_comparator(const void *p1, const void *p2) +{ + RelFileLocator n1 = *(const RelFileLocator *) p1; + RelFileLocator n2 = *(const RelFileLocator *) p2; + + if (n1.relNumber < n2.relNumber) + return -1; + else if (n1.relNumber > n2.relNumber) + return 1; + + if (n1.dbOid < n2.dbOid) + return -1; + else if (n1.dbOid > n2.dbOid) + return 1; + + if (n1.spcOid < n2.spcOid) + return -1; + else if (n1.spcOid > n2.spcOid) + return 1; + else + return 0; +} + +/* + * Lock buffer header - set BM_LOCKED in buffer state. + */ +uint32 +LockBufHdr(BufferDesc *desc) +{ + SpinDelayStatus delayStatus; + uint32 old_buf_state; + + Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc))); + + init_local_spin_delay(&delayStatus); + + while (true) + { + /* set BM_LOCKED flag */ + old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED); + /* if it wasn't set before we're OK */ + if (!(old_buf_state & BM_LOCKED)) + break; + perform_spin_delay(&delayStatus); + } + finish_spin_delay(&delayStatus); + return old_buf_state | BM_LOCKED; +} + +/* + * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's + * state at that point. + * + * Obviously the buffer could be locked by the time the value is returned, so + * this is primarily useful in CAS style loops. + */ +static uint32 +WaitBufHdrUnlocked(BufferDesc *buf) +{ + SpinDelayStatus delayStatus; + uint32 buf_state; + + init_local_spin_delay(&delayStatus); + + buf_state = pg_atomic_read_u32(&buf->state); + + while (buf_state & BM_LOCKED) + { + perform_spin_delay(&delayStatus); + buf_state = pg_atomic_read_u32(&buf->state); + } + + finish_spin_delay(&delayStatus); + + return buf_state; +} + +/* + * BufferTag comparator. + */ +static inline int +buffertag_comparator(const BufferTag *ba, const BufferTag *bb) +{ + int ret; + RelFileLocator rlocatora; + RelFileLocator rlocatorb; + + rlocatora = BufTagGetRelFileLocator(ba); + rlocatorb = BufTagGetRelFileLocator(bb); + + ret = rlocator_comparator(&rlocatora, &rlocatorb); + + if (ret != 0) + return ret; + + if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb)) + return -1; + if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb)) + return 1; + + if (ba->blockNum < bb->blockNum) + return -1; + if (ba->blockNum > bb->blockNum) + return 1; + + return 0; +} + +/* + * Comparator determining the writeout order in a checkpoint. + * + * It is important that tablespaces are compared first, the logic balancing + * writes between tablespaces relies on it. 
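+ *
+ * For illustration only (values are made up), items end up ordered like
+ *     (tsId 1663, relNumber 16384, fork MAIN, block 7)
+ *     (tsId 1663, relNumber 16385, fork MAIN, block 0)
+ *     (tsId 1664, relNumber 16384, fork MAIN, block 2)
+ * i.e. all pages of one tablespace are grouped together and ordered by
+ * relation, fork and block number within it.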
+ */
+static inline int
+ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
+{
+    /* compare tablespace */
+    if (a->tsId < b->tsId)
+        return -1;
+    else if (a->tsId > b->tsId)
+        return 1;
+    /* compare relation */
+    if (a->relNumber < b->relNumber)
+        return -1;
+    else if (a->relNumber > b->relNumber)
+        return 1;
+    /* compare fork */
+    else if (a->forkNum < b->forkNum)
+        return -1;
+    else if (a->forkNum > b->forkNum)
+        return 1;
+    /* compare block number */
+    else if (a->blockNum < b->blockNum)
+        return -1;
+    else if (a->blockNum > b->blockNum)
+        return 1;
+    /* equal page IDs are unlikely, but not impossible */
+    return 0;
+}
+
+/*
+ * Comparator for a Min-Heap over the per-tablespace checkpoint completion
+ * progress.
+ */
+static int
+ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
+{
+    CkptTsStatus *sa = (CkptTsStatus *) a;
+    CkptTsStatus *sb = (CkptTsStatus *) b;
+
+    /* we want a min-heap, so return 1 for a < b */
+    if (sa->progress < sb->progress)
+        return 1;
+    else if (sa->progress == sb->progress)
+        return 0;
+    else
+        return -1;
+}
+
+/*
+ * Initialize a writeback context, discarding potential previous state.
+ *
+ * *max_pending is a pointer instead of an immediate value, so the coalesce
+ * limits can easily be changed by the GUC mechanism, and so calling code does
+ * not have to check the current configuration. A value of 0 means that no
+ * writeback control will be performed.
+ */
+void
+WritebackContextInit(WritebackContext *context, int *max_pending)
+{
+    Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+    context->max_pending = max_pending;
+    context->nr_pending = 0;
+}
+
+/*
+ * Add buffer to list of pending writeback requests.
+ */
+void
+ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
+                              BufferTag *tag)
+{
+    PendingWriteback *pending;
+
+    if (io_direct_flags & IO_DIRECT_DATA)
+        return;
+
+    /*
+     * Add buffer to the pending writeback array, unless writeback control is
+     * disabled.
+     */
+    if (*wb_context->max_pending > 0)
+    {
+        Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+        pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
+
+        pending->tag = *tag;
+    }
+
+    /*
+     * Perform pending flushes if the writeback limit is exceeded. This
+     * includes the case where previously an item has been added, but control
+     * is now disabled.
+     */
+    if (wb_context->nr_pending >= *wb_context->max_pending)
+        IssuePendingWritebacks(wb_context, io_context);
+}
+
+#define ST_SORT sort_pending_writebacks
+#define ST_ELEMENT_TYPE PendingWriteback
+#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include <lib/sort_template.h>
+
+/*
+ * Issue all pending writeback requests, previously scheduled with
+ * ScheduleBufferTagForWriteback, to the OS.
+ *
+ * Because this is only used to improve the OS's I/O scheduling, we try never
+ * to error out - it's just a hint.
+ */
+void
+IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
+{
+    instr_time io_start;
+    int i;
+
+    if (wb_context->nr_pending == 0)
+        return;
+
+    /*
+     * Executing the writes in-order can make them a lot faster, and allows us
+     * to merge writeback requests to consecutive blocks into larger
+     * writebacks.
+     */
+    sort_pending_writebacks(wb_context->pending_writebacks,
+                            wb_context->nr_pending);
+
+    io_start = pgstat_prepare_io_time();
+
+    /*
+     * Coalesce neighbouring writes, but nothing else.
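+     * (Illustrative example: after sorting, pending requests for blocks 10,
+     * 11, 12 and 40 of the same relation fork are issued as one
+     * smgrwriteback() call covering blocks 10-12 and a second call for block
+     * 40 alone.)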
For that we iterate + * through the, now sorted, array of pending flushes, and look forward to + * find all neighbouring (or identical) writes. + */ + for (i = 0; i < wb_context->nr_pending; i++) + { + PendingWriteback *cur; + PendingWriteback *next; + SMgrRelation reln; + int ahead; + BufferTag tag; + RelFileLocator currlocator; + Size nblocks = 1; + + cur = &wb_context->pending_writebacks[i]; + tag = cur->tag; + currlocator = BufTagGetRelFileLocator(&tag); + + /* + * Peek ahead, into following writeback requests, to see if they can + * be combined with the current one. + */ + for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++) + { + + next = &wb_context->pending_writebacks[i + ahead + 1]; + + /* different file, stop */ + if (!RelFileLocatorEquals(currlocator, + BufTagGetRelFileLocator(&next->tag)) || + BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag)) + break; + + /* ok, block queued twice, skip */ + if (cur->tag.blockNum == next->tag.blockNum) + continue; + + /* only merge consecutive writes */ + if (cur->tag.blockNum + 1 != next->tag.blockNum) + break; + + nblocks++; + cur = next; + } + + i += ahead; + + /* and finally tell the kernel to write the data to storage */ + reln = smgropen(currlocator, InvalidBackendId); + smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks); + } + + /* + * Assume that writeback requests are only issued for buffers containing + * blocks of permanent relations. + */ + pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, + IOOP_WRITEBACK, io_start, wb_context->nr_pending); + + wb_context->nr_pending = 0; +} + + +/* + * Implement slower/larger portions of TestForOldSnapshot + * + * Smaller/faster portions are put inline, but the entire set of logic is too + * big for that. + */ +void +TestForOldSnapshot_impl(Snapshot snapshot, Relation relation) +{ + if (RelationAllowsEarlyPruning(relation) + && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp()) + ereport(ERROR, + (errcode(ERRCODE_SNAPSHOT_TOO_OLD), + errmsg("snapshot too old"))); +} diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c new file mode 100644 index 0000000..1c804fd --- /dev/null +++ b/src/backend/storage/buffer/freelist.c @@ -0,0 +1,774 @@ +/*------------------------------------------------------------------------- + * + * freelist.c + * routines for managing the buffer pool's replacement strategy. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/freelist.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "pgstat.h" +#include "port/atomics.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "storage/proc.h" + +#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) + + +/* + * The shared freelist control information. + */ +typedef struct +{ + /* Spinlock: protects the values below */ + slock_t buffer_strategy_lock; + + /* + * Clock sweep hand: index of next buffer to consider grabbing. Note that + * this isn't a concrete buffer - we only ever increase the value. So, to + * get an actual buffer, it needs to be used modulo NBuffers. 
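+ *
+ * For example (illustrative), with NBuffers = 16384 a counter value of
+ * 40000 refers to buffer id 40000 % 16384 = 7232.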
+ */ + pg_atomic_uint32 nextVictimBuffer; + + int firstFreeBuffer; /* Head of list of unused buffers */ + int lastFreeBuffer; /* Tail of list of unused buffers */ + + /* + * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is, + * when the list is empty) + */ + + /* + * Statistics. These counters should be wide enough that they can't + * overflow during a single bgwriter cycle. + */ + uint32 completePasses; /* Complete cycles of the clock sweep */ + pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ + + /* + * Bgworker process to be notified upon activity or -1 if none. See + * StrategyNotifyBgWriter. + */ + int bgwprocno; +} BufferStrategyControl; + +/* Pointers to shared state */ +static BufferStrategyControl *StrategyControl = NULL; + +/* + * Private (non-shared) state for managing a ring of shared buffers to re-use. + * This is currently the only kind of BufferAccessStrategy object, but someday + * we might have more kinds. + */ +typedef struct BufferAccessStrategyData +{ + /* Overall strategy type */ + BufferAccessStrategyType btype; + /* Number of elements in buffers[] array */ + int nbuffers; + + /* + * Index of the "current" slot in the ring, ie, the one most recently + * returned by GetBufferFromRing. + */ + int current; + + /* + * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we + * have not yet selected a buffer for this ring slot. For allocation + * simplicity this is palloc'd together with the fixed fields of the + * struct. + */ + Buffer buffers[FLEXIBLE_ARRAY_MEMBER]; +} BufferAccessStrategyData; + + +/* Prototypes for internal functions */ +static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy, + uint32 *buf_state); +static void AddBufferToRing(BufferAccessStrategy strategy, + BufferDesc *buf); + +/* + * ClockSweepTick - Helper routine for StrategyGetBuffer() + * + * Move the clock hand one buffer ahead of its current position and return the + * id of the buffer now under the hand. + */ +static inline uint32 +ClockSweepTick(void) +{ + uint32 victim; + + /* + * Atomically move hand ahead one buffer - if there's several processes + * doing this, this can lead to buffers being returned slightly out of + * apparent order. + */ + victim = + pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); + + if (victim >= NBuffers) + { + uint32 originalVictim = victim; + + /* always wrap what we look up in BufferDescriptors */ + victim = victim % NBuffers; + + /* + * If we're the one that just caused a wraparound, force + * completePasses to be incremented while holding the spinlock. We + * need the spinlock so StrategySyncStart() can return a consistent + * value consisting of nextVictimBuffer and completePasses. + */ + if (victim == 0) + { + uint32 expected; + uint32 wrapped; + bool success = false; + + expected = originalVictim + 1; + + while (!success) + { + /* + * Acquire the spinlock while increasing completePasses. That + * allows other readers to read nextVictimBuffer and + * completePasses in a consistent manner which is required for + * StrategySyncStart(). In theory delaying the increment + * could lead to an overflow of nextVictimBuffers, but that's + * highly unlikely and wouldn't be particularly harmful. 
+ */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + wrapped = expected % NBuffers; + + success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer, + &expected, wrapped); + if (success) + StrategyControl->completePasses++; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + } + } + } + return victim; +} + +/* + * have_free_buffer -- a lockless check to see if there is a free buffer in + * buffer pool. + * + * If the result is true that will become stale once free buffers are moved out + * by other operations, so the caller who strictly want to use a free buffer + * should not call this. + */ +bool +have_free_buffer(void) +{ + if (StrategyControl->firstFreeBuffer >= 0) + return true; + else + return false; +} + +/* + * StrategyGetBuffer + * + * Called by the bufmgr to get the next candidate buffer to use in + * BufferAlloc(). The only hard requirement BufferAlloc() has is that + * the selected buffer must not currently be pinned by anyone. + * + * strategy is a BufferAccessStrategy object, or NULL for default strategy. + * + * To ensure that no one else can pin the buffer before we do, we must + * return the buffer with the buffer header spinlock still held. + */ +BufferDesc * +StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring) +{ + BufferDesc *buf; + int bgwprocno; + int trycounter; + uint32 local_buf_state; /* to avoid repeated (de-)referencing */ + + *from_ring = false; + + /* + * If given a strategy object, see whether it can select a buffer. We + * assume strategy objects don't need buffer_strategy_lock. + */ + if (strategy != NULL) + { + buf = GetBufferFromRing(strategy, buf_state); + if (buf != NULL) + { + *from_ring = true; + return buf; + } + } + + /* + * If asked, we need to waken the bgwriter. Since we don't want to rely on + * a spinlock for this we force a read from shared memory once, and then + * set the latch based on that value. We need to go through that length + * because otherwise bgwprocno might be reset while/after we check because + * the compiler might just reread from memory. + * + * This can possibly set the latch of the wrong process if the bgwriter + * dies in the wrong moment. But since PGPROC->procLatch is never + * deallocated the worst consequence of that is that we set the latch of + * some arbitrary process. + */ + bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno); + if (bgwprocno != -1) + { + /* reset bgwprocno first, before setting the latch */ + StrategyControl->bgwprocno = -1; + + /* + * Not acquiring ProcArrayLock here which is slightly icky. It's + * actually fine because procLatch isn't ever freed, so we just can + * potentially set the wrong process' (or no process') latch. + */ + SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch); + } + + /* + * We count buffer allocation requests so that the bgwriter can estimate + * the rate of buffer consumption. Note that buffers recycled by a + * strategy object are intentionally not counted here. + */ + pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); + + /* + * First check, without acquiring the lock, whether there's buffers in the + * freelist. Since we otherwise don't require the spinlock in every + * StrategyGetBuffer() invocation, it'd be sad to acquire it here - + * uselessly in most cases. That obviously leaves a race where a buffer is + * put on the freelist but we don't see the store yet - but that's pretty + * harmless, it'll just get used during the next buffer acquisition. 
+ * + * If there's buffers on the freelist, acquire the spinlock to pop one + * buffer of the freelist. Then check whether that buffer is usable and + * repeat if not. + * + * Note that the freeNext fields are considered to be protected by the + * buffer_strategy_lock not the individual buffer spinlocks, so it's OK to + * manipulate them without holding the spinlock. + */ + if (StrategyControl->firstFreeBuffer >= 0) + { + while (true) + { + /* Acquire the spinlock to remove element from the freelist */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + if (StrategyControl->firstFreeBuffer < 0) + { + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + break; + } + + buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer); + Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); + + /* Unconditionally remove buffer from freelist */ + StrategyControl->firstFreeBuffer = buf->freeNext; + buf->freeNext = FREENEXT_NOT_IN_LIST; + + /* + * Release the lock so someone else can access the freelist while + * we check out this buffer. + */ + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + + /* + * If the buffer is pinned or has a nonzero usage_count, we cannot + * use it; discard it and retry. (This can only happen if VACUUM + * put a valid buffer in the freelist and then someone else used + * it before we got to it. It's probably impossible altogether as + * of 8.3, but we'd better check anyway.) + */ + local_buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 + && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0) + { + if (strategy != NULL) + AddBufferToRing(strategy, buf); + *buf_state = local_buf_state; + return buf; + } + UnlockBufHdr(buf, local_buf_state); + } + } + + /* Nothing on the freelist, so run the "clock sweep" algorithm */ + trycounter = NBuffers; + for (;;) + { + buf = GetBufferDescriptor(ClockSweepTick()); + + /* + * If the buffer is pinned or has a nonzero usage_count, we cannot use + * it; decrement the usage_count (unless pinned) and keep scanning. + */ + local_buf_state = LockBufHdr(buf); + + if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0) + { + if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0) + { + local_buf_state -= BUF_USAGECOUNT_ONE; + + trycounter = NBuffers; + } + else + { + /* Found a usable buffer */ + if (strategy != NULL) + AddBufferToRing(strategy, buf); + *buf_state = local_buf_state; + return buf; + } + } + else if (--trycounter == 0) + { + /* + * We've scanned all the buffers without making any state changes, + * so all the buffers are pinned (or were when we looked at them). + * We could hope that someone will free one eventually, but it's + * probably better to fail than to risk getting stuck in an + * infinite loop. + */ + UnlockBufHdr(buf, local_buf_state); + elog(ERROR, "no unpinned buffers available"); + } + UnlockBufHdr(buf, local_buf_state); + } +} + +/* + * StrategyFreeBuffer: put a buffer on the freelist + */ +void +StrategyFreeBuffer(BufferDesc *buf) +{ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + /* + * It is possible that we are told to put something in the freelist that + * is already in it; don't screw up the list if so. 
+ */ + if (buf->freeNext == FREENEXT_NOT_IN_LIST) + { + buf->freeNext = StrategyControl->firstFreeBuffer; + if (buf->freeNext < 0) + StrategyControl->lastFreeBuffer = buf->buf_id; + StrategyControl->firstFreeBuffer = buf->buf_id; + } + + SpinLockRelease(&StrategyControl->buffer_strategy_lock); +} + +/* + * StrategySyncStart -- tell BufferSync where to start syncing + * + * The result is the buffer index of the best buffer to sync first. + * BufferSync() will proceed circularly around the buffer array from there. + * + * In addition, we return the completed-pass count (which is effectively + * the higher-order bits of nextVictimBuffer) and the count of recent buffer + * allocs if non-NULL pointers are passed. The alloc count is reset after + * being read. + */ +int +StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) +{ + uint32 nextVictimBuffer; + int result; + + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer); + result = nextVictimBuffer % NBuffers; + + if (complete_passes) + { + *complete_passes = StrategyControl->completePasses; + + /* + * Additionally add the number of wraparounds that happened before + * completePasses could be incremented. C.f. ClockSweepTick(). + */ + *complete_passes += nextVictimBuffer / NBuffers; + } + + if (num_buf_alloc) + { + *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); + } + SpinLockRelease(&StrategyControl->buffer_strategy_lock); + return result; +} + +/* + * StrategyNotifyBgWriter -- set or clear allocation notification latch + * + * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will + * set that latch. Pass -1 to clear the pending notification before it + * happens. This feature is used by the bgwriter process to wake itself up + * from hibernation, and is not meant for anybody else to use. + */ +void +StrategyNotifyBgWriter(int bgwprocno) +{ + /* + * We acquire buffer_strategy_lock just to ensure that the store appears + * atomic to StrategyGetBuffer. The bgwriter should call this rather + * infrequently, so there's no performance penalty from being safe. + */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + StrategyControl->bgwprocno = bgwprocno; + SpinLockRelease(&StrategyControl->buffer_strategy_lock); +} + + +/* + * StrategyShmemSize + * + * estimate the size of shared memory used by the freelist-related structures. + * + * Note: for somewhat historical reasons, the buffer lookup hashtable size + * is also determined here. + */ +Size +StrategyShmemSize(void) +{ + Size size = 0; + + /* size of lookup hash table ... see comment in StrategyInitialize */ + size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); + + /* size of the shared replacement strategy control block */ + size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); + + return size; +} + +/* + * StrategyInitialize -- initialize the buffer cache replacement + * strategy. + * + * Assumes: All of the buffers are already built into a linked list. + * Only called by postmaster and only during initialization. + */ +void +StrategyInitialize(bool init) +{ + bool found; + + /* + * Initialize the shared buffer lookup hashtable. + * + * Since we can't tolerate running out of lookup table entries, we must be + * sure to specify an adequate table size here. The maximum steady-state + * usage is of course NBuffers entries, but BufferAlloc() tries to insert + * a new entry before deleting the old. 
In principle this could be + * happening in each partition concurrently, so we could need as many as + * NBuffers + NUM_BUFFER_PARTITIONS entries. + */ + InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); + + /* + * Get or create the shared strategy control block + */ + StrategyControl = (BufferStrategyControl *) + ShmemInitStruct("Buffer Strategy Status", + sizeof(BufferStrategyControl), + &found); + + if (!found) + { + /* + * Only done once, usually in postmaster + */ + Assert(init); + + SpinLockInit(&StrategyControl->buffer_strategy_lock); + + /* + * Grab the whole linked list of free buffers for our strategy. We + * assume it was previously set up by InitBufferPool(). + */ + StrategyControl->firstFreeBuffer = 0; + StrategyControl->lastFreeBuffer = NBuffers - 1; + + /* Initialize the clock sweep pointer */ + pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); + + /* Clear statistics */ + StrategyControl->completePasses = 0; + pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); + + /* No pending notification */ + StrategyControl->bgwprocno = -1; + } + else + Assert(!init); +} + + +/* ---------------------------------------------------------------- + * Backend-private buffer ring management + * ---------------------------------------------------------------- + */ + + +/* + * GetAccessStrategy -- create a BufferAccessStrategy object + * + * The object is allocated in the current memory context. + */ +BufferAccessStrategy +GetAccessStrategy(BufferAccessStrategyType btype) +{ + int ring_size_kb; + + /* + * Select ring size to use. See buffer/README for rationales. + * + * Note: if you change the ring size for BAS_BULKREAD, see also + * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c. + */ + switch (btype) + { + case BAS_NORMAL: + /* if someone asks for NORMAL, just give 'em a "default" object */ + return NULL; + + case BAS_BULKREAD: + ring_size_kb = 256; + break; + case BAS_BULKWRITE: + ring_size_kb = 16 * 1024; + break; + case BAS_VACUUM: + ring_size_kb = 256; + break; + + default: + elog(ERROR, "unrecognized buffer access strategy: %d", + (int) btype); + return NULL; /* keep compiler quiet */ + } + + return GetAccessStrategyWithSize(btype, ring_size_kb); +} + +/* + * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a + * number of buffers equivalent to the passed in size. + * + * If the given ring size is 0, no BufferAccessStrategy will be created and + * the function will return NULL. ring_size_kb must not be negative. 
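+ *
+ * For example, assuming the default 8kB BLCKSZ, a 256kB ring works out to
+ * 256 / 8 = 32 buffers, and the 16MB BAS_BULKWRITE default to 2048 buffers,
+ * both subject to the NBuffers / 8 cap applied below.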
+ */ +BufferAccessStrategy +GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb) +{ + int ring_buffers; + BufferAccessStrategy strategy; + + Assert(ring_size_kb >= 0); + + /* Figure out how many buffers ring_size_kb is */ + ring_buffers = ring_size_kb / (BLCKSZ / 1024); + + /* 0 means unlimited, so no BufferAccessStrategy required */ + if (ring_buffers == 0) + return NULL; + + /* Cap to 1/8th of shared_buffers */ + ring_buffers = Min(NBuffers / 8, ring_buffers); + + /* NBuffers should never be less than 16, so this shouldn't happen */ + Assert(ring_buffers > 0); + + /* Allocate the object and initialize all elements to zeroes */ + strategy = (BufferAccessStrategy) + palloc0(offsetof(BufferAccessStrategyData, buffers) + + ring_buffers * sizeof(Buffer)); + + /* Set fields that don't start out zero */ + strategy->btype = btype; + strategy->nbuffers = ring_buffers; + + return strategy; +} + +/* + * GetAccessStrategyBufferCount -- an accessor for the number of buffers in + * the ring + * + * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize() + * returning NULL with 0 size. + */ +int +GetAccessStrategyBufferCount(BufferAccessStrategy strategy) +{ + if (strategy == NULL) + return 0; + + return strategy->nbuffers; +} + +/* + * FreeAccessStrategy -- release a BufferAccessStrategy object + * + * A simple pfree would do at the moment, but we would prefer that callers + * don't assume that much about the representation of BufferAccessStrategy. + */ +void +FreeAccessStrategy(BufferAccessStrategy strategy) +{ + /* don't crash if called on a "default" strategy */ + if (strategy != NULL) + pfree(strategy); +} + +/* + * GetBufferFromRing -- returns a buffer from the ring, or NULL if the + * ring is empty / not usable. + * + * The bufhdr spin lock is held on the returned buffer. + */ +static BufferDesc * +GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state) +{ + BufferDesc *buf; + Buffer bufnum; + uint32 local_buf_state; /* to avoid repeated (de-)referencing */ + + + /* Advance to next ring slot */ + if (++strategy->current >= strategy->nbuffers) + strategy->current = 0; + + /* + * If the slot hasn't been filled yet, tell the caller to allocate a new + * buffer with the normal allocation strategy. He will then fill this + * slot by calling AddBufferToRing with the new buffer. + */ + bufnum = strategy->buffers[strategy->current]; + if (bufnum == InvalidBuffer) + return NULL; + + /* + * If the buffer is pinned we cannot use it under any circumstances. + * + * If usage_count is 0 or 1 then the buffer is fair game (we expect 1, + * since our own previous usage of the ring element would have left it + * there, but it might've been decremented by clock sweep since then). A + * higher usage_count indicates someone else has touched the buffer, so we + * shouldn't re-use it. + */ + buf = GetBufferDescriptor(bufnum - 1); + local_buf_state = LockBufHdr(buf); + if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 + && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1) + { + *buf_state = local_buf_state; + return buf; + } + UnlockBufHdr(buf, local_buf_state); + + /* + * Tell caller to allocate a new buffer with the normal allocation + * strategy. He'll then replace this ring element via AddBufferToRing. + */ + return NULL; +} + +/* + * AddBufferToRing -- add a buffer to the buffer ring + * + * Caller must hold the buffer header spinlock on the buffer. Since this + * is called with the spinlock held, it had better be quite cheap. 
+ */ +static void +AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf) +{ + strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf); +} + +/* + * Utility function returning the IOContext of a given BufferAccessStrategy's + * strategy ring. + */ +IOContext +IOContextForStrategy(BufferAccessStrategy strategy) +{ + if (!strategy) + return IOCONTEXT_NORMAL; + + switch (strategy->btype) + { + case BAS_NORMAL: + + /* + * Currently, GetAccessStrategy() returns NULL for + * BufferAccessStrategyType BAS_NORMAL, so this case is + * unreachable. + */ + pg_unreachable(); + return IOCONTEXT_NORMAL; + case BAS_BULKREAD: + return IOCONTEXT_BULKREAD; + case BAS_BULKWRITE: + return IOCONTEXT_BULKWRITE; + case BAS_VACUUM: + return IOCONTEXT_VACUUM; + } + + elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype); + pg_unreachable(); +} + +/* + * StrategyRejectBuffer -- consider rejecting a dirty buffer + * + * When a nondefault strategy is used, the buffer manager calls this function + * when it turns out that the buffer selected by StrategyGetBuffer needs to + * be written out and doing so would require flushing WAL too. This gives us + * a chance to choose a different victim. + * + * Returns true if buffer manager should ask for a new victim, and false + * if this buffer should be written and re-used. + */ +bool +StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring) +{ + /* We only do this in bulkread mode */ + if (strategy->btype != BAS_BULKREAD) + return false; + + /* Don't muck with behavior of normal buffer-replacement strategy */ + if (!from_ring || + strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf)) + return false; + + /* + * Remove the dirty buffer from the ring; necessary to prevent infinite + * loop if all ring members are dirty. + */ + strategy->buffers[strategy->current] = InvalidBuffer; + + return true; +} diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c new file mode 100644 index 0000000..55953c3 --- /dev/null +++ b/src/backend/storage/buffer/localbuf.c @@ -0,0 +1,821 @@ +/*------------------------------------------------------------------------- + * + * localbuf.c + * local buffer manager. Fast buffer manager for temporary tables, + * which never need to be WAL-logged or checkpointed, etc. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994-5, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/buffer/localbuf.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "catalog/catalog.h" +#include "executor/instrument.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/bufmgr.h" +#include "utils/guc_hooks.h" +#include "utils/memutils.h" +#include "utils/resowner_private.h" + + +/*#define LBDEBUG*/ + +/* entry for buffer lookup hashtable */ +typedef struct +{ + BufferTag key; /* Tag of a disk page */ + int id; /* Associated local buffer's index */ +} LocalBufferLookupEnt; + +/* Note: this macro only works on local buffers, not shared ones! 
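+ * Worked example (for illustration): local buffer array slot i gets
+ * buf_id = -i - 2 (see InitLocalBuffers), so its Buffer number is
+ * buf_id + 1 = -i - 1 and this macro maps it back to slot -(buf_id + 2) = i;
+ * the first local buffer therefore has buf_id -2, Buffer number -1, and block
+ * pointer slot 0.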
*/ +#define LocalBufHdrGetBlock(bufHdr) \ + LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)] + +int NLocBuffer = 0; /* until buffers are initialized */ + +BufferDesc *LocalBufferDescriptors = NULL; +Block *LocalBufferBlockPointers = NULL; +int32 *LocalRefCount = NULL; + +static int nextFreeLocalBufId = 0; + +static HTAB *LocalBufHash = NULL; + +/* number of local buffers pinned at least once */ +static int NLocalPinnedBuffers = 0; + + +static void InitLocalBuffers(void); +static Block GetLocalBufferStorage(void); +static Buffer GetLocalVictimBuffer(void); + + +/* + * PrefetchLocalBuffer - + * initiate asynchronous read of a block of a relation + * + * Do PrefetchBuffer's work for temporary relations. + * No-op if prefetching isn't compiled in. + */ +PrefetchBufferResult +PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, + BlockNumber blockNum) +{ + PrefetchBufferResult result = {InvalidBuffer, false}; + BufferTag newTag; /* identity of requested block */ + LocalBufferLookupEnt *hresult; + + InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + + /* Initialize local buffers if first request in this session */ + if (LocalBufHash == NULL) + InitLocalBuffers(); + + /* See if the desired buffer already exists */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &newTag, HASH_FIND, NULL); + + if (hresult) + { + /* Yes, so nothing to do */ + result.recent_buffer = -hresult->id - 1; + } + else + { +#ifdef USE_PREFETCH + /* Not in buffers, so initiate prefetch */ + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr, forkNum, blockNum)) + { + result.initiated_io = true; + } +#endif /* USE_PREFETCH */ + } + + return result; +} + + +/* + * LocalBufferAlloc - + * Find or create a local buffer for the given page of the given relation. + * + * API is similar to bufmgr.c's BufferAlloc, except that we do not need + * to do any locking since this is all local. Also, IO_IN_PROGRESS + * does not get set. Lastly, we support only default access strategy + * (hence, usage_count is always advanced). + */ +BufferDesc * +LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, + bool *foundPtr) +{ + BufferTag newTag; /* identity of requested block */ + LocalBufferLookupEnt *hresult; + BufferDesc *bufHdr; + Buffer victim_buffer; + int bufid; + bool found; + + InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum); + + /* Initialize local buffers if first request in this session */ + if (LocalBufHash == NULL) + InitLocalBuffers(); + + /* See if the desired buffer already exists */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &newTag, HASH_FIND, NULL); + + if (hresult) + { + bufid = hresult->id; + bufHdr = GetLocalBufferDescriptor(bufid); + Assert(BufferTagsEqual(&bufHdr->tag, &newTag)); + + *foundPtr = PinLocalBuffer(bufHdr, true); + } + else + { + uint32 buf_state; + + victim_buffer = GetLocalVictimBuffer(); + bufid = -victim_buffer - 1; + bufHdr = GetLocalBufferDescriptor(bufid); + + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &newTag, HASH_ENTER, &found); + if (found) /* shouldn't happen */ + elog(ERROR, "local buffer hash table corrupted"); + hresult->id = bufid; + + /* + * it's all ours now. 
+ */ + bufHdr->tag = newTag; + + buf_state = pg_atomic_read_u32(&bufHdr->state); + buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); + buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + + *foundPtr = false; + } + + return bufHdr; +} + +static Buffer +GetLocalVictimBuffer(void) +{ + int victim_bufid; + int trycounter; + uint32 buf_state; + BufferDesc *bufHdr; + + ResourceOwnerEnlargeBuffers(CurrentResourceOwner); + + /* + * Need to get a new buffer. We use a clock sweep algorithm (essentially + * the same as what freelist.c does now...) + */ + trycounter = NLocBuffer; + for (;;) + { + victim_bufid = nextFreeLocalBufId; + + if (++nextFreeLocalBufId >= NLocBuffer) + nextFreeLocalBufId = 0; + + bufHdr = GetLocalBufferDescriptor(victim_bufid); + + if (LocalRefCount[victim_bufid] == 0) + { + buf_state = pg_atomic_read_u32(&bufHdr->state); + + if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0) + { + buf_state -= BUF_USAGECOUNT_ONE; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + trycounter = NLocBuffer; + } + else + { + /* Found a usable buffer */ + PinLocalBuffer(bufHdr, false); + break; + } + } + else if (--trycounter == 0) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("no empty local buffer available"))); + } + + /* + * lazy memory allocation: allocate space on first use of a buffer. + */ + if (LocalBufHdrGetBlock(bufHdr) == NULL) + { + /* Set pointer for use by BufferGetBlock() macro */ + LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage(); + } + + /* + * this buffer is not referenced but it might still be dirty. if that's + * the case, write it out before reusing it! + */ + if (buf_state & BM_DIRTY) + { + instr_time io_start; + SMgrRelation oreln; + Page localpage = (char *) LocalBufHdrGetBlock(bufHdr); + + /* Find smgr relation for buffer */ + oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId); + + PageSetChecksumInplace(localpage, bufHdr->tag.blockNum); + + io_start = pgstat_prepare_io_time(); + + /* And write... */ + smgrwrite(oreln, + BufTagGetForkNum(&bufHdr->tag), + bufHdr->tag.blockNum, + localpage, + false); + + /* Temporary table I/O does not use Buffer Access Strategies */ + pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, + IOOP_WRITE, io_start, 1); + + /* Mark not-dirty now in case we error out below */ + buf_state &= ~BM_DIRTY; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + + pgBufferUsage.local_blks_written++; + } + + /* + * Remove the victim buffer from the hashtable and mark as invalid. + */ + if (buf_state & BM_TAG_VALID) + { + LocalBufferLookupEnt *hresult; + + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL); + if (!hresult) /* shouldn't happen */ + elog(ERROR, "local buffer hash table corrupted"); + /* mark buffer invalid just in case hash insert fails */ + ClearBufferTag(&bufHdr->tag); + buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK); + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT); + } + + return BufferDescriptorGetBuffer(bufHdr); +} + +/* see LimitAdditionalPins() */ +static void +LimitAdditionalLocalPins(uint32 *additional_pins) +{ + uint32 max_pins; + + if (*additional_pins <= 1) + return; + + /* + * In contrast to LimitAdditionalPins() other backends don't play a role + * here. We can allow up to NLocBuffer pins in total. 
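+ *
+ * Worked example (numbers are illustrative): with NLocBuffer = 1024 and
+ * 1000 local buffers already pinned, a request for 64 additional pins is
+ * clamped to 1024 - 1000 = 24.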
+ */ + max_pins = (NLocBuffer - NLocalPinnedBuffers); + + if (*additional_pins >= max_pins) + *additional_pins = max_pins; +} + +/* + * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for + * temporary buffers. + */ +BlockNumber +ExtendBufferedRelLocal(BufferManagerRelation bmr, + ForkNumber fork, + uint32 flags, + uint32 extend_by, + BlockNumber extend_upto, + Buffer *buffers, + uint32 *extended_by) +{ + BlockNumber first_block; + instr_time io_start; + + /* Initialize local buffers if first request in this session */ + if (LocalBufHash == NULL) + InitLocalBuffers(); + + LimitAdditionalLocalPins(&extend_by); + + for (uint32 i = 0; i < extend_by; i++) + { + BufferDesc *buf_hdr; + Block buf_block; + + buffers[i] = GetLocalVictimBuffer(); + buf_hdr = GetLocalBufferDescriptor(-buffers[i] - 1); + buf_block = LocalBufHdrGetBlock(buf_hdr); + + /* new buffers are zero-filled */ + MemSet((char *) buf_block, 0, BLCKSZ); + } + + first_block = smgrnblocks(bmr.smgr, fork); + + if (extend_upto != InvalidBlockNumber) + { + /* + * In contrast to shared relations, nothing could change the relation + * size concurrently. Thus we shouldn't end up finding that we don't + * need to do anything. + */ + Assert(first_block <= extend_upto); + + Assert((uint64) first_block + extend_by <= extend_upto); + } + + /* Fail if relation is already at maximum possible length */ + if ((uint64) first_block + extend_by >= MaxBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend relation %s beyond %u blocks", + relpath(bmr.smgr->smgr_rlocator, fork), + MaxBlockNumber))); + + for (int i = 0; i < extend_by; i++) + { + int victim_buf_id; + BufferDesc *victim_buf_hdr; + BufferTag tag; + LocalBufferLookupEnt *hresult; + bool found; + + victim_buf_id = -buffers[i] - 1; + victim_buf_hdr = GetLocalBufferDescriptor(victim_buf_id); + + InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i); + + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, (void *) &tag, HASH_ENTER, &found); + if (found) + { + BufferDesc *existing_hdr; + uint32 buf_state; + + UnpinLocalBuffer(BufferDescriptorGetBuffer(victim_buf_hdr)); + + existing_hdr = GetLocalBufferDescriptor(hresult->id); + PinLocalBuffer(existing_hdr, false); + buffers[i] = BufferDescriptorGetBuffer(existing_hdr); + + buf_state = pg_atomic_read_u32(&existing_hdr->state); + Assert(buf_state & BM_TAG_VALID); + Assert(!(buf_state & BM_DIRTY)); + buf_state &= ~BM_VALID; + pg_atomic_unlocked_write_u32(&existing_hdr->state, buf_state); + } + else + { + uint32 buf_state = pg_atomic_read_u32(&victim_buf_hdr->state); + + Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED))); + + victim_buf_hdr->tag = tag; + + buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE; + + pg_atomic_unlocked_write_u32(&victim_buf_hdr->state, buf_state); + + hresult->id = victim_buf_id; + } + } + + io_start = pgstat_prepare_io_time(); + + /* actually extend relation */ + smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false); + + pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND, + io_start, extend_by); + + for (int i = 0; i < extend_by; i++) + { + Buffer buf = buffers[i]; + BufferDesc *buf_hdr; + uint32 buf_state; + + buf_hdr = GetLocalBufferDescriptor(-buf - 1); + + buf_state = pg_atomic_read_u32(&buf_hdr->state); + buf_state |= BM_VALID; + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + } + + *extended_by = extend_by; + + pgBufferUsage.local_blks_written += 
extend_by; + + return first_block; +} + +/* + * MarkLocalBufferDirty - + * mark a local buffer dirty + */ +void +MarkLocalBufferDirty(Buffer buffer) +{ + int bufid; + BufferDesc *bufHdr; + uint32 buf_state; + + Assert(BufferIsLocal(buffer)); + +#ifdef LBDEBUG + fprintf(stderr, "LB DIRTY %d\n", buffer); +#endif + + bufid = -buffer - 1; + + Assert(LocalRefCount[bufid] > 0); + + bufHdr = GetLocalBufferDescriptor(bufid); + + buf_state = pg_atomic_read_u32(&bufHdr->state); + + if (!(buf_state & BM_DIRTY)) + pgBufferUsage.local_blks_dirtied++; + + buf_state |= BM_DIRTY; + + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); +} + +/* + * DropRelationLocalBuffers + * This function removes from the buffer pool all the pages of the + * specified relation that have block numbers >= firstDelBlock. + * (In particular, with firstDelBlock = 0, all pages are removed.) + * Dirty pages are simply dropped, without bothering to write them + * out first. Therefore, this is NOT rollback-able, and so should be + * used only with extreme caution! + * + * See DropRelationBuffers in bufmgr.c for more notes. + */ +void +DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum, + BlockNumber firstDelBlock) +{ + int i; + + for (i = 0; i < NLocBuffer; i++) + { + BufferDesc *bufHdr = GetLocalBufferDescriptor(i); + LocalBufferLookupEnt *hresult; + uint32 buf_state; + + buf_state = pg_atomic_read_u32(&bufHdr->state); + + if ((buf_state & BM_TAG_VALID) && + BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) && + BufTagGetForkNum(&bufHdr->tag) == forkNum && + bufHdr->tag.blockNum >= firstDelBlock) + { + if (LocalRefCount[i] != 0) + elog(ERROR, "block %u of %s is still referenced (local %u)", + bufHdr->tag.blockNum, + relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), + MyBackendId, + BufTagGetForkNum(&bufHdr->tag)), + LocalRefCount[i]); + + /* Remove entry from hashtable */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL); + if (!hresult) /* shouldn't happen */ + elog(ERROR, "local buffer hash table corrupted"); + /* Mark buffer invalid */ + ClearBufferTag(&bufHdr->tag); + buf_state &= ~BUF_FLAG_MASK; + buf_state &= ~BUF_USAGECOUNT_MASK; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + } + } +} + +/* + * DropRelationAllLocalBuffers + * This function removes from the buffer pool all pages of all forks + * of the specified relation. + * + * See DropRelationsAllBuffers in bufmgr.c for more notes. 
+ */ +void +DropRelationAllLocalBuffers(RelFileLocator rlocator) +{ + int i; + + for (i = 0; i < NLocBuffer; i++) + { + BufferDesc *bufHdr = GetLocalBufferDescriptor(i); + LocalBufferLookupEnt *hresult; + uint32 buf_state; + + buf_state = pg_atomic_read_u32(&bufHdr->state); + + if ((buf_state & BM_TAG_VALID) && + BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator)) + { + if (LocalRefCount[i] != 0) + elog(ERROR, "block %u of %s is still referenced (local %u)", + bufHdr->tag.blockNum, + relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag), + MyBackendId, + BufTagGetForkNum(&bufHdr->tag)), + LocalRefCount[i]); + /* Remove entry from hashtable */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL); + if (!hresult) /* shouldn't happen */ + elog(ERROR, "local buffer hash table corrupted"); + /* Mark buffer invalid */ + ClearBufferTag(&bufHdr->tag); + buf_state &= ~BUF_FLAG_MASK; + buf_state &= ~BUF_USAGECOUNT_MASK; + pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state); + } + } +} + +/* + * InitLocalBuffers - + * init the local buffer cache. Since most queries (esp. multi-user ones) + * don't involve local buffers, we delay allocating actual memory for the + * buffers until we need them; just make the buffer headers here. + */ +static void +InitLocalBuffers(void) +{ + int nbufs = num_temp_buffers; + HASHCTL info; + int i; + + /* + * Parallel workers can't access data in temporary tables, because they + * have no visibility into the local buffers of their leader. This is a + * convenient, low-cost place to provide a backstop check for that. Note + * that we don't wish to prevent a parallel worker from accessing catalog + * metadata about a temp table, so checks at higher levels would be + * inappropriate. + */ + if (IsParallelWorker()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TRANSACTION_STATE), + errmsg("cannot access temporary tables during a parallel operation"))); + + /* Allocate and zero buffer headers and auxiliary arrays */ + LocalBufferDescriptors = (BufferDesc *) calloc(nbufs, sizeof(BufferDesc)); + LocalBufferBlockPointers = (Block *) calloc(nbufs, sizeof(Block)); + LocalRefCount = (int32 *) calloc(nbufs, sizeof(int32)); + if (!LocalBufferDescriptors || !LocalBufferBlockPointers || !LocalRefCount) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + nextFreeLocalBufId = 0; + + /* initialize fields that need to start off nonzero */ + for (i = 0; i < nbufs; i++) + { + BufferDesc *buf = GetLocalBufferDescriptor(i); + + /* + * negative to indicate local buffer. This is tricky: shared buffers + * start with 0. We have to start with -2. (Note that the routine + * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id + * is -1.) + */ + buf->buf_id = -i - 2; + + /* + * Intentionally do not initialize the buffer's atomic variable + * (besides zeroing the underlying memory above). That way we get + * errors on platforms without atomics, if somebody (re-)introduces + * atomic operations for local buffers. 
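+ *
+ * (As a cross-check of the buffer-numbering scheme described above, using
+ * slot i = 0 as an example: buf_id = -2, BufferDescriptorGetBuffer()
+ * returns Buffer -1, and PinLocalBuffer()/UnpinLocalBuffer() map that
+ * back to array index -(-1) - 1 = 0.)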
+ */ + } + + /* Create the lookup hash table */ + info.keysize = sizeof(BufferTag); + info.entrysize = sizeof(LocalBufferLookupEnt); + + LocalBufHash = hash_create("Local Buffer Lookup Table", + nbufs, + &info, + HASH_ELEM | HASH_BLOBS); + + if (!LocalBufHash) + elog(ERROR, "could not initialize local buffer hash table"); + + /* Initialization done, mark buffers allocated */ + NLocBuffer = nbufs; +} + +/* + * XXX: We could have a slightly more efficient version of PinLocalBuffer() + * that does not support adjusting the usagecount - but so far it does not + * seem worth the trouble. + */ +bool +PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount) +{ + uint32 buf_state; + Buffer buffer = BufferDescriptorGetBuffer(buf_hdr); + int bufid = -buffer - 1; + + buf_state = pg_atomic_read_u32(&buf_hdr->state); + + if (LocalRefCount[bufid] == 0) + { + NLocalPinnedBuffers++; + if (adjust_usagecount && + BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT) + { + buf_state += BUF_USAGECOUNT_ONE; + pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state); + } + } + LocalRefCount[bufid]++; + ResourceOwnerRememberBuffer(CurrentResourceOwner, + BufferDescriptorGetBuffer(buf_hdr)); + + return buf_state & BM_VALID; +} + +void +UnpinLocalBuffer(Buffer buffer) +{ + int buffid = -buffer - 1; + + Assert(BufferIsLocal(buffer)); + Assert(LocalRefCount[buffid] > 0); + Assert(NLocalPinnedBuffers > 0); + + ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer); + if (--LocalRefCount[buffid] == 0) + NLocalPinnedBuffers--; +} + +/* + * GUC check_hook for temp_buffers + */ +bool +check_temp_buffers(int *newval, void **extra, GucSource source) +{ + /* + * Once local buffers have been initialized, it's too late to change this. + * However, if this is only a test call, allow it. + */ + if (source != PGC_S_TEST && NLocBuffer && NLocBuffer != *newval) + { + GUC_check_errdetail("\"temp_buffers\" cannot be changed after any temporary tables have been accessed in the session."); + return false; + } + return true; +} + +/* + * GetLocalBufferStorage - allocate memory for a local buffer + * + * The idea of this function is to aggregate our requests for storage + * so that the memory manager doesn't see a whole lot of relatively small + * requests. Since we'll never give back a local buffer once it's created + * within a particular process, no point in burdening memmgr with separately + * managed chunks. + */ +static Block +GetLocalBufferStorage(void) +{ + static char *cur_block = NULL; + static int next_buf_in_block = 0; + static int num_bufs_in_block = 0; + static int total_bufs_allocated = 0; + static MemoryContext LocalBufferContext = NULL; + + char *this_buf; + + Assert(total_bufs_allocated < NLocBuffer); + + if (next_buf_in_block >= num_bufs_in_block) + { + /* Need to make a new request to memmgr */ + int num_bufs; + + /* + * We allocate local buffers in a context of their own, so that the + * space eaten for them is easily recognizable in MemoryContextStats + * output. Create the context on first use. 
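+ *
+ * (So, illustratively, successive requests ask for 16, 32, 64, ... buffers,
+ * with each request trimmed to the number of local buffers still
+ * unallocated and to what fits under MaxAllocSize, as coded below.)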
+ */ + if (LocalBufferContext == NULL) + LocalBufferContext = + AllocSetContextCreate(TopMemoryContext, + "LocalBufferContext", + ALLOCSET_DEFAULT_SIZES); + + /* Start with a 16-buffer request; subsequent ones double each time */ + num_bufs = Max(num_bufs_in_block * 2, 16); + /* But not more than what we need for all remaining local bufs */ + num_bufs = Min(num_bufs, NLocBuffer - total_bufs_allocated); + /* And don't overflow MaxAllocSize, either */ + num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ); + + /* Buffers should be I/O aligned. */ + cur_block = (char *) + TYPEALIGN(PG_IO_ALIGN_SIZE, + MemoryContextAlloc(LocalBufferContext, + num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE)); + next_buf_in_block = 0; + num_bufs_in_block = num_bufs; + } + + /* Allocate next buffer in current memory block */ + this_buf = cur_block + next_buf_in_block * BLCKSZ; + next_buf_in_block++; + total_bufs_allocated++; + + return (Block) this_buf; +} + +/* + * CheckForLocalBufferLeaks - ensure this backend holds no local buffer pins + * + * This is just like CheckForBufferLeaks(), but for local buffers. + */ +static void +CheckForLocalBufferLeaks(void) +{ +#ifdef USE_ASSERT_CHECKING + if (LocalRefCount) + { + int RefCountErrors = 0; + int i; + + for (i = 0; i < NLocBuffer; i++) + { + if (LocalRefCount[i] != 0) + { + Buffer b = -i - 1; + + PrintBufferLeakWarning(b); + RefCountErrors++; + } + } + Assert(RefCountErrors == 0); + } +#endif +} + +/* + * AtEOXact_LocalBuffers - clean up at end of transaction. + * + * This is just like AtEOXact_Buffers, but for local buffers. + */ +void +AtEOXact_LocalBuffers(bool isCommit) +{ + CheckForLocalBufferLeaks(); +} + +/* + * AtProcExit_LocalBuffers - ensure we have dropped pins during backend exit. + * + * This is just like AtProcExit_Buffers, but for local buffers. + */ +void +AtProcExit_LocalBuffers(void) +{ + /* + * We shouldn't be holding any remaining pins; if we are, and assertions + * aren't enabled, we'll fail later in DropRelationBuffers while trying to + * drop the temp rels. + */ + CheckForLocalBufferLeaks(); +} diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build new file mode 100644 index 0000000..ea2f9c0 --- /dev/null +++ b/src/backend/storage/buffer/meson.build @@ -0,0 +1,9 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'buf_init.c', + 'buf_table.c', + 'bufmgr.c', + 'freelist.c', + 'localbuf.c', +) diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile new file mode 100644 index 0000000..660ac51 --- /dev/null +++ b/src/backend/storage/file/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/file +# +# IDENTIFICATION +# src/backend/storage/file/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/file +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + buffile.o \ + copydir.o \ + fd.o \ + fileset.o \ + reinit.o \ + sharedfileset.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c new file mode 100644 index 0000000..41ab641 --- /dev/null +++ b/src/backend/storage/file/buffile.c @@ -0,0 +1,1039 @@ +/*------------------------------------------------------------------------- + * + * buffile.c + * Management of large buffered temporary files. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/buffile.c + * + * NOTES: + * + * BufFiles provide a very incomplete emulation of stdio atop virtual Files + * (as managed by fd.c). Currently, we only support the buffered-I/O + * aspect of stdio: a read or write of the low-level File occurs only + * when the buffer is filled or emptied. This is an even bigger win + * for virtual Files than for ordinary kernel files, since reducing the + * frequency with which a virtual File is touched reduces "thrashing" + * of opening/closing file descriptors. + * + * Note that BufFile structs are allocated with palloc(), and therefore + * will go away automatically at query/transaction end. Since the underlying + * virtual Files are made with OpenTemporaryFile, all resources for + * the file are certain to be cleaned up even if processing is aborted + * by ereport(ERROR). The data structures required are made in the + * palloc context that was current when the BufFile was created, and + * any external resources such as temp files are owned by the ResourceOwner + * that was current at that time. + * + * BufFile also supports temporary files that exceed the OS file size limit + * (by opening multiple fd.c temporary files). This is an essential feature + * for sorts and hashjoins on large amounts of data. + * + * BufFile supports temporary files that can be shared with other backends, as + * infrastructure for parallel execution. Such files need to be created as a + * member of a SharedFileSet that all participants are attached to. + * + * BufFile also supports temporary files that can be used by the single backend + * when the corresponding files need to be survived across the transaction and + * need to be opened and closed multiple times. Such files need to be created + * as a member of a FileSet. + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "commands/tablespace.h" +#include "executor/instrument.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/buf_internals.h" +#include "storage/buffile.h" +#include "storage/fd.h" +#include "utils/resowner.h" + +/* + * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE. + * The reason is that we'd like large BufFiles to be spread across multiple + * tablespaces when available. + */ +#define MAX_PHYSICAL_FILESIZE 0x40000000 +#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ) + +/* + * This data structure represents a buffered file that consists of one or + * more physical files (each accessed through a virtual file descriptor + * managed by fd.c). + */ +struct BufFile +{ + int numFiles; /* number of physical files in set */ + /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */ + File *files; /* palloc'd array with numFiles entries */ + + bool isInterXact; /* keep open over transactions? */ + bool dirty; /* does buffer need to be written? */ + bool readOnly; /* has the file been set to read only? */ + + FileSet *fileset; /* space for fileset based segment files */ + const char *name; /* name of fileset based BufFile */ + + /* + * resowner is the ResourceOwner to use for underlying temp files. (We + * don't need to remember the memory context we're using explicitly, + * because after creation we only repalloc our arrays larger.) 
+ */ + ResourceOwner resowner; + + /* + * "current pos" is position of start of buffer within the logical file. + * Position as seen by user of BufFile is (curFile, curOffset + pos). + */ + int curFile; /* file index (0..n) part of current pos */ + off_t curOffset; /* offset part of current pos */ + int pos; /* next read/write position in buffer */ + int nbytes; /* total # of valid bytes in buffer */ + + /* + * XXX Should ideally us PGIOAlignedBlock, but might need a way to avoid + * wasting per-file alignment padding when some users create many files. + */ + PGAlignedBlock buffer; +}; + +static BufFile *makeBufFileCommon(int nfiles); +static BufFile *makeBufFile(File firstfile); +static void extendBufFile(BufFile *file); +static void BufFileLoadBuffer(BufFile *file); +static void BufFileDumpBuffer(BufFile *file); +static void BufFileFlush(BufFile *file); +static File MakeNewFileSetSegment(BufFile *buffile, int segment); + +/* + * Create BufFile and perform the common initialization. + */ +static BufFile * +makeBufFileCommon(int nfiles) +{ + BufFile *file = (BufFile *) palloc(sizeof(BufFile)); + + file->numFiles = nfiles; + file->isInterXact = false; + file->dirty = false; + file->resowner = CurrentResourceOwner; + file->curFile = 0; + file->curOffset = 0; + file->pos = 0; + file->nbytes = 0; + + return file; +} + +/* + * Create a BufFile given the first underlying physical file. + * NOTE: caller must set isInterXact if appropriate. + */ +static BufFile * +makeBufFile(File firstfile) +{ + BufFile *file = makeBufFileCommon(1); + + file->files = (File *) palloc(sizeof(File)); + file->files[0] = firstfile; + file->readOnly = false; + file->fileset = NULL; + file->name = NULL; + + return file; +} + +/* + * Add another component temp file. + */ +static void +extendBufFile(BufFile *file) +{ + File pfile; + ResourceOwner oldowner; + + /* Be sure to associate the file with the BufFile's resource owner */ + oldowner = CurrentResourceOwner; + CurrentResourceOwner = file->resowner; + + if (file->fileset == NULL) + pfile = OpenTemporaryFile(file->isInterXact); + else + pfile = MakeNewFileSetSegment(file, file->numFiles); + + Assert(pfile >= 0); + + CurrentResourceOwner = oldowner; + + file->files = (File *) repalloc(file->files, + (file->numFiles + 1) * sizeof(File)); + file->files[file->numFiles] = pfile; + file->numFiles++; +} + +/* + * Create a BufFile for a new temporary file (which will expand to become + * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are + * written to it). + * + * If interXact is true, the temp file will not be automatically deleted + * at end of transaction. + * + * Note: if interXact is true, the caller had better be calling us in a + * memory context, and with a resource owner, that will survive across + * transaction boundaries. + */ +BufFile * +BufFileCreateTemp(bool interXact) +{ + BufFile *file; + File pfile; + + /* + * Ensure that temp tablespaces are set up for OpenTemporaryFile to use. + * Possibly the caller will have done this already, but it seems useful to + * double-check here. Failure to do this at all would result in the temp + * files always getting placed in the default tablespace, which is a + * pretty hard-to-detect bug. Callers may prefer to do it earlier if they + * want to be sure that any required catalog access is done in some other + * resource context. 
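+ *
+ * For reference, a minimal calling pattern looks roughly like this
+ * (illustrative sketch only, with caller-supplied data/len; real callers
+ * such as sorts and hashjoins are considerably more elaborate):
+ *
+ *     BufFile    *f = BufFileCreateTemp(false);
+ *
+ *     BufFileWrite(f, data, len);
+ *     if (BufFileSeek(f, 0, 0, SEEK_SET) != 0)
+ *         elog(ERROR, "could not rewind temporary file");
+ *     BufFileReadExact(f, data, len);
+ *     BufFileClose(f);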
+ */ + PrepareTempTablespaces(); + + pfile = OpenTemporaryFile(interXact); + Assert(pfile >= 0); + + file = makeBufFile(pfile); + file->isInterXact = interXact; + + return file; +} + +/* + * Build the name for a given segment of a given BufFile. + */ +static void +FileSetSegmentName(char *name, const char *buffile_name, int segment) +{ + snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment); +} + +/* + * Create a new segment file backing a fileset based BufFile. + */ +static File +MakeNewFileSetSegment(BufFile *buffile, int segment) +{ + char name[MAXPGPATH]; + File file; + + /* + * It is possible that there are files left over from before a crash + * restart with the same name. In order for BufFileOpenFileSet() not to + * get confused about how many segments there are, we'll unlink the next + * segment number if it already exists. + */ + FileSetSegmentName(name, buffile->name, segment + 1); + FileSetDelete(buffile->fileset, name, true); + + /* Create the new segment. */ + FileSetSegmentName(name, buffile->name, segment); + file = FileSetCreate(buffile->fileset, name); + + /* FileSetCreate would've errored out */ + Assert(file > 0); + + return file; +} + +/* + * Create a BufFile that can be discovered and opened read-only by other + * backends that are attached to the same SharedFileSet using the same name. + * + * The naming scheme for fileset based BufFiles is left up to the calling code. + * The name will appear as part of one or more filenames on disk, and might + * provide clues to administrators about which subsystem is generating + * temporary file data. Since each SharedFileSet object is backed by one or + * more uniquely named temporary directory, names don't conflict with + * unrelated SharedFileSet objects. + */ +BufFile * +BufFileCreateFileSet(FileSet *fileset, const char *name) +{ + BufFile *file; + + file = makeBufFileCommon(1); + file->fileset = fileset; + file->name = pstrdup(name); + file->files = (File *) palloc(sizeof(File)); + file->files[0] = MakeNewFileSetSegment(file, 0); + file->readOnly = false; + + return file; +} + +/* + * Open a file that was previously created in another backend (or this one) + * with BufFileCreateFileSet in the same FileSet using the same name. + * The backend that created the file must have called BufFileClose() or + * BufFileExportFileSet() to make sure that it is ready to be opened by other + * backends and render it read-only. If missing_ok is true, which indicates + * that missing files can be safely ignored, then return NULL if the BufFile + * with the given name is not found, otherwise, throw an error. + */ +BufFile * +BufFileOpenFileSet(FileSet *fileset, const char *name, int mode, + bool missing_ok) +{ + BufFile *file; + char segment_name[MAXPGPATH]; + Size capacity = 16; + File *files; + int nfiles = 0; + + files = palloc(sizeof(File) * capacity); + + /* + * We don't know how many segments there are, so we'll probe the + * filesystem to find out. + */ + for (;;) + { + /* See if we need to expand our file segment array. */ + if (nfiles + 1 > capacity) + { + capacity *= 2; + files = repalloc(files, sizeof(File) * capacity); + } + /* Try to load a segment. */ + FileSetSegmentName(segment_name, name, nfiles); + files[nfiles] = FileSetOpen(fileset, segment_name, mode); + if (files[nfiles] <= 0) + break; + ++nfiles; + + CHECK_FOR_INTERRUPTS(); + } + + /* + * If we didn't find any files at all, then no BufFile exists with this + * name. 
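+ *
+ * (Recall that segment files are simply named "<name>.0", "<name>.1", and
+ * so on, per FileSetSegmentName(), so the probe loop above stops at the
+ * first segment number that fails to open.)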
+ */ + if (nfiles == 0) + { + /* free the memory */ + pfree(files); + + if (missing_ok) + return NULL; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m", + segment_name, name))); + } + + file = makeBufFileCommon(nfiles); + file->files = files; + file->readOnly = (mode == O_RDONLY); + file->fileset = fileset; + file->name = pstrdup(name); + + return file; +} + +/* + * Delete a BufFile that was created by BufFileCreateFileSet in the given + * FileSet using the given name. + * + * It is not necessary to delete files explicitly with this function. It is + * provided only as a way to delete files proactively, rather than waiting for + * the FileSet to be cleaned up. + * + * Only one backend should attempt to delete a given name, and should know + * that it exists and has been exported or closed otherwise missing_ok should + * be passed true. + */ +void +BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok) +{ + char segment_name[MAXPGPATH]; + int segment = 0; + bool found = false; + + /* + * We don't know how many segments the file has. We'll keep deleting + * until we run out. If we don't manage to find even an initial segment, + * raise an error. + */ + for (;;) + { + FileSetSegmentName(segment_name, name, segment); + if (!FileSetDelete(fileset, segment_name, true)) + break; + found = true; + ++segment; + + CHECK_FOR_INTERRUPTS(); + } + + if (!found && !missing_ok) + elog(ERROR, "could not delete unknown BufFile \"%s\"", name); +} + +/* + * BufFileExportFileSet --- flush and make read-only, in preparation for sharing. + */ +void +BufFileExportFileSet(BufFile *file) +{ + /* Must be a file belonging to a FileSet. */ + Assert(file->fileset != NULL); + + /* It's probably a bug if someone calls this twice. */ + Assert(!file->readOnly); + + BufFileFlush(file); + file->readOnly = true; +} + +/* + * Close a BufFile + * + * Like fclose(), this also implicitly FileCloses the underlying File. + */ +void +BufFileClose(BufFile *file) +{ + int i; + + /* flush any unwritten data */ + BufFileFlush(file); + /* close and delete the underlying file(s) */ + for (i = 0; i < file->numFiles; i++) + FileClose(file->files[i]); + /* release the buffer space */ + pfree(file->files); + pfree(file); +} + +/* + * BufFileLoadBuffer + * + * Load some data into buffer, if possible, starting from curOffset. + * At call, must have dirty = false, pos and nbytes = 0. + * On exit, nbytes is number of bytes loaded. + */ +static void +BufFileLoadBuffer(BufFile *file) +{ + File thisfile; + instr_time io_start; + instr_time io_time; + + /* + * Advance to next component file if necessary and possible. + */ + if (file->curOffset >= MAX_PHYSICAL_FILESIZE && + file->curFile + 1 < file->numFiles) + { + file->curFile++; + file->curOffset = 0; + } + + thisfile = file->files[file->curFile]; + + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + + /* + * Read whatever we can get, up to a full bufferload. 
+ */ + file->nbytes = FileRead(thisfile, + file->buffer.data, + sizeof(file->buffer), + file->curOffset, + WAIT_EVENT_BUFFILE_READ); + if (file->nbytes < 0) + { + file->nbytes = 0; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", + FilePathName(thisfile)))); + } + + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start); + } + + /* we choose not to advance curOffset here */ + + if (file->nbytes > 0) + pgBufferUsage.temp_blks_read++; +} + +/* + * BufFileDumpBuffer + * + * Dump buffer contents starting at curOffset. + * At call, should have dirty = true, nbytes > 0. + * On exit, dirty is cleared if successful write, and curOffset is advanced. + */ +static void +BufFileDumpBuffer(BufFile *file) +{ + int wpos = 0; + int bytestowrite; + File thisfile; + + /* + * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it + * crosses a component-file boundary; so we need a loop. + */ + while (wpos < file->nbytes) + { + off_t availbytes; + instr_time io_start; + instr_time io_time; + + /* + * Advance to next component file if necessary and possible. + */ + if (file->curOffset >= MAX_PHYSICAL_FILESIZE) + { + while (file->curFile + 1 >= file->numFiles) + extendBufFile(file); + file->curFile++; + file->curOffset = 0; + } + + /* + * Determine how much we need to write into this file. + */ + bytestowrite = file->nbytes - wpos; + availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset; + + if ((off_t) bytestowrite > availbytes) + bytestowrite = (int) availbytes; + + thisfile = file->files[file->curFile]; + + if (track_io_timing) + INSTR_TIME_SET_CURRENT(io_start); + else + INSTR_TIME_SET_ZERO(io_start); + + bytestowrite = FileWrite(thisfile, + file->buffer.data + wpos, + bytestowrite, + file->curOffset, + WAIT_EVENT_BUFFILE_WRITE); + if (bytestowrite <= 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", + FilePathName(thisfile)))); + + if (track_io_timing) + { + INSTR_TIME_SET_CURRENT(io_time); + INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start); + } + + file->curOffset += bytestowrite; + wpos += bytestowrite; + + pgBufferUsage.temp_blks_written++; + } + file->dirty = false; + + /* + * At this point, curOffset has been advanced to the end of the buffer, + * ie, its original value + nbytes. We need to make it point to the + * logical file position, ie, original value + pos, in case that is less + * (as could happen due to a small backwards seek in a dirty buffer!) + */ + file->curOffset -= (file->nbytes - file->pos); + if (file->curOffset < 0) /* handle possible segment crossing */ + { + file->curFile--; + Assert(file->curFile >= 0); + file->curOffset += MAX_PHYSICAL_FILESIZE; + } + + /* + * Now we can set the buffer empty without changing the logical position + */ + file->pos = 0; + file->nbytes = 0; +} + +/* + * BufFileRead variants + * + * Like fread() except we assume 1-byte element size and report I/O errors via + * ereport(). + * + * If 'exact' is true, then an error is also raised if the number of bytes + * read is not exactly 'size' (no short reads). If 'exact' and 'eofOK' are + * true, then reading zero bytes is ok. 
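+ *
+ * (Summary of how the wrappers below use these flags: BufFileRead() passes
+ * exact = false; BufFileReadExact() passes exact = true, eofOK = false; and
+ * BufFileReadMaybeEOF() passes exact = true with a caller-supplied eofOK.)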
+ */ +static size_t +BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK) +{ + size_t start_size = size; + size_t nread = 0; + size_t nthistime; + + BufFileFlush(file); + + while (size > 0) + { + if (file->pos >= file->nbytes) + { + /* Try to load more data into buffer. */ + file->curOffset += file->pos; + file->pos = 0; + file->nbytes = 0; + BufFileLoadBuffer(file); + if (file->nbytes <= 0) + break; /* no more data available */ + } + + nthistime = file->nbytes - file->pos; + if (nthistime > size) + nthistime = size; + Assert(nthistime > 0); + + memcpy(ptr, file->buffer.data + file->pos, nthistime); + + file->pos += nthistime; + ptr = (char *) ptr + nthistime; + size -= nthistime; + nread += nthistime; + } + + if (exact && + (nread != start_size && !(nread == 0 && eofOK))) + ereport(ERROR, + errcode_for_file_access(), + file->name ? + errmsg("could not read from file set \"%s\": read only %zu of %zu bytes", + file->name, nread, start_size) : + errmsg("could not read from temporary file: read only %zu of %zu bytes", + nread, start_size)); + + return nread; +} + +/* + * Legacy interface where the caller needs to check for end of file or short + * reads. + */ +size_t +BufFileRead(BufFile *file, void *ptr, size_t size) +{ + return BufFileReadCommon(file, ptr, size, false, false); +} + +/* + * Require read of exactly the specified size. + */ +void +BufFileReadExact(BufFile *file, void *ptr, size_t size) +{ + BufFileReadCommon(file, ptr, size, true, false); +} + +/* + * Require read of exactly the specified size, but optionally allow end of + * file (in which case 0 is returned). + */ +size_t +BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK) +{ + return BufFileReadCommon(file, ptr, size, true, eofOK); +} + +/* + * BufFileWrite + * + * Like fwrite() except we assume 1-byte element size and report errors via + * ereport(). + */ +void +BufFileWrite(BufFile *file, const void *ptr, size_t size) +{ + size_t nthistime; + + Assert(!file->readOnly); + + while (size > 0) + { + if (file->pos >= BLCKSZ) + { + /* Buffer full, dump it out */ + if (file->dirty) + BufFileDumpBuffer(file); + else + { + /* Hmm, went directly from reading to writing? */ + file->curOffset += file->pos; + file->pos = 0; + file->nbytes = 0; + } + } + + nthistime = BLCKSZ - file->pos; + if (nthistime > size) + nthistime = size; + Assert(nthistime > 0); + + memcpy(file->buffer.data + file->pos, ptr, nthistime); + + file->dirty = true; + file->pos += nthistime; + if (file->nbytes < file->pos) + file->nbytes = file->pos; + ptr = (const char *) ptr + nthistime; + size -= nthistime; + } +} + +/* + * BufFileFlush + * + * Like fflush(), except that I/O errors are reported with ereport(). + */ +static void +BufFileFlush(BufFile *file) +{ + if (file->dirty) + BufFileDumpBuffer(file); + + Assert(!file->dirty); +} + +/* + * BufFileSeek + * + * Like fseek(), except that target position needs two values in order to + * work when logical filesize exceeds maximum value representable by off_t. + * We do not support relative seeks across more than that, however. + * I/O errors are reported by ereport(). + * + * Result is 0 if OK, EOF if not. Logical position is not moved if an + * impossible seek is attempted. 
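+ *
+ * For example (numbers illustrative): since each component file holds at
+ * most MAX_PHYSICAL_FILESIZE = 1 GB, a logical position of 2.5 GB is
+ * addressed as fileno = 2, offset = 0.5 GB.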
+ */ +int +BufFileSeek(BufFile *file, int fileno, off_t offset, int whence) +{ + int newFile; + off_t newOffset; + + switch (whence) + { + case SEEK_SET: + if (fileno < 0) + return EOF; + newFile = fileno; + newOffset = offset; + break; + case SEEK_CUR: + + /* + * Relative seek considers only the signed offset, ignoring + * fileno. Note that large offsets (> 1 GB) risk overflow in this + * add, unless we have 64-bit off_t. + */ + newFile = file->curFile; + newOffset = (file->curOffset + file->pos) + offset; + break; + case SEEK_END: + + /* + * The file size of the last file gives us the end offset of that + * file. + */ + newFile = file->numFiles - 1; + newOffset = FileSize(file->files[file->numFiles - 1]); + if (newOffset < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m", + FilePathName(file->files[file->numFiles - 1]), + file->name))); + break; + default: + elog(ERROR, "invalid whence: %d", whence); + return EOF; + } + while (newOffset < 0) + { + if (--newFile < 0) + return EOF; + newOffset += MAX_PHYSICAL_FILESIZE; + } + if (newFile == file->curFile && + newOffset >= file->curOffset && + newOffset <= file->curOffset + file->nbytes) + { + /* + * Seek is to a point within existing buffer; we can just adjust + * pos-within-buffer, without flushing buffer. Note this is OK + * whether reading or writing, but buffer remains dirty if we were + * writing. + */ + file->pos = (int) (newOffset - file->curOffset); + return 0; + } + /* Otherwise, must reposition buffer, so flush any dirty data */ + BufFileFlush(file); + + /* + * At this point and no sooner, check for seek past last segment. The + * above flush could have created a new segment, so checking sooner would + * not work (at least not with this code). + */ + + /* convert seek to "start of next seg" to "end of last seg" */ + if (newFile == file->numFiles && newOffset == 0) + { + newFile--; + newOffset = MAX_PHYSICAL_FILESIZE; + } + while (newOffset > MAX_PHYSICAL_FILESIZE) + { + if (++newFile >= file->numFiles) + return EOF; + newOffset -= MAX_PHYSICAL_FILESIZE; + } + if (newFile >= file->numFiles) + return EOF; + /* Seek is OK! */ + file->curFile = newFile; + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + return 0; +} + +void +BufFileTell(BufFile *file, int *fileno, off_t *offset) +{ + *fileno = file->curFile; + *offset = file->curOffset + file->pos; +} + +/* + * BufFileSeekBlock --- block-oriented seek + * + * Performs absolute seek to the start of the n'th BLCKSZ-sized block of + * the file. Note that users of this interface will fail if their files + * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work + * with tables bigger than that, either... + * + * Result is 0 if OK, EOF if not. Logical position is not moved if an + * impossible seek is attempted. + */ +int +BufFileSeekBlock(BufFile *file, long blknum) +{ + return BufFileSeek(file, + (int) (blknum / BUFFILE_SEG_SIZE), + (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ, + SEEK_SET); +} + +#ifdef NOT_USED +/* + * BufFileTellBlock --- block-oriented tell + * + * Any fractional part of a block in the current seek position is ignored. + */ +long +BufFileTellBlock(BufFile *file) +{ + long blknum; + + blknum = (file->curOffset + file->pos) / BLCKSZ; + blknum += file->curFile * BUFFILE_SEG_SIZE; + return blknum; +} + +#endif + +/* + * Return the current fileset based BufFile size. + * + * Counts any holes left behind by BufFileAppend as part of the size. 
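+ * (For example, a fileset BufFile with three segment files whose last
+ * segment currently holds 10 MB reports 2 * MAX_PHYSICAL_FILESIZE + 10 MB.)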
+ * ereport()s on failure. + */ +int64 +BufFileSize(BufFile *file) +{ + int64 lastFileSize; + + Assert(file->fileset != NULL); + + /* Get the size of the last physical file. */ + lastFileSize = FileSize(file->files[file->numFiles - 1]); + if (lastFileSize < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m", + FilePathName(file->files[file->numFiles - 1]), + file->name))); + + return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) + + lastFileSize; +} + +/* + * Append the contents of source file (managed within fileset) to + * end of target file (managed within same fileset). + * + * Note that operation subsumes ownership of underlying resources from + * "source". Caller should never call BufFileClose against source having + * called here first. Resource owners for source and target must match, + * too. + * + * This operation works by manipulating lists of segment files, so the + * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned + * boundary, typically creating empty holes before the boundary. These + * areas do not contain any interesting data, and cannot be read from by + * caller. + * + * Returns the block number within target where the contents of source + * begins. Caller should apply this as an offset when working off block + * positions that are in terms of the original BufFile space. + */ +long +BufFileAppend(BufFile *target, BufFile *source) +{ + long startBlock = target->numFiles * BUFFILE_SEG_SIZE; + int newNumFiles = target->numFiles + source->numFiles; + int i; + + Assert(target->fileset != NULL); + Assert(source->readOnly); + Assert(!source->dirty); + Assert(source->fileset != NULL); + + if (target->resowner != source->resowner) + elog(ERROR, "could not append BufFile with non-matching resource owner"); + + target->files = (File *) + repalloc(target->files, sizeof(File) * newNumFiles); + for (i = target->numFiles; i < newNumFiles; i++) + target->files[i] = source->files[i - target->numFiles]; + target->numFiles = newNumFiles; + + return startBlock; +} + +/* + * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno + * and the offset. + */ +void +BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset) +{ + int numFiles = file->numFiles; + int newFile = fileno; + off_t newOffset = file->curOffset; + char segment_name[MAXPGPATH]; + int i; + + /* + * Loop over all the files up to the given fileno and remove the files + * that are greater than the fileno and truncate the given file up to the + * offset. Note that we also remove the given fileno if the offset is 0 + * provided it is not the first file in which we truncate it. + */ + for (i = file->numFiles - 1; i >= fileno; i--) + { + if ((i != fileno || offset == 0) && i != 0) + { + FileSetSegmentName(segment_name, file->name, i); + FileClose(file->files[i]); + if (!FileSetDelete(file->fileset, segment_name, true)) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not delete fileset \"%s\": %m", + segment_name))); + numFiles--; + newOffset = MAX_PHYSICAL_FILESIZE; + + /* + * This is required to indicate that we have deleted the given + * fileno. 
+ */ + if (i == fileno) + newFile--; + } + else + { + if (FileTruncate(file->files[i], offset, + WAIT_EVENT_BUFFILE_TRUNCATE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", + FilePathName(file->files[i])))); + newOffset = offset; + } + } + + file->numFiles = numFiles; + + /* + * If the truncate point is within existing buffer then we can just adjust + * pos within buffer. + */ + if (newFile == file->curFile && + newOffset >= file->curOffset && + newOffset <= file->curOffset + file->nbytes) + { + /* No need to reset the current pos if the new pos is greater. */ + if (newOffset <= file->curOffset + file->pos) + file->pos = (int) (newOffset - file->curOffset); + + /* Adjust the nbytes for the current buffer. */ + file->nbytes = (int) (newOffset - file->curOffset); + } + else if (newFile == file->curFile && + newOffset < file->curOffset) + { + /* + * The truncate point is within the existing file but prior to the + * current position, so we can forget the current buffer and reset the + * current position. + */ + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + } + else if (newFile < file->curFile) + { + /* + * The truncate point is prior to the current file, so need to reset + * the current position accordingly. + */ + file->curFile = newFile; + file->curOffset = newOffset; + file->pos = 0; + file->nbytes = 0; + } + /* Nothing to do, if the truncate point is beyond current file. */ +} diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c new file mode 100644 index 0000000..e04bc39 --- /dev/null +++ b/src/backend/storage/file/copydir.c @@ -0,0 +1,216 @@ +/*------------------------------------------------------------------------- + * + * copydir.c + * copies a directory + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * While "xcopy /e /i /q" works fine for copying directories, on Windows XP + * it requires a Window handle which prevents it from working when invoked + * as a service. + * + * IDENTIFICATION + * src/backend/storage/file/copydir.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include + +#include "common/file_utils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/copydir.h" +#include "storage/fd.h" + +/* + * copydir: copy a directory + * + * If recurse is false, subdirectories are ignored. Anything that's not + * a directory or a regular file is ignored. 
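+ *
+ * A typical call (paths purely illustrative) recursively copies one
+ * database directory into another:
+ *
+ *     copydir("base/1", "base/16384", true);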
+ */ +void +copydir(const char *fromdir, const char *todir, bool recurse) +{ + DIR *xldir; + struct dirent *xlde; + char fromfile[MAXPGPATH * 2]; + char tofile[MAXPGPATH * 2]; + + if (MakePGDirectory(todir) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create directory \"%s\": %m", todir))); + + xldir = AllocateDir(fromdir); + + while ((xlde = ReadDir(xldir, fromdir)) != NULL) + { + PGFileType xlde_type; + + /* If we got a cancel signal during the copy of the directory, quit */ + CHECK_FOR_INTERRUPTS(); + + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name); + snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); + + xlde_type = get_dirent_type(fromfile, xlde, false, ERROR); + + if (xlde_type == PGFILETYPE_DIR) + { + /* recurse to handle subdirectories */ + if (recurse) + copydir(fromfile, tofile, true); + } + else if (xlde_type == PGFILETYPE_REG) + copy_file(fromfile, tofile); + } + FreeDir(xldir); + + /* + * Be paranoid here and fsync all files to ensure the copy is really done. + * But if fsync is disabled, we're done. + */ + if (!enableFsync) + return; + + xldir = AllocateDir(todir); + + while ((xlde = ReadDir(xldir, todir)) != NULL) + { + if (strcmp(xlde->d_name, ".") == 0 || + strcmp(xlde->d_name, "..") == 0) + continue; + + snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name); + + /* + * We don't need to sync subdirectories here since the recursive + * copydir will do it before it returns + */ + if (get_dirent_type(tofile, xlde, false, ERROR) == PGFILETYPE_REG) + fsync_fname(tofile, false); + } + FreeDir(xldir); + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. Recent versions of ext4 have made the window much wider but + * it's been true for ext3 and other filesystems in the past. + */ + fsync_fname(todir, true); +} + +/* + * copy one file + */ +void +copy_file(const char *fromfile, const char *tofile) +{ + char *buffer; + int srcfd; + int dstfd; + int nbytes; + off_t offset; + off_t flush_offset; + + /* Size of copy buffer (read and write requests) */ +#define COPY_BUF_SIZE (8 * BLCKSZ) + + /* + * Size of data flush requests. It seems beneficial on most platforms to + * do this every 1MB or so. But macOS, at least with early releases of + * APFS, is really unfriendly to small mmap/msync requests, so there do it + * only every 32MB. + */ +#if defined(__darwin__) +#define FLUSH_DISTANCE (32 * 1024 * 1024) +#else +#define FLUSH_DISTANCE (1024 * 1024) +#endif + + /* Use palloc to ensure we get a maxaligned buffer */ + buffer = palloc(COPY_BUF_SIZE); + + /* + * Open the files + */ + srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY); + if (srcfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fromfile))); + + dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + if (dstfd < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", tofile))); + + /* + * Do the data copying. 
+ */ + flush_offset = 0; + for (offset = 0;; offset += nbytes) + { + /* If we got a cancel signal during the copy of the file, quit */ + CHECK_FOR_INTERRUPTS(); + + /* + * We fsync the files later, but during the copy, flush them every so + * often to avoid spamming the cache and hopefully get the kernel to + * start writing them out before the fsync comes. + */ + if (offset - flush_offset >= FLUSH_DISTANCE) + { + pg_flush_data(dstfd, flush_offset, offset - flush_offset); + flush_offset = offset; + } + + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ); + nbytes = read(srcfd, buffer, COPY_BUF_SIZE); + pgstat_report_wait_end(); + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read file \"%s\": %m", fromfile))); + if (nbytes == 0) + break; + errno = 0; + pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE); + if ((int) write(dstfd, buffer, nbytes) != nbytes) + { + /* if write didn't set errno, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", tofile))); + } + pgstat_report_wait_end(); + } + + if (offset > flush_offset) + pg_flush_data(dstfd, flush_offset, offset - flush_offset); + + if (CloseTransientFile(dstfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", tofile))); + + if (CloseTransientFile(srcfd) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fromfile))); + + pfree(buffer); +} diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c new file mode 100644 index 0000000..16b3e8f --- /dev/null +++ b/src/backend/storage/file/fd.c @@ -0,0 +1,3976 @@ +/*------------------------------------------------------------------------- + * + * fd.c + * Virtual file descriptor code. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/fd.c + * + * NOTES: + * + * This code manages a cache of 'virtual' file descriptors (VFDs). + * The server opens many file descriptors for a variety of reasons, + * including base tables, scratch files (e.g., sort and hash spool + * files), and random calls to C library routines like system(3); it + * is quite easy to exceed system limits on the number of open files a + * single process can have. (This is around 1024 on many modern + * operating systems, but may be lower on others.) + * + * VFDs are managed as an LRU pool, with actual OS file descriptors + * being opened and closed as needed. Obviously, if a routine is + * opened using these interfaces, all subsequent operations must also + * be through these interfaces (the File type is not a real file + * descriptor). + * + * For this scheme to work, most (if not all) routines throughout the + * server should use these interfaces instead of calling the C library + * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we + * may find ourselves short of real file descriptors anyway. + * + * INTERFACE ROUTINES + * + * PathNameOpenFile and OpenTemporaryFile are used to open virtual files. + * A File opened with OpenTemporaryFile is automatically deleted when the + * File is closed, either explicitly or implicitly at end of transaction or + * process exit. PathNameOpenFile is intended for files that are held open + * for a long time, like relation files. 
It is the caller's responsibility + * to close them, there is no automatic mechanism in fd.c for that. + * + * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage + * temporary files that have names so that they can be shared between + * backends. Such files are automatically closed and count against the + * temporary file limit of the backend that creates them, but unlike anonymous + * files they are not automatically deleted. See sharedfileset.c for a shared + * ownership mechanism that provides automatic cleanup for shared files when + * the last of a group of backends detaches. + * + * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are + * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively. + * They behave like the corresponding native functions, except that the handle + * is registered with the current subtransaction, and will be automatically + * closed at abort. These are intended mainly for short operations like + * reading a configuration file; there is a limit on the number of files that + * can be opened using these functions at any one time. + * + * Finally, BasicOpenFile is just a thin wrapper around open() that can + * release file descriptors in use by the virtual file descriptors if + * necessary. There is no automatic cleanup of file descriptors returned by + * BasicOpenFile, it is solely the caller's responsibility to close the file + * descriptor by calling close(2). + * + * If a non-virtual file descriptor needs to be held open for any length of + * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD + * (and eventually ReleaseExternalFD), so that we can take it into account + * while deciding how many VFDs can be open. This applies to FDs obtained + * with BasicOpenFile as well as those obtained without use of any fd.c API. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#include /* for getrlimit */ +#include +#include +#ifndef WIN32 +#include +#endif +#include +#include +#include + +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/pg_tablespace.h" +#include "common/file_perm.h" +#include "common/file_utils.h" +#include "common/pg_prng.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/mem.h" +#include "postmaster/startup.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "utils/guc.h" +#include "utils/guc_hooks.h" +#include "utils/resowner_private.h" +#include "utils/varlena.h" + +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(HAVE_SYNC_FILE_RANGE) +#define PG_FLUSH_DATA_WORKS 1 +#elif !defined(WIN32) && defined(MS_ASYNC) +#define PG_FLUSH_DATA_WORKS 1 +#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif + +/* + * We must leave some file descriptors free for system(), the dynamic loader, + * and other code that tries to open files without consulting fd.c. This + * is the number left free. (While we try fairly hard to prevent EMFILE + * errors, there's never any guarantee that we won't get ENFILE due to + * other processes chewing up FDs. So it's a bad idea to try to open files + * without consulting fd.c. Nonetheless we cannot control all code.) + * + * Because this is just a fixed setting, we are effectively assuming that + * no such code will leave FDs open over the long term; otherwise the slop + * is likely to be insufficient. 
Note in particular that we expect that + * loading a shared library does not result in any permanent increase in + * the number of open files. (This appears to be true on most if not + * all platforms as of Feb 2004.) + */ +#define NUM_RESERVED_FDS 10 + +/* + * If we have fewer than this many usable FDs after allowing for the reserved + * ones, choke. (This value is chosen to work with "ulimit -n 64", but not + * much less than that. Note that this value ensures numExternalFDs can be + * at least 16; as of this writing, the contrib/postgres_fdw regression tests + * will not pass unless that can grow to at least 14.) + */ +#define FD_MINFREE 48 + +/* + * A number of platforms allow individual processes to open many more files + * than they can really support when *many* processes do the same thing. + * This GUC parameter lets the DBA limit max_safe_fds to something less than + * what the postmaster's initial probe suggests will work. + */ +int max_files_per_process = 1000; + +/* + * Maximum number of file descriptors to open for operations that fd.c knows + * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized + * to a conservative value, and remains that way indefinitely in bootstrap or + * standalone-backend cases. In normal postmaster operation, the postmaster + * calls set_max_safe_fds() late in initialization to update the value, and + * that value is then inherited by forked subprocesses. + * + * Note: the value of max_files_per_process is taken into account while + * setting this variable, and so need not be tested separately. + */ +int max_safe_fds = FD_MINFREE; /* default if not changed */ + +/* Whether it is safe to continue running after fsync() fails. */ +bool data_sync_retry = false; + +/* How SyncDataDirectory() should do its job. */ +int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; + +/* Which kinds of files should be opened with PG_O_DIRECT. */ +int io_direct_flags; + +/* Debugging.... */ + +#ifdef FDDEBUG +#define DO_DB(A) \ + do { \ + int _do_db_save_errno = errno; \ + A; \ + errno = _do_db_save_errno; \ + } while (0) +#else +#define DO_DB(A) \ + ((void) 0) +#endif + +#define VFD_CLOSED (-1) + +#define FileIsValid(file) \ + ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL) + +#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED) + +/* these are the assigned bits in fdstate below: */ +#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */ +#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */ +#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */ + +typedef struct vfd +{ + int fd; /* current FD, or VFD_CLOSED if none */ + unsigned short fdstate; /* bitflags for VFD's state */ + ResourceOwner resowner; /* owner, for automatic cleanup */ + File nextFree; /* link to next free VFD, if in freelist */ + File lruMoreRecently; /* doubly linked recency-of-use list */ + File lruLessRecently; + off_t fileSize; /* current size of file (0 if not temporary) */ + char *fileName; /* name of file, or NULL for unused VFD */ + /* NB: fileName is malloc'd, and must be free'd when closing the VFD */ + int fileFlags; /* open(2) flags for (re)opening the file */ + mode_t fileMode; /* mode to pass to open(2) */ +} Vfd; + +/* + * Virtual File Descriptor array pointer and size. This grows as + * needed. 'File' values are indexes into this array. + * Note that VfdCache[0] is not a usable VFD, just a list header. 
+ */ +static Vfd *VfdCache; +static Size SizeVfdCache = 0; + +/* + * Number of file descriptors known to be in use by VFD entries. + */ +static int nfile = 0; + +/* + * Flag to tell whether it's worth scanning VfdCache looking for temp files + * to close + */ +static bool have_xact_temporary_files = false; + +/* + * Tracks the total size of all temporary files. Note: when temp_file_limit + * is being enforced, this cannot overflow since the limit cannot be more + * than INT_MAX kilobytes. When not enforcing, it could theoretically + * overflow, but we don't care. + */ +static uint64 temporary_files_size = 0; + +/* Temporary file access initialized and not yet shut down? */ +#ifdef USE_ASSERT_CHECKING +static bool temporary_files_allowed = false; +#endif + +/* + * List of OS handles opened with AllocateFile, AllocateDir and + * OpenTransientFile. + */ +typedef enum +{ + AllocateDescFile, + AllocateDescPipe, + AllocateDescDir, + AllocateDescRawFD +} AllocateDescKind; + +typedef struct +{ + AllocateDescKind kind; + SubTransactionId create_subid; + union + { + FILE *file; + DIR *dir; + int fd; + } desc; +} AllocateDesc; + +static int numAllocatedDescs = 0; +static int maxAllocatedDescs = 0; +static AllocateDesc *allocatedDescs = NULL; + +/* + * Number of open "external" FDs reported to Reserve/ReleaseExternalFD. + */ +static int numExternalFDs = 0; + +/* + * Number of temporary files opened during the current session; + * this is used in generation of tempfile names. + */ +static long tempFileCounter = 0; + +/* + * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid, + * indicating that the current database's default tablespace should be used.) + * When numTempTableSpaces is -1, this has not been set in the current + * transaction. + */ +static Oid *tempTableSpaces = NULL; +static int numTempTableSpaces = -1; +static int nextTempTableSpace = 0; + + +/*-------------------- + * + * Private Routines + * + * Delete - delete a file from the Lru ring + * LruDelete - remove a file from the Lru ring and close its FD + * Insert - put a file at the front of the Lru ring + * LruInsert - put a file at the front of the Lru ring and open it + * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring + * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit + * AllocateVfd - grab a free (or new) file record (from VfdCache) + * FreeVfd - free a file record + * + * The Least Recently Used ring is a doubly linked list that begins and + * ends on element zero. Element zero is special -- it doesn't represent + * a file and its "fd" field always == VFD_CLOSED. Element zero is just an + * anchor that shows us the beginning/end of the ring. + * Only VFD elements that are currently really open (have an FD assigned) are + * in the Lru ring. Elements that are "virtually" open can be recognized + * by having a non-null fileName field. 
+ * + * example: + * + * /--less----\ /---------\ + * v \ v \ + * #0 --more---> LeastRecentlyUsed --more-\ \ + * ^\ | | + * \\less--> MostRecentlyUsedFile <---/ | + * \more---/ \--less--/ + * + *-------------------- + */ +static void Delete(File file); +static void LruDelete(File file); +static void Insert(File file); +static int LruInsert(File file); +static bool ReleaseLruFile(void); +static void ReleaseLruFiles(void); +static File AllocateVfd(void); +static void FreeVfd(File file); + +static int FileAccess(File file); +static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); +static bool reserveAllocatedDesc(void); +static int FreeDesc(AllocateDesc *desc); + +static void BeforeShmemExit_Files(int code, Datum arg); +static void CleanupTempFiles(bool isCommit, bool isProcExit); +static void RemovePgTempRelationFiles(const char *tsdirname); +static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); + +static void walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel); +#ifdef PG_FLUSH_DATA_WORKS +static void pre_sync_fname(const char *fname, bool isdir, int elevel); +#endif +static void datadir_fsync_fname(const char *fname, bool isdir, int elevel); +static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel); + +static int fsync_parent_path(const char *fname, int elevel); + + +/* + * pg_fsync --- do fsync with or without writethrough + */ +int +pg_fsync(int fd) +{ +#if !defined(WIN32) && defined(USE_ASSERT_CHECKING) + struct stat st; + + /* + * Some operating system implementations of fsync() have requirements + * about the file access modes that were used when their file descriptor + * argument was opened, and these requirements differ depending on whether + * the file descriptor is for a directory. + * + * For any file descriptor that may eventually be handed to fsync(), we + * should have opened it with access modes that are compatible with + * fsync() on all supported systems, otherwise the code may not be + * portable, even if it runs ok on the current system. + * + * We assert here that a descriptor for a file was opened with write + * permissions (either O_RDWR or O_WRONLY) and for a directory without + * write permissions (O_RDONLY). + * + * Ignore any fstat errors and let the follow-up fsync() do its work. + * Doing this sanity check here counts for the case where fsync() is + * disabled. + */ + if (fstat(fd, &st) == 0) + { + int desc_flags = fcntl(fd, F_GETFL); + + /* + * O_RDONLY is historically 0, so just make sure that for directories + * no write flags are used. 
+ */ + if (S_ISDIR(st.st_mode)) + Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0); + else + Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0); + } + errno = 0; +#endif + + /* #if is to skip the sync_method test if there's no need for it */ +#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC) + if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH) + return pg_fsync_writethrough(fd); + else +#endif + return pg_fsync_no_writethrough(fd); +} + + +/* + * pg_fsync_no_writethrough --- same as fsync except does nothing if + * enableFsync is off + */ +int +pg_fsync_no_writethrough(int fd) +{ + int rc; + + if (!enableFsync) + return 0; + +retry: + rc = fsync(fd); + + if (rc == -1 && errno == EINTR) + goto retry; + + return rc; +} + +/* + * pg_fsync_writethrough + */ +int +pg_fsync_writethrough(int fd) +{ + if (enableFsync) + { +#ifdef WIN32 + return _commit(fd); +#elif defined(F_FULLFSYNC) + return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0; +#else + errno = ENOSYS; + return -1; +#endif + } + else + return 0; +} + +/* + * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off + */ +int +pg_fdatasync(int fd) +{ + int rc; + + if (!enableFsync) + return 0; + +retry: + rc = fdatasync(fd); + + if (rc == -1 && errno == EINTR) + goto retry; + + return rc; +} + +/* + * pg_flush_data --- advise OS that the described dirty data should be flushed + * + * offset of 0 with nbytes 0 means that the entire file should be flushed + */ +void +pg_flush_data(int fd, off_t offset, off_t nbytes) +{ + /* + * Right now file flushing is primarily used to avoid making later + * fsync()/fdatasync() calls have less impact. Thus don't trigger flushes + * if fsyncs are disabled - that's a decision we might want to make + * configurable at some point. + */ + if (!enableFsync) + return; + + /* + * We compile all alternatives that are supported on the current platform, + * to find portability problems more easily. + */ +#if defined(HAVE_SYNC_FILE_RANGE) + { + int rc; + static bool not_implemented_by_kernel = false; + + if (not_implemented_by_kernel) + return; + +retry: + + /* + * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, + * tells the OS that writeback for the specified blocks should be + * started, but that we don't want to wait for completion. Note that + * this call might block if too much dirty data exists in the range. + * This is the preferable method on OSs supporting it, as it works + * reliably when available (contrast to msync()) and doesn't flush out + * clean data (like FADV_DONTNEED). + */ + rc = sync_file_range(fd, offset, nbytes, + SYNC_FILE_RANGE_WRITE); + if (rc != 0) + { + int elevel; + + if (rc == EINTR) + goto retry; + + /* + * For systems that don't have an implementation of + * sync_file_range() such as Windows WSL, generate only one + * warning and then suppress all further attempts by this process. + */ + if (errno == ENOSYS) + { + elevel = WARNING; + not_implemented_by_kernel = true; + } + else + elevel = data_sync_elevel(WARNING); + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } +#endif +#if !defined(WIN32) && defined(MS_ASYNC) + { + void *p; + static int pagesize = 0; + + /* + * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers + * writeback. On linux it only does so if MS_SYNC is specified, but + * then it does the writeback synchronously. Luckily all common linux + * systems have sync_file_range(). 
This is preferable over + * FADV_DONTNEED because it doesn't flush out clean data. + * + * We map the file (mmap()), tell the kernel to sync back the contents + * (msync()), and then remove the mapping again (munmap()). + */ + + /* mmap() needs actual length if we want to map whole file */ + if (offset == 0 && nbytes == 0) + { + nbytes = lseek(fd, 0, SEEK_END); + if (nbytes < 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not determine dirty data size: %m"))); + return; + } + } + + /* + * Some platforms reject partial-page mmap() attempts. To deal with + * that, just truncate the request to a page boundary. If any extra + * bytes don't get flushed, well, it's only a hint anyway. + */ + + /* fetch pagesize only once */ + if (pagesize == 0) + pagesize = sysconf(_SC_PAGESIZE); + + /* align length to pagesize, dropping any fractional page */ + if (pagesize > 0) + nbytes = (nbytes / pagesize) * pagesize; + + /* fractional-page request is a no-op */ + if (nbytes <= 0) + return; + + /* + * mmap could well fail, particularly on 32-bit platforms where there + * may simply not be enough address space. If so, silently fall + * through to the next implementation. + */ + if (nbytes <= (off_t) SSIZE_MAX) + p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset); + else + p = MAP_FAILED; + + if (p != MAP_FAILED) + { + int rc; + + rc = msync(p, (size_t) nbytes, MS_ASYNC); + if (rc != 0) + { + ereport(data_sync_elevel(WARNING), + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + /* NB: need to fall through to munmap()! */ + } + + rc = munmap(p, (size_t) nbytes); + if (rc != 0) + { + /* FATAL error because mapping would remain */ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not munmap() while flushing data: %m"))); + } + + return; + } + } +#endif +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + { + int rc; + + /* + * Signal the kernel that the passed in range should not be cached + * anymore. This has the, desired, side effect of writing out dirty + * data, and the, undesired, side effect of likely discarding useful + * clean cached blocks. For the latter reason this is the least + * preferable method. + */ + + rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); + + if (rc != 0) + { + /* don't error out, this is just a performance optimization */ + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } +#endif +} + +/* + * Truncate an open file to a given length. + */ +static int +pg_ftruncate(int fd, off_t length) +{ + int ret; + +retry: + ret = ftruncate(fd, length); + + if (ret == -1 && errno == EINTR) + goto retry; + + return ret; +} + +/* + * Truncate a file to a given length by name. + */ +int +pg_truncate(const char *path, off_t length) +{ + int ret; +#ifdef WIN32 + int save_errno; + int fd; + + fd = OpenTransientFile(path, O_RDWR | PG_BINARY); + if (fd >= 0) + { + ret = pg_ftruncate(fd, length); + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + } + else + ret = -1; +#else + +retry: + ret = truncate(path, length); + + if (ret == -1 && errno == EINTR) + goto retry; +#endif + + return ret; +} + +/* + * fsync_fname -- fsync a file or directory, handling errors properly + * + * Try to fsync a file or directory. When doing the latter, ignore errors that + * indicate the OS just doesn't allow/require fsyncing directories. 
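+ *
+ * An illustrative call sequence (the names are placeholders), flushing a
+ * file and then the directory containing it:
+ *
+ *    fsync_fname(path_to_file, false);
+ *    fsync_fname(parent_directory, true);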
+ */ +void +fsync_fname(const char *fname, bool isdir) +{ + fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR)); +} + +/* + * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability + * + * This routine ensures that, after returning, the effect of renaming file + * persists in case of a crash. A crash while this routine is running will + * leave you with either the pre-existing or the moved file in place of the + * new file; no mixed state or truncated files are possible. + * + * It does so by using fsync on the old filename and the possibly existing + * target filename before the rename, and the target file and directory after. + * + * Note that rename() cannot be used across arbitrary directories, as they + * might not be on the same filesystem. Therefore this routine does not + * support renaming across directories. + * + * Log errors with the caller specified severity. + * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. + */ +int +durable_rename(const char *oldfile, const char *newfile, int elevel) +{ + int fd; + + /* + * First fsync the old and target path (if it exists), to ensure that they + * are properly persistent on disk. Syncing the target file is not + * strictly necessary, but it makes it easier to reason about crashes; + * because it's then guaranteed that either source or target file exists + * after a crash. + */ + if (fsync_fname_ext(oldfile, false, false, elevel) != 0) + return -1; + + fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR); + if (fd < 0) + { + if (errno != ENOENT) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", newfile))); + return -1; + } + } + else + { + if (pg_fsync(fd) != 0) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", newfile))); + return -1; + } + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", newfile))); + return -1; + } + } + + /* Time to do the real deal... */ + if (rename(oldfile, newfile) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not rename file \"%s\" to \"%s\": %m", + oldfile, newfile))); + return -1; + } + + /* + * To guarantee renaming the file is persistent, fsync the file with its + * new name, and its containing directory. + */ + if (fsync_fname_ext(newfile, false, false, elevel) != 0) + return -1; + + if (fsync_parent_path(newfile, elevel) != 0) + return -1; + + return 0; +} + +/* + * durable_unlink -- remove a file in a durable manner + * + * This routine ensures that, after returning, the effect of removing file + * persists in case of a crash. A crash while this routine is running will + * leave the system in no mixed state. + * + * It does so by using fsync on the parent directory of the file after the + * actual removal is done. + * + * Log errors with the severity specified by caller. + * + * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not + * valid upon return. 
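+ *
+ * A minimal illustrative pattern ("filename" is a placeholder):
+ *
+ *    if (durable_unlink(filename, LOG) != 0)
+ *        handle the failure (it has already been logged at LOG level);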
+ */ +int +durable_unlink(const char *fname, int elevel) +{ + if (unlink(fname) < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + fname))); + return -1; + } + + /* + * To guarantee that the removal of the file is persistent, fsync its + * parent directory. + */ + if (fsync_parent_path(fname, elevel) != 0) + return -1; + + return 0; +} + +/* + * InitFileAccess --- initialize this module during backend startup + * + * This is called during either normal or standalone backend start. + * It is *not* called in the postmaster. + * + * Note that this does not initialize temporary file access, that is + * separately initialized via InitTemporaryFileAccess(). + */ +void +InitFileAccess(void) +{ + Assert(SizeVfdCache == 0); /* call me only once */ + + /* initialize cache header entry */ + VfdCache = (Vfd *) malloc(sizeof(Vfd)); + if (VfdCache == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd)); + VfdCache->fd = VFD_CLOSED; + + SizeVfdCache = 1; +} + +/* + * InitTemporaryFileAccess --- initialize temporary file access during startup + * + * This is called during either normal or standalone backend start. + * It is *not* called in the postmaster. + * + * This is separate from InitFileAccess() because temporary file cleanup can + * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(), + * our reporting has to happen before that. Low level file access should be + * available for longer, hence the separate initialization / shutdown of + * temporary file handling. + */ +void +InitTemporaryFileAccess(void) +{ + Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */ + Assert(!temporary_files_allowed); /* call me only once */ + + /* + * Register before-shmem-exit hook to ensure temp files are dropped while + * we can still report stats. + */ + before_shmem_exit(BeforeShmemExit_Files, 0); + +#ifdef USE_ASSERT_CHECKING + temporary_files_allowed = true; +#endif +} + +/* + * count_usable_fds --- count how many FDs the system will let us open, + * and estimate how many are already open. + * + * We stop counting if usable_fds reaches max_to_probe. Note: a small + * value of max_to_probe might result in an underestimate of already_open; + * we must fill in any "gaps" in the set of used FDs before the calculation + * of already_open will give the right answer. In practice, max_to_probe + * of a couple of dozen should be enough to ensure good results. + * + * We assume stderr (FD 2) is available for dup'ing. While the calling + * script could theoretically close that, it would be a really bad idea, + * since then one risks loss of error messages from, e.g., libc. 
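+ *
+ * A worked (hypothetical) example: if probing succeeds in 60 dup(2) calls
+ * and the highest FD number seen is 73, we report *usable_fds = 60 and
+ * *already_open = 73 + 1 - 60 = 14.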
+ */ +static void +count_usable_fds(int max_to_probe, int *usable_fds, int *already_open) +{ + int *fd; + int size; + int used = 0; + int highestfd = 0; + int j; + +#ifdef HAVE_GETRLIMIT + struct rlimit rlim; + int getrlimit_status; +#endif + + size = 1024; + fd = (int *) palloc(size * sizeof(int)); + +#ifdef HAVE_GETRLIMIT + getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim); + if (getrlimit_status != 0) + ereport(WARNING, (errmsg("getrlimit failed: %m"))); +#endif /* HAVE_GETRLIMIT */ + + /* dup until failure or probe limit reached */ + for (;;) + { + int thisfd; + +#ifdef HAVE_GETRLIMIT + + /* + * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on + * some platforms + */ + if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1) + break; +#endif + + thisfd = dup(2); + if (thisfd < 0) + { + /* Expect EMFILE or ENFILE, else it's fishy */ + if (errno != EMFILE && errno != ENFILE) + elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used); + break; + } + + if (used >= size) + { + size *= 2; + fd = (int *) repalloc(fd, size * sizeof(int)); + } + fd[used++] = thisfd; + + if (highestfd < thisfd) + highestfd = thisfd; + + if (used >= max_to_probe) + break; + } + + /* release the files we opened */ + for (j = 0; j < used; j++) + close(fd[j]); + + pfree(fd); + + /* + * Return results. usable_fds is just the number of successful dups. We + * assume that the system limit is highestfd+1 (remember 0 is a legal FD + * number) and so already_open is highestfd+1 - usable_fds. + */ + *usable_fds = used; + *already_open = highestfd + 1 - used; +} + +/* + * set_max_safe_fds + * Determine number of file descriptors that fd.c is allowed to use + */ +void +set_max_safe_fds(void) +{ + int usable_fds; + int already_open; + + /*---------- + * We want to set max_safe_fds to + * MIN(usable_fds, max_files_per_process - already_open) + * less the slop factor for files that are opened without consulting + * fd.c. This ensures that we won't exceed either max_files_per_process + * or the experimentally-determined EMFILE limit. + *---------- + */ + count_usable_fds(max_files_per_process, + &usable_fds, &already_open); + + max_safe_fds = Min(usable_fds, max_files_per_process - already_open); + + /* + * Take off the FDs reserved for system() etc. + */ + max_safe_fds -= NUM_RESERVED_FDS; + + /* + * Make sure we still have enough to get by. + */ + if (max_safe_fds < FD_MINFREE) + ereport(FATAL, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("insufficient file descriptors available to start server process"), + errdetail("System allows %d, server needs at least %d.", + max_safe_fds + NUM_RESERVED_FDS, + FD_MINFREE + NUM_RESERVED_FDS))); + + elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d", + max_safe_fds, usable_fds, already_open); +} + +/* + * Open a file with BasicOpenFilePerm() and pass default file mode for the + * fileMode parameter. + */ +int +BasicOpenFile(const char *fileName, int fileFlags) +{ + return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed + * + * This is exported for use by places that really want a plain kernel FD, + * but need to be proof against running out of FDs. Once an FD has been + * successfully returned, it is the caller's responsibility to ensure that + * it will not be leaked on ereport()! 
Most users should *not* call this + * routine directly, but instead use the VFD abstraction level, which + * provides protection against descriptor leaks as well as management of + * files that need to be open for more than a short period of time. + * + * Ideally this should be the *only* direct call of open() in the backend. + * In practice, the postmaster calls open() directly, and there are some + * direct open() calls done early in backend startup. Those are OK since + * this module wouldn't have any open files to close at that point anyway. + */ +int +BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + int fd; + +tryAgain: +#ifdef PG_O_DIRECT_USE_F_NOCACHE + + /* + * The value we defined to stand in for O_DIRECT when simulating it with + * F_NOCACHE had better not collide with any of the standard flags. + */ + StaticAssertStmt((PG_O_DIRECT & + (O_APPEND | + O_CLOEXEC | + O_CREAT | + O_DSYNC | + O_EXCL | + O_RDWR | + O_RDONLY | + O_SYNC | + O_TRUNC | + O_WRONLY)) == 0, + "PG_O_DIRECT value collides with standard flag"); + fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode); +#else + fd = open(fileName, fileFlags, fileMode); +#endif + + if (fd >= 0) + { +#ifdef PG_O_DIRECT_USE_F_NOCACHE + if (fileFlags & PG_O_DIRECT) + { + if (fcntl(fd, F_NOCACHE, 1) < 0) + { + int save_errno = errno; + + close(fd); + errno = save_errno; + return -1; + } + } +#endif + + return fd; /* success! */ + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto tryAgain; + errno = save_errno; + } + + return -1; /* failure */ +} + +/* + * AcquireExternalFD - attempt to reserve an external file descriptor + * + * This should be used by callers that need to hold a file descriptor open + * over more than a short interval, but cannot use any of the other facilities + * provided by this module. + * + * The difference between this and the underlying ReserveExternalFD function + * is that this will report failure (by setting errno and returning false) + * if "too many" external FDs are already reserved. This should be used in + * any code where the total number of FDs to be reserved is not predictable + * and small. + */ +bool +AcquireExternalFD(void) +{ + /* + * We don't want more than max_safe_fds / 3 FDs to be consumed for + * "external" FDs. + */ + if (numExternalFDs < max_safe_fds / 3) + { + ReserveExternalFD(); + return true; + } + errno = EMFILE; + return false; +} + +/* + * ReserveExternalFD - report external consumption of a file descriptor + * + * This should be used by callers that need to hold a file descriptor open + * over more than a short interval, but cannot use any of the other facilities + * provided by this module. This just tracks the use of the FD and closes + * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available. + * + * Call this directly only in code where failure to reserve the FD would be + * fatal; for example, the WAL-writing code does so, since the alternative is + * session failure. Also, it's very unwise to do so in code that could + * consume more than one FD per process. + * + * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain + * available, it doesn't matter too much whether this is called before or + * after actually opening the FD; but doing so beforehand reduces the risk of + * an EMFILE failure if not everybody played nice. 
In any case, it's solely + * caller's responsibility to keep the external-FD count in sync with reality. + */ +void +ReserveExternalFD(void) +{ + /* + * Release VFDs if needed to stay safe. Because we do this before + * incrementing numExternalFDs, the final state will be as desired, i.e., + * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds. + */ + ReleaseLruFiles(); + + numExternalFDs++; +} + +/* + * ReleaseExternalFD - report release of an external file descriptor + * + * This is guaranteed not to change errno, so it can be used in failure paths. + */ +void +ReleaseExternalFD(void) +{ + Assert(numExternalFDs > 0); + numExternalFDs--; +} + + +#if defined(FDDEBUG) + +static void +_dump_lru(void) +{ + int mru = VfdCache[0].lruLessRecently; + Vfd *vfdP = &VfdCache[mru]; + char buf[2048]; + + snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru); + while (mru != 0) + { + mru = vfdP->lruLessRecently; + vfdP = &VfdCache[mru]; + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru); + } + snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST"); + elog(LOG, "%s", buf); +} +#endif /* FDDEBUG */ + +static void +Delete(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "Delete %d (%s)", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently; + VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently; + + DO_DB(_dump_lru()); +} + +static void +LruDelete(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "LruDelete %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + /* + * Close the file. We aren't expecting this to fail; if it does, better + * to leak the FD than to mess up our internal state. + */ + if (close(vfdP->fd) != 0) + elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), + "could not close file \"%s\": %m", vfdP->fileName); + vfdP->fd = VFD_CLOSED; + --nfile; + + /* delete the vfd record from the LRU ring */ + Delete(file); +} + +static void +Insert(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "Insert %d (%s)", + file, VfdCache[file].fileName)); + DO_DB(_dump_lru()); + + vfdP = &VfdCache[file]; + + vfdP->lruMoreRecently = 0; + vfdP->lruLessRecently = VfdCache[0].lruLessRecently; + VfdCache[0].lruLessRecently = file; + VfdCache[vfdP->lruLessRecently].lruMoreRecently = file; + + DO_DB(_dump_lru()); +} + +/* returns 0 on success, -1 on re-open failure (with errno set) */ +static int +LruInsert(File file) +{ + Vfd *vfdP; + + Assert(file != 0); + + DO_DB(elog(LOG, "LruInsert %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (FileIsNotOpen(file)) + { + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + /* + * The open could still fail for lack of file descriptors, eg due to + * overall system file table being full. So, be prepared to release + * another FD if necessary... + */ + vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags, + vfdP->fileMode); + if (vfdP->fd < 0) + { + DO_DB(elog(LOG, "re-open failed: %m")); + return -1; + } + else + { + ++nfile; + } + } + + /* + * put it at the head of the Lru ring + */ + + Insert(file); + + return 0; +} + +/* + * Release one kernel FD by closing the least-recently-used VFD. + */ +static bool +ReleaseLruFile(void) +{ + DO_DB(elog(LOG, "ReleaseLruFile. 
Opened %d", nfile)); + + if (nfile > 0) + { + /* + * There are opened files and so there should be at least one used vfd + * in the ring. + */ + Assert(VfdCache[0].lruMoreRecently != 0); + LruDelete(VfdCache[0].lruMoreRecently); + return true; /* freed a file */ + } + return false; /* no files available to free */ +} + +/* + * Release kernel FDs as needed to get under the max_safe_fds limit. + * After calling this, it's OK to try to open another file. + */ +static void +ReleaseLruFiles(void) +{ + while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds) + { + if (!ReleaseLruFile()) + break; + } +} + +static File +AllocateVfd(void) +{ + Index i; + File file; + + DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache)); + + Assert(SizeVfdCache > 0); /* InitFileAccess not called? */ + + if (VfdCache[0].nextFree == 0) + { + /* + * The free list is empty so it is time to increase the size of the + * array. We choose to double it each time this happens. However, + * there's not much point in starting *real* small. + */ + Size newCacheSize = SizeVfdCache * 2; + Vfd *newVfdCache; + + if (newCacheSize < 32) + newCacheSize = 32; + + /* + * Be careful not to clobber VfdCache ptr if realloc fails. + */ + newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize); + if (newVfdCache == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + VfdCache = newVfdCache; + + /* + * Initialize the new entries and link them into the free list. + */ + for (i = SizeVfdCache; i < newCacheSize; i++) + { + MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd)); + VfdCache[i].nextFree = i + 1; + VfdCache[i].fd = VFD_CLOSED; + } + VfdCache[newCacheSize - 1].nextFree = 0; + VfdCache[0].nextFree = SizeVfdCache; + + /* + * Record the new size + */ + SizeVfdCache = newCacheSize; + } + + file = VfdCache[0].nextFree; + + VfdCache[0].nextFree = VfdCache[file].nextFree; + + return file; +} + +static void +FreeVfd(File file) +{ + Vfd *vfdP = &VfdCache[file]; + + DO_DB(elog(LOG, "FreeVfd: %d (%s)", + file, vfdP->fileName ? vfdP->fileName : "")); + + if (vfdP->fileName != NULL) + { + free(vfdP->fileName); + vfdP->fileName = NULL; + } + vfdP->fdstate = 0x0; + + vfdP->nextFree = VfdCache[0].nextFree; + VfdCache[0].nextFree = file; +} + +/* returns 0 on success, -1 on re-open failure (with errno set) */ +static int +FileAccess(File file) +{ + int returnValue; + + DO_DB(elog(LOG, "FileAccess %d (%s)", + file, VfdCache[file].fileName)); + + /* + * Is the file open? If not, open it and put it at the head of the LRU + * ring (possibly closing the least recently used file to get an FD). + */ + + if (FileIsNotOpen(file)) + { + returnValue = LruInsert(file); + if (returnValue != 0) + return returnValue; + } + else if (VfdCache[0].lruLessRecently != file) + { + /* + * We now know that the file is open and that it is not the last one + * accessed, so we need to move it to the head of the Lru ring. + */ + + Delete(file); + Insert(file); + } + + return 0; +} + +/* + * Called whenever a temporary file is deleted to report its size. + */ +static void +ReportTemporaryFileUsage(const char *path, off_t size) +{ + pgstat_report_tempfile(size); + + if (log_temp_files >= 0) + { + if ((size / 1024) >= log_temp_files) + ereport(LOG, + (errmsg("temporary file: path \"%s\", size %lu", + path, (unsigned long) size))); + } +} + +/* + * Called to register a temporary file for automatic close. + * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called + * before the file was opened. 
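+ *
+ * The expected call order is thus roughly (cf. PathNameCreateTemporaryFile
+ * below):
+ *
+ *    ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+ *    file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ *    RegisterTemporaryFile(file);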
+ */ +static void +RegisterTemporaryFile(File file) +{ + ResourceOwnerRememberFile(CurrentResourceOwner, file); + VfdCache[file].resowner = CurrentResourceOwner; + + /* Backup mechanism for closing at end of xact. */ + VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT; + have_xact_temporary_files = true; +} + +/* + * Called when we get a shared invalidation message on some relation. + */ +#ifdef NOT_USED +void +FileInvalidate(File file) +{ + Assert(FileIsValid(file)); + if (!FileIsNotOpen(file)) + LruDelete(file); +} +#endif + +/* + * Open a file with PathNameOpenFilePerm() and pass default file mode for the + * fileMode parameter. + */ +File +PathNameOpenFile(const char *fileName, int fileFlags) +{ + return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * open a file in an arbitrary directory + * + * NB: if the passed pathname is relative (which it usually is), + * it will be interpreted relative to the process' working directory + * (which should always be $PGDATA when this code is running). + */ +File +PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + char *fnamecopy; + File file; + Vfd *vfdP; + + DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o", + fileName, fileFlags, fileMode)); + + /* + * We need a malloc'd copy of the file name; fail cleanly if no room. + */ + fnamecopy = strdup(fileName); + if (fnamecopy == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + file = AllocateVfd(); + vfdP = &VfdCache[file]; + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + /* + * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The + * client shouldn't be expected to know which kernel descriptors are + * currently open, so it wouldn't make sense for them to be inherited by + * executed subprograms. + */ + fileFlags |= O_CLOEXEC; + + vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); + + if (vfdP->fd < 0) + { + int save_errno = errno; + + FreeVfd(file); + free(fnamecopy); + errno = save_errno; + return -1; + } + ++nfile; + DO_DB(elog(LOG, "PathNameOpenFile: success %d", + vfdP->fd)); + + vfdP->fileName = fnamecopy; + /* Saved flags are adjusted to be OK for re-opening file */ + vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL); + vfdP->fileMode = fileMode; + vfdP->fileSize = 0; + vfdP->fdstate = 0x0; + vfdP->resowner = NULL; + + Insert(file); + + return file; +} + +/* + * Create directory 'directory'. If necessary, create 'basedir', which must + * be the directory above it. This is designed for creating the top-level + * temporary directory on demand before creating a directory underneath it. + * Do nothing if the directory already exists. + * + * Directories created within the top-level temporary directory should begin + * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and + * deleted at startup by RemovePgTempFiles(). Further subdirectories below + * that do not need any particular prefix. +*/ +void +PathNameCreateTemporaryDir(const char *basedir, const char *directory) +{ + if (MakePGDirectory(directory) < 0) + { + if (errno == EEXIST) + return; + + /* + * Failed. Try to create basedir first in case it's missing. Tolerate + * EEXIST to close a race against another process following the same + * algorithm. + */ + if (MakePGDirectory(basedir) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary directory \"%s\": %m", + basedir))); + + /* Try again. 
*/ + if (MakePGDirectory(directory) < 0 && errno != EEXIST) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("cannot create temporary subdirectory \"%s\": %m", + directory))); + } +} + +/* + * Delete a directory and everything in it, if it exists. + */ +void +PathNameDeleteTemporaryDir(const char *dirname) +{ + struct stat statbuf; + + /* Silently ignore missing directory. */ + if (stat(dirname, &statbuf) != 0 && errno == ENOENT) + return; + + /* + * Currently, walkdir doesn't offer a way for our passed in function to + * maintain state. Perhaps it should, so that we could tell the caller + * whether this operation succeeded or failed. Since this operation is + * used in a cleanup path, we wouldn't actually behave differently: we'll + * just log failures. + */ + walkdir(dirname, unlink_if_exists_fname, false, LOG); +} + +/* + * Open a temporary file that will disappear when we close it. + * + * This routine takes care of generating an appropriate tempfile name. + * There's no need to pass in fileFlags or fileMode either, since only + * one setting makes any sense for a temp file. + * + * Unless interXact is true, the file is remembered by CurrentResourceOwner + * to ensure it's closed and deleted when it's no longer needed, typically at + * the end-of-transaction. In most cases, you don't want temporary files to + * outlive the transaction that created them, so this should be false -- but + * if you need "somewhat" temporary storage, this might be useful. In either + * case, the file is removed when the File is explicitly closed. + */ +File +OpenTemporaryFile(bool interXact) +{ + File file = 0; + + Assert(temporary_files_allowed); /* check temp file access is up */ + + /* + * Make sure the current resource owner has space for this File before we + * open it, if we'll be registering it below. + */ + if (!interXact) + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * If some temp tablespace(s) have been given to us, try to use the next + * one. If a given tablespace can't be found, we silently fall back to + * the database's default tablespace. + * + * BUT: if the temp file is slated to outlive the current transaction, + * force it into the database's default tablespace, so that it will not + * pose a threat to possible tablespace drop attempts. + */ + if (numTempTableSpaces > 0 && !interXact) + { + Oid tblspcOid = GetNextTempTableSpace(); + + if (OidIsValid(tblspcOid)) + file = OpenTemporaryFileInTablespace(tblspcOid, false); + } + + /* + * If not, or if tablespace is bad, create in database's default + * tablespace. MyDatabaseTableSpace should normally be set before we get + * here, but just in case it isn't, fall back to pg_default tablespace. + */ + if (file <= 0) + file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ? + MyDatabaseTableSpace : + DEFAULTTABLESPACE_OID, + true); + + /* Mark it for deletion at close and temporary file size limit */ + VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT; + + /* Register it with the current resource owner */ + if (!interXact) + RegisterTemporaryFile(file); + + return file; +} + +/* + * Return the path of the temp directory in a given tablespace. + */ +void +TempTablespacePath(char *path, Oid tablespace) +{ + /* + * Identify the tempfile directory for this tablespace. + * + * If someone tries to specify pg_global, use pg_default instead. 
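+ *
+ * For example (with an illustrative OID), the default tablespace yields a
+ * path of the form "base/" PG_TEMP_FILES_DIR, while a user tablespace with
+ * OID 16385 yields "pg_tblspc/16385/<TABLESPACE_VERSION_DIRECTORY>/"
+ * PG_TEMP_FILES_DIR.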
+ */ + if (tablespace == InvalidOid || + tablespace == DEFAULTTABLESPACE_OID || + tablespace == GLOBALTABLESPACE_OID) + snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR); + else + { + /* All other tablespaces are accessed via symlinks */ + snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s", + tablespace, TABLESPACE_VERSION_DIRECTORY, + PG_TEMP_FILES_DIR); + } +} + +/* + * Open a temporary file in a specific tablespace. + * Subroutine for OpenTemporaryFile, which see for details. + */ +static File +OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError) +{ + char tempdirpath[MAXPGPATH]; + char tempfilepath[MAXPGPATH]; + File file; + + TempTablespacePath(tempdirpath, tblspcOid); + + /* + * Generate a tempfile name that should be unique within the current + * database instance. + */ + snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld", + tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(tempfilepath, + O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + /* + * We might need to create the tablespace's tempfile directory, if no + * one has yet done so. + * + * Don't check for an error from MakePGDirectory; it could fail if + * someone else just did the same thing. If it doesn't work then + * we'll bomb out on the second create attempt, instead. + */ + (void) MakePGDirectory(tempdirpath); + + file = PathNameOpenFile(tempfilepath, + O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0 && rejectError) + elog(ERROR, "could not create temporary file \"%s\": %m", + tempfilepath); + } + + return file; +} + + +/* + * Create a new file. The directory containing it must already exist. Files + * created this way are subject to temp_file_limit and are automatically + * closed at end of transaction, but are not automatically deleted on close + * because they are intended to be shared between cooperating backends. + * + * If the file is inside the top-level temporary directory, its name should + * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary + * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be + * inside a directory created with PathNameCreateTemporaryDir(), in which case + * the prefix isn't needed. + */ +File +PathNameCreateTemporaryFile(const char *path, bool error_on_failure) +{ + File file; + + Assert(temporary_files_allowed); /* check temp file access is up */ + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + /* + * Open the file. Note: we don't use O_EXCL, in case there is an orphaned + * temp file that can be reused. + */ + file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY); + if (file <= 0) + { + if (error_on_failure) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create temporary file \"%s\": %m", + path))); + else + return file; + } + + /* Mark it for temp_file_limit accounting. */ + VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT; + + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + + return file; +} + +/* + * Open a file that was created with PathNameCreateTemporaryFile, possibly in + * another backend. Files opened this way don't count against the + * temp_file_limit of the caller, are automatically closed at the end of the + * transaction but are not deleted on close. 
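+ *
+ * Illustrative use by a cooperating backend ("path" is a placeholder):
+ *
+ *    file = PathNameOpenTemporaryFile(path, O_RDONLY);
+ *    if (file <= 0)
+ *        the file did not exist (any other error would have been raised);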
+ */ +File +PathNameOpenTemporaryFile(const char *path, int mode) +{ + File file; + + Assert(temporary_files_allowed); /* check temp file access is up */ + + ResourceOwnerEnlargeFiles(CurrentResourceOwner); + + file = PathNameOpenFile(path, mode | PG_BINARY); + + /* If no such file, then we don't raise an error. */ + if (file <= 0 && errno != ENOENT) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open temporary file \"%s\": %m", + path))); + + if (file > 0) + { + /* Register it for automatic close. */ + RegisterTemporaryFile(file); + } + + return file; +} + +/* + * Delete a file by pathname. Return true if the file existed, false if + * didn't. + */ +bool +PathNameDeleteTemporaryFile(const char *path, bool error_on_failure) +{ + struct stat filestats; + int stat_errno; + + /* Get the final size for pgstat reporting. */ + if (stat(path, &filestats) != 0) + stat_errno = errno; + else + stat_errno = 0; + + /* + * Unlike FileClose's automatic file deletion code, we tolerate + * non-existence to support BufFileDeleteFileSet which doesn't know how + * many segments it has to delete until it runs out. + */ + if (stat_errno == ENOENT) + return false; + + if (unlink(path) < 0) + { + if (errno != ENOENT) + ereport(error_on_failure ? ERROR : LOG, + (errcode_for_file_access(), + errmsg("could not unlink temporary file \"%s\": %m", + path))); + return false; + } + + if (stat_errno == 0) + ReportTemporaryFileUsage(path, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", path))); + } + + return true; +} + +/* + * close a file when done with it + */ +void +FileClose(File file) +{ + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileClose: %d (%s)", + file, VfdCache[file].fileName)); + + vfdP = &VfdCache[file]; + + if (!FileIsNotOpen(file)) + { + /* close the file */ + if (close(vfdP->fd) != 0) + { + /* + * We may need to panic on failure to close non-temporary files; + * see LruDelete. + */ + elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG), + "could not close file \"%s\": %m", vfdP->fileName); + } + + --nfile; + vfdP->fd = VFD_CLOSED; + + /* remove the file from the lru ring */ + Delete(file); + } + + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + /* Subtract its size from current usage (do first in case of error) */ + temporary_files_size -= vfdP->fileSize; + vfdP->fileSize = 0; + } + + /* + * Delete the file if it was temporary, and make a log entry if wanted + */ + if (vfdP->fdstate & FD_DELETE_AT_CLOSE) + { + struct stat filestats; + int stat_errno; + + /* + * If we get an error, as could happen within the ereport/elog calls, + * we'll come right back here during transaction abort. Reset the + * flag to ensure that we can't get into an infinite loop. This code + * is arranged to ensure that the worst-case consequence is failing to + * emit log message(s), not failing to attempt the unlink. 
+ */ + vfdP->fdstate &= ~FD_DELETE_AT_CLOSE; + + + /* first try the stat() */ + if (stat(vfdP->fileName, &filestats)) + stat_errno = errno; + else + stat_errno = 0; + + /* in any case do the unlink */ + if (unlink(vfdP->fileName)) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not delete file \"%s\": %m", vfdP->fileName))); + + /* and last report the stat results */ + if (stat_errno == 0) + ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size); + else + { + errno = stat_errno; + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", vfdP->fileName))); + } + } + + /* Unregister it from the resource owner */ + if (vfdP->resowner) + ResourceOwnerForgetFile(vfdP->resowner, file); + + /* + * Return the Vfd slot to the free list + */ + FreeVfd(file); +} + +/* + * FilePrefetch - initiate asynchronous read of a given range of the file. + * + * Currently the only implementation of this function is using posix_fadvise + * which is the simplest standardized interface that accomplishes this. + * We could add an implementation using libaio in the future; but note that + * this API is inappropriate for libaio, which wants to have a buffer provided + * to read into. + */ +int +FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = posix_fadvise(VfdCache[file].fd, offset, amount, + POSIX_FADV_WILLNEED); + pgstat_report_wait_end(); + + if (returnCode == EINTR) + goto retry; + + return returnCode; +#else + Assert(FileIsValid(file)); + return 0; +#endif +} + +void +FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) nbytes)); + + if (nbytes <= 0) + return; + + if (VfdCache[file].fileFlags & PG_O_DIRECT) + return; + + returnCode = FileAccess(file); + if (returnCode < 0) + return; + + pgstat_report_wait_start(wait_event_info); + pg_flush_data(VfdCache[file].fd, offset, nbytes); + pgstat_report_wait_end(); +} + +int +FileRead(File file, void *buffer, size_t amount, off_t offset, + uint32 wait_event_info) +{ + int returnCode; + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p", + file, VfdCache[file].fileName, + (int64) offset, + amount, buffer)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + vfdP = &VfdCache[file]; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = pg_pread(vfdP->fd, buffer, amount, offset); + pgstat_report_wait_end(); + + if (returnCode < 0) + { + /* + * Windows may run out of kernel buffers and return "Insufficient + * system resources" error. Wait a bit and retry to solve it. + * + * It is rumored that EINTR is also possible on some Unix filesystems, + * in which case immediate retry is indicated. 
+ */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + } + + return returnCode; +} + +int +FileWrite(File file, const void *buffer, size_t amount, off_t offset, + uint32 wait_event_info) +{ + int returnCode; + Vfd *vfdP; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p", + file, VfdCache[file].fileName, + (int64) offset, + amount, buffer)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + vfdP = &VfdCache[file]; + + /* + * If enforcing temp_file_limit and it's a temp file, check to see if the + * write would overrun temp_file_limit, and throw error if so. Note: it's + * really a modularity violation to throw error here; we should set errno + * and return -1. However, there's no way to report a suitable error + * message if we do that. All current callers would just throw error + * immediately anyway, so this is safe at present. + */ + if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT)) + { + off_t past_write = offset + amount; + + if (past_write > vfdP->fileSize) + { + uint64 newTotal = temporary_files_size; + + newTotal += past_write - vfdP->fileSize; + if (newTotal > (uint64) temp_file_limit * (uint64) 1024) + ereport(ERROR, + (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED), + errmsg("temporary file size exceeds temp_file_limit (%dkB)", + temp_file_limit))); + } + } + +retry: + errno = 0; + pgstat_report_wait_start(wait_event_info); + returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset); + pgstat_report_wait_end(); + + /* if write didn't set errno, assume problem is no disk space */ + if (returnCode != amount && errno == 0) + errno = ENOSPC; + + if (returnCode >= 0) + { + /* + * Maintain fileSize and temporary_files_size if it's a temp file. + */ + if (vfdP->fdstate & FD_TEMP_FILE_LIMIT) + { + off_t past_write = offset + amount; + + if (past_write > vfdP->fileSize) + { + temporary_files_size += past_write - vfdP->fileSize; + vfdP->fileSize = past_write; + } + } + } + else + { + /* + * See comments in FileRead() + */ +#ifdef WIN32 + DWORD error = GetLastError(); + + switch (error) + { + case ERROR_NO_SYSTEM_RESOURCES: + pg_usleep(1000L); + errno = EINTR; + break; + default: + _dosmaperr(error); + break; + } +#endif + /* OK to retry if interrupted */ + if (errno == EINTR) + goto retry; + } + + return returnCode; +} + +int +FileSync(File file, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSync: %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + returnCode = pg_fsync(VfdCache[file].fd); + pgstat_report_wait_end(); + + return returnCode; +} + +/* + * Zero a region of the file. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. 
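+ *
+ * Illustrative call, zero-extending a file by one block (the wait-event
+ * value is a placeholder):
+ *
+ *    if (FileZero(file, offset, BLCKSZ, wait_event_info) != 0)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not extend file \"%s\": %m",
+ *                        FilePathName(file))));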
+ */ +int +FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ + int returnCode; + ssize_t written; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); + pgstat_report_wait_end(); + + if (written < 0) + return -1; + else if (written != amount) + { + /* if errno is unset, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + return -1; + } + + return 0; +} + +/* + * Try to reserve file space with posix_fallocate(). If posix_fallocate() is + * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, + * use FileZero() instead. + * + * Note that at least glibc() implements posix_fallocate() in userspace if not + * implemented by the filesystem. That's not the case for all environments + * though. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#ifdef HAVE_POSIX_FALLOCATE + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return -1; + +retry: + pgstat_report_wait_start(wait_event_info); + returnCode = posix_fallocate(VfdCache[file].fd, offset, amount); + pgstat_report_wait_end(); + + if (returnCode == 0) + return 0; + else if (returnCode == EINTR) + goto retry; + + /* for compatibility with %m printing etc */ + errno = returnCode; + + /* + * Return in cases of a "real" failure, if fallocate is not supported, + * fall through to the FileZero() backed implementation. + */ + if (returnCode != EINVAL && returnCode != EOPNOTSUPP) + return -1; +#endif + + return FileZero(file, offset, amount, wait_event_info); +} + +off_t +FileSize(File file) +{ + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSize %d (%s)", + file, VfdCache[file].fileName)); + + if (FileIsNotOpen(file)) + { + if (FileAccess(file) < 0) + return (off_t) -1; + } + + return lseek(VfdCache[file].fd, 0, SEEK_END); +} + +int +FileTruncate(File file, off_t offset, uint32 wait_event_info) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileTruncate %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + returnCode = pg_ftruncate(VfdCache[file].fd, offset); + pgstat_report_wait_end(); + + if (returnCode == 0 && VfdCache[file].fileSize > offset) + { + /* adjust our state for truncation of a temp file */ + Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT); + temporary_files_size -= VfdCache[file].fileSize - offset; + VfdCache[file].fileSize = offset; + } + + return returnCode; +} + +/* + * Return the pathname associated with an open file. + * + * The returned string points to an internal buffer, which is valid until + * the file is closed. + */ +char * +FilePathName(File file) +{ + Assert(FileIsValid(file)); + + return VfdCache[file].fileName; +} + +/* + * Return the raw file descriptor of an opened file. 
+ * + * The returned file descriptor will be valid until the file is closed, but + * there are a lot of things that can make that happen. So the caller should + * be careful not to do much of anything else before it finishes using the + * returned file descriptor. + */ +int +FileGetRawDesc(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fd; +} + +/* + * FileGetRawFlags - returns the file flags on open(2) + */ +int +FileGetRawFlags(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fileFlags; +} + +/* + * FileGetRawMode - returns the mode bitmask passed to open(2) + */ +mode_t +FileGetRawMode(File file) +{ + Assert(FileIsValid(file)); + return VfdCache[file].fileMode; +} + +/* + * Make room for another allocatedDescs[] array entry if needed and possible. + * Returns true if an array element is available. + */ +static bool +reserveAllocatedDesc(void) +{ + AllocateDesc *newDescs; + int newMax; + + /* Quick out if array already has a free slot. */ + if (numAllocatedDescs < maxAllocatedDescs) + return true; + + /* + * If the array hasn't yet been created in the current process, initialize + * it with FD_MINFREE / 3 elements. In many scenarios this is as many as + * we will ever need, anyway. We don't want to look at max_safe_fds + * immediately because set_max_safe_fds() may not have run yet. + */ + if (allocatedDescs == NULL) + { + newMax = FD_MINFREE / 3; + newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc)); + /* Out of memory already? Treat as fatal error. */ + if (newDescs == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + allocatedDescs = newDescs; + maxAllocatedDescs = newMax; + return true; + } + + /* + * Consider enlarging the array beyond the initial allocation used above. + * By the time this happens, max_safe_fds should be known accurately. + * + * We mustn't let allocated descriptors hog all the available FDs, and in + * practice we'd better leave a reasonable number of FDs for VFD use. So + * set the maximum to max_safe_fds / 3. (This should certainly be at + * least as large as the initial size, FD_MINFREE / 3, so we aren't + * tightening the restriction here.) Recall that "external" FDs are + * allowed to consume another third of max_safe_fds. + */ + newMax = max_safe_fds / 3; + if (newMax > maxAllocatedDescs) + { + newDescs = (AllocateDesc *) realloc(allocatedDescs, + newMax * sizeof(AllocateDesc)); + /* Treat out-of-memory as a non-fatal error. */ + if (newDescs == NULL) + return false; + allocatedDescs = newDescs; + maxAllocatedDescs = newMax; + return true; + } + + /* Can't enlarge allocatedDescs[] any more. */ + return false; +} + +/* + * Routines that want to use stdio (ie, FILE*) should use AllocateFile + * rather than plain fopen(). This lets fd.c deal with freeing FDs if + * necessary to open the file. When done, call FreeFile rather than fclose. + * + * Note that files that will be open for any significant length of time + * should NOT be handled this way, since they cannot share kernel file + * descriptors with other files; there is grave risk of running out of FDs + * if anyone locks down too many FDs. Most callers of this routine are + * simply reading a config file that they will read and close immediately. + * + * fd.c will automatically close all files opened with AllocateFile at + * transaction commit or abort; this prevents FD leakage if a routine + * that calls AllocateFile is terminated prematurely by ereport(ERROR). 
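+ *
+ * A typical (illustrative) pattern for a short-lived file:
+ *
+ *    FILE *f = AllocateFile(path, "r");
+ *
+ *    if (f == NULL)
+ *        ereport(ERROR,
+ *                (errcode_for_file_access(),
+ *                 errmsg("could not open file \"%s\": %m", path)));
+ *    read what is needed;
+ *    FreeFile(f);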
+ * + * Ideally this should be the *only* direct call of fopen() in the backend. + */ +FILE * +AllocateFile(const char *name, const char *mode) +{ + FILE *file; + + DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)", + numAllocatedDescs, name)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", + maxAllocatedDescs, name))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + +TryAgain: + if ((file = fopen(name, mode)) != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescFile; + desc->desc.file = file; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.file; + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Open a file with OpenTransientFilePerm() and pass default file mode for + * the fileMode parameter. + */ +int +OpenTransientFile(const char *fileName, int fileFlags) +{ + return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode); +} + +/* + * Like AllocateFile, but returns an unbuffered fd like open(2) + */ +int +OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode) +{ + int fd; + + DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)", + numAllocatedDescs, fileName)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"", + maxAllocatedDescs, fileName))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + + fd = BasicOpenFilePerm(fileName, fileFlags, fileMode); + + if (fd >= 0) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescRawFD; + desc->desc.fd = fd; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + + return fd; + } + + return -1; /* failure */ +} + +/* + * Routines that want to initiate a pipe stream should use OpenPipeStream + * rather than plain popen(). This lets fd.c deal with freeing FDs if + * necessary. When done, call ClosePipeStream rather than pclose. + * + * This function also ensures that the popen'd program is run with default + * SIGPIPE processing, rather than the SIG_IGN setting the backend normally + * uses. This ensures desirable response to, eg, closing a read pipe early. + */ +FILE * +OpenPipeStream(const char *command, const char *mode) +{ + FILE *file; + int save_errno; + + DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)", + numAllocatedDescs, command)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"", + maxAllocatedDescs, command))); + + /* Close excess kernel FDs. 
*/ + ReleaseLruFiles(); + +TryAgain: + fflush(NULL); + pqsignal(SIGPIPE, SIG_DFL); + errno = 0; + file = popen(command, mode); + save_errno = errno; + pqsignal(SIGPIPE, SIG_IGN); + errno = save_errno; + if (file != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescPipe; + desc->desc.file = file; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.file; + } + + if (errno == EMFILE || errno == ENFILE) + { + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Free an AllocateDesc of any type. + * + * The argument *must* point into the allocatedDescs[] array. + */ +static int +FreeDesc(AllocateDesc *desc) +{ + int result; + + /* Close the underlying object */ + switch (desc->kind) + { + case AllocateDescFile: + result = fclose(desc->desc.file); + break; + case AllocateDescPipe: + result = pclose(desc->desc.file); + break; + case AllocateDescDir: + result = closedir(desc->desc.dir); + break; + case AllocateDescRawFD: + result = close(desc->desc.fd); + break; + default: + elog(ERROR, "AllocateDesc kind not recognized"); + result = 0; /* keep compiler quiet */ + break; + } + + /* Compact storage in the allocatedDescs array */ + numAllocatedDescs--; + *desc = allocatedDescs[numAllocatedDescs]; + + return result; +} + +/* + * Close a file returned by AllocateFile. + * + * Note we do not check fclose's return value --- it is up to the caller + * to handle close errors. + */ +int +FreeFile(FILE *file) +{ + int i; + + DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs)); + + /* Remove file from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescFile && desc->desc.file == file) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile"); + + return fclose(file); +} + +/* + * Close a file returned by OpenTransientFile. + * + * Note we do not check close's return value --- it is up to the caller + * to handle close errors. + */ +int +CloseTransientFile(int fd) +{ + int i; + + DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs)); + + /* Remove fd from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile"); + + return close(fd); +} + +/* + * Routines that want to use (ie, DIR*) should use AllocateDir + * rather than plain opendir(). This lets fd.c deal with freeing FDs if + * necessary to open the directory, and with closing it after an elog. + * When done, call FreeDir rather than closedir. + * + * Returns NULL, with errno set, on failure. Note that failure detection + * is commonly left to the following call of ReadDir or ReadDirExtended; + * see the comments for ReadDir. + * + * Ideally this should be the *only* direct call of opendir() in the backend. 
+ */ +DIR * +AllocateDir(const char *dirname) +{ + DIR *dir; + + DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)", + numAllocatedDescs, dirname)); + + /* Can we allocate another non-virtual FD? */ + if (!reserveAllocatedDesc()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"", + maxAllocatedDescs, dirname))); + + /* Close excess kernel FDs. */ + ReleaseLruFiles(); + +TryAgain: + if ((dir = opendir(dirname)) != NULL) + { + AllocateDesc *desc = &allocatedDescs[numAllocatedDescs]; + + desc->kind = AllocateDescDir; + desc->desc.dir = dir; + desc->create_subid = GetCurrentSubTransactionId(); + numAllocatedDescs++; + return desc->desc.dir; + } + + if (errno == EMFILE || errno == ENFILE) + { + int save_errno = errno; + + ereport(LOG, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("out of file descriptors: %m; release and retry"))); + errno = 0; + if (ReleaseLruFile()) + goto TryAgain; + errno = save_errno; + } + + return NULL; +} + +/* + * Read a directory opened with AllocateDir, ereport'ing any error. + * + * This is easier to use than raw readdir() since it takes care of some + * otherwise rather tedious and error-prone manipulation of errno. Also, + * if you are happy with a generic error message for AllocateDir failure, + * you can just do + * + * dir = AllocateDir(path); + * while ((dirent = ReadDir(dir, path)) != NULL) + * process dirent; + * FreeDir(dir); + * + * since a NULL dir parameter is taken as indicating AllocateDir failed. + * (Make sure errno isn't changed between AllocateDir and ReadDir if you + * use this shortcut.) + * + * The pathname passed to AllocateDir must be passed to this routine too, + * but it is only used for error reporting. + */ +struct dirent * +ReadDir(DIR *dir, const char *dirname) +{ + return ReadDirExtended(dir, dirname, ERROR); +} + +/* + * Alternate version of ReadDir that allows caller to specify the elevel + * for any error report (whether it's reporting an initial failure of + * AllocateDir or a subsequent directory read failure). + * + * If elevel < ERROR, returns NULL after any error. With the normal coding + * pattern, this will result in falling out of the loop immediately as + * though the directory contained no (more) entries. + */ +struct dirent * +ReadDirExtended(DIR *dir, const char *dirname, int elevel) +{ + struct dirent *dent; + + /* Give a generic message for AllocateDir failure, if caller didn't */ + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + dirname))); + return NULL; + } + + errno = 0; + if ((dent = readdir(dir)) != NULL) + return dent; + + if (errno) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not read directory \"%s\": %m", + dirname))); + return NULL; +} + +/* + * Close a directory opened with AllocateDir. + * + * Returns closedir's return value (with errno set if it's not 0). + * Note we do not check the return value --- it is up to the caller + * to handle close errors if wanted. + * + * Does nothing if dir == NULL; we assume that directory open failure was + * already reported if desired. 
+ */ +int +FreeDir(DIR *dir) +{ + int i; + + /* Nothing to do if AllocateDir failed */ + if (dir == NULL) + return 0; + + DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs)); + + /* Remove dir from list of allocated dirs, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescDir && desc->desc.dir == dir) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a dir not in allocatedDescs */ + elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir"); + + return closedir(dir); +} + + +/* + * Close a pipe stream returned by OpenPipeStream. + */ +int +ClosePipeStream(FILE *file) +{ + int i; + + DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs)); + + /* Remove file from list of allocated files, if it's present */ + for (i = numAllocatedDescs; --i >= 0;) + { + AllocateDesc *desc = &allocatedDescs[i]; + + if (desc->kind == AllocateDescPipe && desc->desc.file == file) + return FreeDesc(desc); + } + + /* Only get here if someone passes us a file not in allocatedDescs */ + elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream"); + + return pclose(file); +} + +/* + * closeAllVfds + * + * Force all VFDs into the physically-closed state, so that the fewest + * possible number of kernel file descriptors are in use. There is no + * change in the logical state of the VFDs. + */ +void +closeAllVfds(void) +{ + Index i; + + if (SizeVfdCache > 0) + { + Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ + for (i = 1; i < SizeVfdCache; i++) + { + if (!FileIsNotOpen(i)) + LruDelete(i); + } + } +} + + +/* + * SetTempTablespaces + * + * Define a list (actually an array) of OIDs of tablespaces to use for + * temporary files. This list will be used until end of transaction, + * unless this function is called again before then. It is caller's + * responsibility that the passed-in array has adequate lifespan (typically + * it'd be allocated in TopTransactionContext). + * + * Some entries of the array may be InvalidOid, indicating that the current + * database's default tablespace should be used. + */ +void +SetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + Assert(numSpaces >= 0); + tempTableSpaces = tableSpaces; + numTempTableSpaces = numSpaces; + + /* + * Select a random starting point in the list. This is to minimize + * conflicts between backends that are most likely sharing the same list + * of temp tablespaces. Note that if we create multiple temp files in the + * same transaction, we'll advance circularly through the list --- this + * ensures that large temporary sort files are nicely spread across all + * available tablespaces. + */ + if (numSpaces > 1) + nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state, + 0, numSpaces - 1); + else + nextTempTableSpace = 0; +} + +/* + * TempTablespacesAreSet + * + * Returns true if SetTempTablespaces has been called in current transaction. + * (This is just so that tablespaces.c doesn't need its own per-transaction + * state.) + */ +bool +TempTablespacesAreSet(void) +{ + return (numTempTableSpaces >= 0); +} + +/* + * GetTempTablespaces + * + * Populate an array with the OIDs of the tablespaces that should be used for + * temporary files. (Some entries may be InvalidOid, indicating that the + * current database's default tablespace should be used.) At most numSpaces + * entries will be filled. + * Returns the number of OIDs that were copied into the output array. 
+ */ +int +GetTempTablespaces(Oid *tableSpaces, int numSpaces) +{ + int i; + + Assert(TempTablespacesAreSet()); + for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i) + tableSpaces[i] = tempTableSpaces[i]; + + return i; +} + +/* + * GetNextTempTableSpace + * + * Select the next temp tablespace to use. A result of InvalidOid means + * to use the current database's default tablespace. + */ +Oid +GetNextTempTableSpace(void) +{ + if (numTempTableSpaces > 0) + { + /* Advance nextTempTableSpace counter with wraparound */ + if (++nextTempTableSpace >= numTempTableSpaces) + nextTempTableSpace = 0; + return tempTableSpaces[nextTempTableSpace]; + } + return InvalidOid; +} + + +/* + * AtEOSubXact_Files + * + * Take care of subtransaction commit/abort. At abort, we close temp files + * that the subtransaction may have opened. At commit, we reassign the + * files that were opened to the parent subtransaction. + */ +void +AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid, + SubTransactionId parentSubid) +{ + Index i; + + for (i = 0; i < numAllocatedDescs; i++) + { + if (allocatedDescs[i].create_subid == mySubid) + { + if (isCommit) + allocatedDescs[i].create_subid = parentSubid; + else + { + /* have to recheck the item after FreeDesc (ugly) */ + FreeDesc(&allocatedDescs[i--]); + } + } + } +} + +/* + * AtEOXact_Files + * + * This routine is called during transaction commit or abort. All still-open + * per-transaction temporary file VFDs are closed, which also causes the + * underlying files to be deleted (although they should've been closed already + * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are + * closed. We also forget any transaction-local temp tablespace list. + * + * The isCommit flag is used only to decide whether to emit warnings about + * unclosed files. + */ +void +AtEOXact_Files(bool isCommit) +{ + CleanupTempFiles(isCommit, false); + tempTableSpaces = NULL; + numTempTableSpaces = -1; +} + +/* + * BeforeShmemExit_Files + * + * before_shmem_exit hook to clean up temp files during backend shutdown. + * Here, we want to clean up *all* temp files including interXact ones. + */ +static void +BeforeShmemExit_Files(int code, Datum arg) +{ + CleanupTempFiles(false, true); + + /* prevent further temp files from being created */ +#ifdef USE_ASSERT_CHECKING + temporary_files_allowed = false; +#endif +} + +/* + * Close temporary files and delete their underlying files. + * + * isCommit: if true, this is normal transaction commit, and we don't + * expect any remaining files; warn if there are some. + * + * isProcExit: if true, this is being called as the backend process is + * exiting. If that's the case, we should remove all temporary files; if + * that's not the case, we are being called for transaction commit/abort + * and should only remove transaction-local temp files. In either case, + * also clean up "allocated" stdio files, dirs and fds. + */ +static void +CleanupTempFiles(bool isCommit, bool isProcExit) +{ + Index i; + + /* + * Careful here: at proc_exit we need extra cleanup, not just + * xact_temporary files. + */ + if (isProcExit || have_xact_temporary_files) + { + Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */ + for (i = 1; i < SizeVfdCache; i++) + { + unsigned short fdstate = VfdCache[i].fdstate; + + if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) && + VfdCache[i].fileName != NULL) + { + /* + * If we're in the process of exiting a backend process, close + * all temporary files. 
Otherwise, only close temporary files + * local to the current transaction. They should be closed by + * the ResourceOwner mechanism already, so this is just a + * debugging cross-check. + */ + if (isProcExit) + FileClose(i); + else if (fdstate & FD_CLOSE_AT_EOXACT) + { + elog(WARNING, + "temporary file %s not closed at end-of-transaction", + VfdCache[i].fileName); + FileClose(i); + } + } + } + + have_xact_temporary_files = false; + } + + /* Complain if any allocated files remain open at commit. */ + if (isCommit && numAllocatedDescs > 0) + elog(WARNING, "%d temporary files and directories not closed at end-of-transaction", + numAllocatedDescs); + + /* Clean up "allocated" stdio files, dirs and fds. */ + while (numAllocatedDescs > 0) + FreeDesc(&allocatedDescs[0]); +} + + +/* + * Remove temporary and temporary relation files left over from a prior + * postmaster session + * + * This should be called during postmaster startup. It will forcibly + * remove any leftover files created by OpenTemporaryFile and any leftover + * temporary relation files created by mdcreate. + * + * During post-backend-crash restart cycle, this routine is called when + * remove_temp_files_after_crash GUC is enabled. Multiple crashes while + * queries are using temp files could result in useless storage usage that can + * only be reclaimed by a service restart. The argument against enabling it is + * that someone might want to examine the temporary files for debugging + * purposes. This does however mean that OpenTemporaryFile had better allow for + * collision with an existing temp file name. + * + * NOTE: this function and its subroutines generally report syscall failures + * with ereport(LOG) and keep going. Removing temp files is not so critical + * that we should fail to start the database when we can't do it. + */ +void +RemovePgTempFiles(void) +{ + char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)]; + DIR *spc_dir; + struct dirent *spc_de; + + /* + * First process temp files in pg_default ($PGDATA/base) + */ + snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR); + RemovePgTempFilesInDir(temp_path, true, false); + RemovePgTempRelationFiles("base"); + + /* + * Cycle through temp directories for all non-default tablespaces. + */ + spc_dir = AllocateDir("pg_tblspc"); + + while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL) + { + if (strcmp(spc_de->d_name, ".") == 0 || + strcmp(spc_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR); + RemovePgTempFilesInDir(temp_path, true, false); + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); + RemovePgTempRelationFiles(temp_path); + } + + FreeDir(spc_dir); + + /* + * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of + * DataDir as well. However, that is *not* cleaned here because doing so + * would create a race condition. It's done separately, earlier in + * postmaster startup. + */ +} + +/* + * Process one pgsql_tmp directory for RemovePgTempFiles. + * + * If missing_ok is true, it's all right for the named directory to not exist. + * Any other problem results in a LOG message. (missing_ok should be true at + * the top level, since pgsql_tmp directories are not created until needed.) 
+ *
+ * At the top level, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked. When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ *
+ * (These two flags could be replaced by one, but it seems clearer to keep
+ * them separate.)
+ */
+void
+RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
+{
+    DIR *temp_dir;
+    struct dirent *temp_de;
+    char rm_path[MAXPGPATH * 2];
+
+    temp_dir = AllocateDir(tmpdirname);
+
+    if (temp_dir == NULL && errno == ENOENT && missing_ok)
+        return;
+
+    while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
+    {
+        if (strcmp(temp_de->d_name, ".") == 0 ||
+            strcmp(temp_de->d_name, "..") == 0)
+            continue;
+
+        snprintf(rm_path, sizeof(rm_path), "%s/%s",
+                 tmpdirname, temp_de->d_name);
+
+        if (unlink_all ||
+            strncmp(temp_de->d_name,
+                    PG_TEMP_FILE_PREFIX,
+                    strlen(PG_TEMP_FILE_PREFIX)) == 0)
+        {
+            PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
+
+            if (type == PGFILETYPE_ERROR)
+                continue;
+            else if (type == PGFILETYPE_DIR)
+            {
+                /* recursively remove contents, then directory itself */
+                RemovePgTempFilesInDir(rm_path, false, true);
+
+                if (rmdir(rm_path) < 0)
+                    ereport(LOG,
+                            (errcode_for_file_access(),
+                             errmsg("could not remove directory \"%s\": %m",
+                                    rm_path)));
+            }
+            else
+            {
+                if (unlink(rm_path) < 0)
+                    ereport(LOG,
+                            (errcode_for_file_access(),
+                             errmsg("could not remove file \"%s\": %m",
+                                    rm_path)));
+            }
+        }
+        else
+            ereport(LOG,
+                    (errmsg("unexpected file found in temporary-files directory: \"%s\"",
+                            rm_path)));
+    }
+
+    FreeDir(temp_dir);
+}
+
+/* Process one tablespace directory, look for per-DB subdirectories */
+static void
+RemovePgTempRelationFiles(const char *tsdirname)
+{
+    DIR *ts_dir;
+    struct dirent *de;
+    char dbspace_path[MAXPGPATH * 2];
+
+    ts_dir = AllocateDir(tsdirname);
+
+    while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
+    {
+        /*
+         * We're only interested in the per-database directories, which have
+         * numeric names. Note that this code will also (properly) ignore "."
+         * and "..".
+         */
+        if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+            continue;
+
+        snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+                 tsdirname, de->d_name);
+        RemovePgTempRelationFilesInDbspace(dbspace_path);
+    }
+
+    FreeDir(ts_dir);
+}
+
+/* Process one per-dbspace directory for RemovePgTempRelationFiles */
+static void
+RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
+{
+    DIR *dbspace_dir;
+    struct dirent *de;
+    char rm_path[MAXPGPATH * 2];
+
+    dbspace_dir = AllocateDir(dbspacedirname);
+
+    while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
+    {
+        if (!looks_like_temp_rel_name(de->d_name))
+            continue;
+
+        snprintf(rm_path, sizeof(rm_path), "%s/%s",
+                 dbspacedirname, de->d_name);
+
+        if (unlink(rm_path) < 0)
+            ereport(LOG,
+                    (errcode_for_file_access(),
+                     errmsg("could not remove file \"%s\": %m",
+                            rm_path)));
+    }
+
+    FreeDir(dbspace_dir);
+}
+
+/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
+bool
+looks_like_temp_rel_name(const char *name)
+{
+    int pos;
+    int savepos;
+
+    /* Must start with "t". */
+    if (name[0] != 't')
+        return false;
+
+    /* Followed by a non-empty string of digits and then an underscore. */
+    for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
+        ;
+    if (pos == 1 || name[pos] != '_')
+        return false;
+
+    /* Followed by another nonempty string of digits.
*/ + for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos) + ; + if (savepos == pos) + return false; + + /* We might have _forkname or .segment or both. */ + if (name[pos] == '_') + { + int forkchar = forkname_chars(&name[pos + 1], NULL); + + if (forkchar <= 0) + return false; + pos += forkchar + 1; + } + if (name[pos] == '.') + { + int segchar; + + for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) + ; + if (segchar <= 1) + return false; + pos += segchar; + } + + /* Now we should be at the end. */ + if (name[pos] != '\0') + return false; + return true; +} + +#ifdef HAVE_SYNCFS +static void +do_syncfs(const char *path) +{ + int fd; + + ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s", + path); + + fd = OpenTransientFile(path, O_RDONLY); + if (fd < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + return; + } + if (syncfs(fd) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not synchronize file system for file \"%s\": %m", path))); + CloseTransientFile(fd); +} +#endif + +/* + * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for + * all potential filesystem, depending on recovery_init_sync_method setting. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_wal and immediately under pg_tblspc. + * Other symlinks are presumed to point at files we're not responsible + * for fsyncing, and might not have privileges to write at all. + * + * Errors are logged but not considered fatal; that's because this is used + * only during database startup, to deal with the possibility that there are + * issued-but-unsynced writes pending against the data directory. We want to + * ensure that such writes reach disk before anything that's done in the new + * run. However, aborting on error would result in failure to start for + * harmless cases such as read-only files in the data directory, and that's + * not good either. + * + * Note that if we previously crashed due to a PANIC on fsync(), we'll be + * rewriting all changes again during recovery. + * + * Note we assume we're chdir'd into PGDATA to begin with. + */ +void +SyncDataDirectory(void) +{ + bool xlog_is_symlink; + + /* We can skip this whole thing if fsync is disabled. */ + if (!enableFsync) + return; + + /* + * If pg_wal is a symlink, we'll need to recurse into it separately, + * because the first walkdir below will ignore it. + */ + xlog_is_symlink = false; + + { + struct stat st; + + if (lstat("pg_wal", &st) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + "pg_wal"))); + else if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; + } + +#ifdef HAVE_SYNCFS + if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS) + { + DIR *dir; + struct dirent *de; + + /* + * On Linux, we don't have to open every single file one by one. We + * can use syncfs() to sync whole filesystems. We only expect + * filesystem boundaries to exist where we tolerate symlinks, namely + * pg_wal and the tablespaces, so we call syncfs() for each of those + * directories. + */ + + /* Prepare to report progress syncing the data directory via syncfs. */ + begin_startup_progress_phase(); + + /* Sync the top level pgdata directory. */ + do_syncfs("."); + /* If any tablespaces are configured, sync each of those. 
*/ + dir = AllocateDir("pg_tblspc"); + while ((de = ReadDirExtended(dir, "pg_tblspc", LOG))) + { + char path[MAXPGPATH]; + + if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) + continue; + + snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name); + do_syncfs(path); + } + FreeDir(dir); + /* If pg_wal is a symlink, process that too. */ + if (xlog_is_symlink) + do_syncfs("pg_wal"); + return; + } +#endif /* !HAVE_SYNCFS */ + +#ifdef PG_FLUSH_DATA_WORKS + /* Prepare to report progress of the pre-fsync phase. */ + begin_startup_progress_phase(); + + /* + * If possible, hint to the kernel that we're soon going to fsync the data + * directory and its contents. Errors in this step are even less + * interesting than normal, so log them only at DEBUG1. + */ + walkdir(".", pre_sync_fname, false, DEBUG1); + if (xlog_is_symlink) + walkdir("pg_wal", pre_sync_fname, false, DEBUG1); + walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1); +#endif + + /* Prepare to report progress syncing the data directory via fsync. */ + begin_startup_progress_phase(); + + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially processing + * pg_wal if it's a symlink, pg_tblspc has to be visited separately with + * process_symlinks = true. Note that if there are any plain directories + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case + * so we don't worry about optimizing it. + */ + walkdir(".", datadir_fsync_fname, false, LOG); + if (xlog_is_symlink) + walkdir("pg_wal", datadir_fsync_fname, false, LOG); + walkdir("pg_tblspc", datadir_fsync_fname, true, LOG); +} + +/* + * walkdir: recursively walk a directory, applying the action to each + * regular file and directory (including the named directory itself). + * + * If process_symlinks is true, the action and recursion are also applied + * to regular files and directories that are pointed to by symlinks in the + * given directory; otherwise symlinks are ignored. Symlinks are always + * ignored in subdirectories, ie we intentionally don't pass down the + * process_symlinks flag to recursive calls. + * + * Errors are reported at level elevel, which might be ERROR or less. + * + * See also walkdir in file_utils.c, which is a frontend version of this + * logic. + */ +static void +walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel) +{ + DIR *dir; + struct dirent *de; + + dir = AllocateDir(path); + + while ((de = ReadDirExtended(dir, path, elevel)) != NULL) + { + char subpath[MAXPGPATH * 2]; + + CHECK_FOR_INTERRUPTS(); + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name); + + switch (get_dirent_type(subpath, de, process_symlinks, elevel)) + { + case PGFILETYPE_REG: + (*action) (subpath, false, elevel); + break; + case PGFILETYPE_DIR: + walkdir(subpath, action, false, elevel); + break; + default: + + /* + * Errors are already reported directly by get_dirent_type(), + * and any remaining symlinks and unknown file types are + * ignored. + */ + break; + } + } + + FreeDir(dir); /* we ignore any error here */ + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. However, skip this if AllocateDir failed; the action function + * might not be robust against that. 
+ */ + if (dir) + (*action) (path, true, elevel); +} + + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * Ignores errors trying to open unreadable files, and logs other errors at a + * caller-specified level. + */ +#ifdef PG_FLUSH_DATA_WORKS + +static void +pre_sync_fname(const char *fname, bool isdir, int elevel) +{ + int fd; + + /* Don't try to flush directories, it'll likely just fail */ + if (isdir) + return; + + ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s", + fname); + + fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY); + + if (fd < 0) + { + if (errno == EACCES) + return; + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } + + /* + * pg_flush_data() ignores errors, which is ok because this is only a + * hint. + */ + pg_flush_data(fd, 0, 0); + + if (CloseTransientFile(fd) != 0) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fname))); +} + +#endif /* PG_FLUSH_DATA_WORKS */ + +static void +datadir_fsync_fname(const char *fname, bool isdir, int elevel) +{ + ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s", + fname); + + /* + * We want to silently ignoring errors about unreadable files. Pass that + * desire on to fsync_fname_ext(). + */ + fsync_fname_ext(fname, isdir, true, elevel); +} + +static void +unlink_if_exists_fname(const char *fname, bool isdir, int elevel) +{ + if (isdir) + { + if (rmdir(fname) != 0 && errno != ENOENT) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not remove directory \"%s\": %m", fname))); + } + else + { + /* Use PathNameDeleteTemporaryFile to report filesize */ + PathNameDeleteTemporaryFile(fname, false); + } +} + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * If ignore_perm is true, ignore errors upon trying to open unreadable + * files. Logs other errors at a caller-specified level. + * + * Returns 0 if the operation succeeded, -1 otherwise. + */ +int +fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + fd = OpenTransientFile(fname, flags); + + /* + * Some OSs don't allow us to open directories at all (Windows returns + * EACCES), just ignore the error in that case. If desired also silently + * ignoring errors about unreadable files. Log others. + */ + if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) + return 0; + else if (fd < 0 && ignore_perm && errno == EACCES) + return 0; + else if (fd < 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return -1; + } + + returncode = pg_fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be logged. 
+ */ + if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL))) + { + int save_errno; + + /* close file upon error, might not be in transaction context */ + save_errno = errno; + (void) CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + return -1; + } + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", fname))); + return -1; + } + + return 0; +} + +/* + * fsync_parent_path -- fsync the parent path of a file or directory + * + * This is aimed at making file operations persistent on disk in case of + * an OS crash or power failure. + */ +static int +fsync_parent_path(const char *fname, int elevel) +{ + char parentpath[MAXPGPATH]; + + strlcpy(parentpath, fname, MAXPGPATH); + get_parent_directory(parentpath); + + /* + * get_parent_directory() returns an empty string if the input argument is + * just a file name (see comments in path.c), so handle that as being the + * current directory. + */ + if (strlen(parentpath) == 0) + strlcpy(parentpath, ".", MAXPGPATH); + + if (fsync_fname_ext(parentpath, true, false, elevel) != 0) + return -1; + + return 0; +} + +/* + * Create a PostgreSQL data sub-directory + * + * The data directory itself, and most of its sub-directories, are created at + * initdb time, but we do have some occasions when we create directories in + * the backend (CREATE TABLESPACE, for example). In those cases, we want to + * make sure that those directories are created consistently. Today, that means + * making sure that the created directory has the correct permissions, which is + * what pg_dir_create_mode tracks for us. + * + * Note that we also set the umask() based on what we understand the correct + * permissions to be (see file_perm.c). + * + * For permissions other than the default, mkdir() can be used directly, but + * be sure to consider carefully such cases -- a sub-directory with incorrect + * permissions in a PostgreSQL data directory could cause backups and other + * processes to fail. + */ +int +MakePGDirectory(const char *directoryName) +{ + return mkdir(directoryName, pg_dir_create_mode); +} + +/* + * Return the passed-in error level, or PANIC if data_sync_retry is off. + * + * Failure to fsync any data file is cause for immediate panic, unless + * data_sync_retry is enabled. Data may have been written to the operating + * system and removed from our buffer pool already, and if we are running on + * an operating system that forgets dirty data on write-back failure, there + * may be only one copy of the data remaining: in the WAL. A later attempt to + * fsync again might falsely report success. Therefore we must not allow any + * further checkpoints to be attempted. data_sync_retry can in theory be + * enabled on systems known not to drop dirty buffered data on write-back + * failure (with the likely outcome that checkpoints will continue to fail + * until the underlying problem is fixed). + * + * Any code that reports a failure from fsync() or related functions should + * filter the error level with this function. + */ +int +data_sync_elevel(int elevel) +{ + return data_sync_retry ? 
elevel : PANIC;
+}
+
+bool
+check_debug_io_direct(char **newval, void **extra, GucSource source)
+{
+    bool result = true;
+    int flags;
+
+#if PG_O_DIRECT == 0
+    if (strcmp(*newval, "") != 0)
+    {
+        GUC_check_errdetail("debug_io_direct is not supported on this platform.");
+        result = false;
+    }
+    flags = 0;
+#else
+    List *elemlist;
+    ListCell *l;
+    char *rawstring;
+
+    /* Need a modifiable copy of string */
+    rawstring = pstrdup(*newval);
+
+    if (!SplitGUCList(rawstring, ',', &elemlist))
+    {
+        GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
+                            "debug_io_direct");
+        pfree(rawstring);
+        list_free(elemlist);
+        return false;
+    }
+
+    flags = 0;
+    foreach(l, elemlist)
+    {
+        char *item = (char *) lfirst(l);
+
+        if (pg_strcasecmp(item, "data") == 0)
+            flags |= IO_DIRECT_DATA;
+        else if (pg_strcasecmp(item, "wal") == 0)
+            flags |= IO_DIRECT_WAL;
+        else if (pg_strcasecmp(item, "wal_init") == 0)
+            flags |= IO_DIRECT_WAL_INIT;
+        else
+        {
+            GUC_check_errdetail("invalid option \"%s\"", item);
+            result = false;
+            break;
+        }
+    }
+
+    /*
+     * It's possible to configure block sizes smaller than our assumed I/O
+     * alignment size, which could result in invalid I/O requests.
+     */
+#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
+    if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
+    {
+        GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
+        result = false;
+    }
+#endif
+#if BLCKSZ < PG_IO_ALIGN_SIZE
+    if (result && (flags & IO_DIRECT_DATA))
+    {
+        GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small");
+        result = false;
+    }
+#endif
+
+    pfree(rawstring);
+    list_free(elemlist);
+#endif
+
+    if (!result)
+        return result;
+
+    /* Save the flags in *extra, for use by assign_debug_io_direct */
+    *extra = guc_malloc(ERROR, sizeof(int));
+    *((int *) *extra) = flags;
+
+    return result;
+}
+
+extern void
+assign_debug_io_direct(const char *newval, void *extra)
+{
+    int *flags = (int *) extra;
+
+    io_direct_flags = *flags;
+}
diff --git a/src/backend/storage/file/fileset.c b/src/backend/storage/file/fileset.c
new file mode 100644
index 0000000..e9951b0
--- /dev/null
+++ b/src/backend/storage/file/fileset.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * fileset.c
+ *	  Management of named temporary files.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/fileset.c
+ *
+ * FileSets provide a temporary namespace (think directory) so that files can
+ * be discovered by name.
+ *
+ * FileSets can be used by backends when the temporary files need to be
+ * opened/closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/fileset.h"
+#include "utils/builtins.h"
+
+static void FileSetPath(char *path, FileSet *fileset, Oid tablespace);
+static void FilePath(char *path, FileSet *fileset, const char *name);
+static Oid ChooseTablespace(const FileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files.
This API can be used by shared + * fileset as well as if the temporary files are used only by single backend + * but the files need to be opened and closed multiple times and also the + * underlying files need to survive across transactions. + * + * The callers are expected to explicitly remove such files by using + * FileSetDelete/FileSetDeleteAll. + * + * Files will be distributed over the tablespaces configured in + * temp_tablespaces. + * + * Under the covers the set is one or more directories which will eventually + * be deleted. + */ +void +FileSetInit(FileSet *fileset) +{ + static uint32 counter = 0; + + fileset->creator_pid = MyProcPid; + fileset->number = counter; + counter = (counter + 1) % INT_MAX; + + /* Capture the tablespace OIDs so that all backends agree on them. */ + PrepareTempTablespaces(); + fileset->ntablespaces = + GetTempTablespaces(&fileset->tablespaces[0], + lengthof(fileset->tablespaces)); + if (fileset->ntablespaces == 0) + { + /* If the GUC is empty, use current database's default tablespace */ + fileset->tablespaces[0] = MyDatabaseTableSpace; + fileset->ntablespaces = 1; + } + else + { + int i; + + /* + * An entry of InvalidOid means use the default tablespace for the + * current database. Replace that now, to be sure that all users of + * the FileSet agree on what to do. + */ + for (i = 0; i < fileset->ntablespaces; i++) + { + if (fileset->tablespaces[i] == InvalidOid) + fileset->tablespaces[i] = MyDatabaseTableSpace; + } + } +} + +/* + * Create a new file in the given set. + */ +File +FileSetCreate(FileSet *fileset, const char *name) +{ + char path[MAXPGPATH]; + File file; + + FilePath(path, fileset, name); + file = PathNameCreateTemporaryFile(path, false); + + /* If we failed, see if we need to create the directory on demand. */ + if (file <= 0) + { + char tempdirpath[MAXPGPATH]; + char filesetpath[MAXPGPATH]; + Oid tablespace = ChooseTablespace(fileset, name); + + TempTablespacePath(tempdirpath, tablespace); + FileSetPath(filesetpath, fileset, tablespace); + PathNameCreateTemporaryDir(tempdirpath, filesetpath); + file = PathNameCreateTemporaryFile(path, true); + } + + return file; +} + +/* + * Open a file that was created with FileSetCreate() */ +File +FileSetOpen(FileSet *fileset, const char *name, int mode) +{ + char path[MAXPGPATH]; + File file; + + FilePath(path, fileset, name); + file = PathNameOpenTemporaryFile(path, mode); + + return file; +} + +/* + * Delete a file that was created with FileSetCreate(). + * + * Return true if the file existed, false if didn't. + */ +bool +FileSetDelete(FileSet *fileset, const char *name, + bool error_on_failure) +{ + char path[MAXPGPATH]; + + FilePath(path, fileset, name); + + return PathNameDeleteTemporaryFile(path, error_on_failure); +} + +/* + * Delete all files in the set. + */ +void +FileSetDeleteAll(FileSet *fileset) +{ + char dirpath[MAXPGPATH]; + int i; + + /* + * Delete the directory we created in each tablespace. Doesn't fail + * because we use this in error cleanup paths, but can generate LOG + * message on IO error. + */ + for (i = 0; i < fileset->ntablespaces; ++i) + { + FileSetPath(dirpath, fileset, fileset->tablespaces[i]); + PathNameDeleteTemporaryDir(dirpath); + } +} + +/* + * Build the path for the directory holding the files backing a FileSet in a + * given tablespace. 
+ */
+static void
+FileSetPath(char *path, FileSet *fileset, Oid tablespace)
+{
+    char tempdirpath[MAXPGPATH];
+
+    TempTablespacePath(tempdirpath, tablespace);
+    snprintf(path, MAXPGPATH, "%s/%s%lu.%u.fileset",
+             tempdirpath, PG_TEMP_FILE_PREFIX,
+             (unsigned long) fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Sorting has to determine which tablespace a given temporary file belongs in.
+ */
+static Oid
+ChooseTablespace(const FileSet *fileset, const char *name)
+{
+    uint32 hash = hash_any((const unsigned char *) name, strlen(name));
+
+    return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a FileSet.
+ */
+static void
+FilePath(char *path, FileSet *fileset, const char *name)
+{
+    char dirpath[MAXPGPATH];
+
+    FileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+    snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
diff --git a/src/backend/storage/file/meson.build b/src/backend/storage/file/meson.build
new file mode 100644
index 0000000..e7fe850
--- /dev/null
+++ b/src/backend/storage/file/meson.build
@@ -0,0 +1,10 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+  'buffile.c',
+  'copydir.c',
+  'fd.c',
+  'fileset.c',
+  'reinit.c',
+  'sharedfileset.c',
+)
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
new file mode 100644
index 0000000..fb55371
--- /dev/null
+++ b/src/backend/storage/file/reinit.c
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.c
+ *	  Reinitialization of unlogged relations
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/file/reinit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "common/relpath.h"
+#include "postmaster/startup.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+#include "storage/reinit.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
+                                                  int op);
+static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
+                                               int op);
+
+typedef struct
+{
+    Oid reloid;                 /* hash key */
+} unlogged_relation_entry;
+
+/*
+ * Reset unlogged relations from before the last restart.
+ *
+ * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
+ * relation with an "init" fork, except for the "init" fork itself.
+ *
+ * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
+ * fork.
+ */
+void
+ResetUnloggedRelations(int op)
+{
+    char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
+    DIR *spc_dir;
+    struct dirent *spc_de;
+    MemoryContext tmpctx,
+                oldctx;
+
+    /* Log it. */
+    elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
+         (op & UNLOGGED_RELATION_CLEANUP) != 0,
+         (op & UNLOGGED_RELATION_INIT) != 0);
+
+    /*
+     * Just to be sure we don't leak any memory, let's create a temporary
+     * memory context for this operation.
+     */
+    tmpctx = AllocSetContextCreate(CurrentMemoryContext,
+                                   "ResetUnloggedRelations",
+                                   ALLOCSET_DEFAULT_SIZES);
+    oldctx = MemoryContextSwitchTo(tmpctx);
+
+    /* Prepare to report progress resetting unlogged relations.
*/ + begin_startup_progress_phase(); + + /* + * First process unlogged files in pg_default ($PGDATA/base) + */ + ResetUnloggedRelationsInTablespaceDir("base", op); + + /* + * Cycle through directories for all non-default tablespaces. + */ + spc_dir = AllocateDir("pg_tblspc"); + + while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL) + { + if (strcmp(spc_de->d_name, ".") == 0 || + strcmp(spc_de->d_name, "..") == 0) + continue; + + snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s", + spc_de->d_name, TABLESPACE_VERSION_DIRECTORY); + ResetUnloggedRelationsInTablespaceDir(temp_path, op); + } + + FreeDir(spc_dir); + + /* + * Restore memory context. + */ + MemoryContextSwitchTo(oldctx); + MemoryContextDelete(tmpctx); +} + +/* + * Process one tablespace directory for ResetUnloggedRelations + */ +static void +ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op) +{ + DIR *ts_dir; + struct dirent *de; + char dbspace_path[MAXPGPATH * 2]; + + ts_dir = AllocateDir(tsdirname); + + /* + * If we get ENOENT on a tablespace directory, log it and return. This + * can happen if a previous DROP TABLESPACE crashed between removing the + * tablespace directory and removing the symlink in pg_tblspc. We don't + * really want to prevent database startup in that scenario, so let it + * pass instead. Any other type of error will be reported by ReadDir + * (causing a startup failure). + */ + if (ts_dir == NULL && errno == ENOENT) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", + tsdirname))); + return; + } + + while ((de = ReadDir(ts_dir, tsdirname)) != NULL) + { + /* + * We're only interested in the per-database directories, which have + * numeric names. Note that this code will also (properly) ignore "." + * and "..". + */ + if (strspn(de->d_name, "0123456789") != strlen(de->d_name)) + continue; + + snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s", + tsdirname, de->d_name); + + if (op & UNLOGGED_RELATION_INIT) + ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s", + dbspace_path); + else if (op & UNLOGGED_RELATION_CLEANUP) + ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s", + dbspace_path); + + ResetUnloggedRelationsInDbspaceDir(dbspace_path, op); + } + + FreeDir(ts_dir); +} + +/* + * Process one per-dbspace directory for ResetUnloggedRelations + */ +static void +ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op) +{ + DIR *dbspace_dir; + struct dirent *de; + char rm_path[MAXPGPATH * 2]; + + /* Caller must specify at least one operation. */ + Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0); + + /* + * Cleanup is a two-pass operation. First, we go through and identify all + * the files with init forks. Then, we go through again and nuke + * everything with the same OID except the init fork. + */ + if ((op & UNLOGGED_RELATION_CLEANUP) != 0) + { + HTAB *hash; + HASHCTL ctl; + + /* + * It's possible that someone could create a ton of unlogged relations + * in the same database & tablespace, so we'd better use a hash table + * rather than an array or linked list to keep track of which files + * need to be reset. Otherwise, this cleanup operation would be + * O(n^2). 
+ */ + ctl.keysize = sizeof(Oid); + ctl.entrysize = sizeof(unlogged_relation_entry); + ctl.hcxt = CurrentMemoryContext; + hash = hash_create("unlogged relation OIDs", 32, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Scan the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int relnumchars; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* + * Put the OID portion of the name into the hash table, if it + * isn't already. + */ + ent.reloid = atooid(de->d_name); + (void) hash_search(hash, &ent, HASH_ENTER, NULL); + } + + /* Done with the first pass. */ + FreeDir(dbspace_dir); + + /* + * If we didn't find any init forks, there's no point in continuing; + * we can bail out now. + */ + if (hash_get_num_entries(hash) == 0) + { + hash_destroy(hash); + return; + } + + /* + * Now, make a second pass and remove anything that matches. + */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int relnumchars; + unlogged_relation_entry ent; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars, + &forkNum)) + continue; + + /* We never remove the init fork. */ + if (forkNum == INIT_FORKNUM) + continue; + + /* + * See whether the OID portion of the name shows up in the hash + * table. If so, nuke it! + */ + ent.reloid = atooid(de->d_name); + if (hash_search(hash, &ent, HASH_FIND, NULL)) + { + snprintf(rm_path, sizeof(rm_path), "%s/%s", + dbspacedirname, de->d_name); + if (unlink(rm_path) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", + rm_path))); + else + elog(DEBUG2, "unlinked file \"%s\"", rm_path); + } + } + + /* Cleanup is complete. */ + FreeDir(dbspace_dir); + hash_destroy(hash); + } + + /* + * Initialization happens after cleanup is complete: we copy each init + * fork file to the corresponding main fork file. Note that if we are + * asked to do both cleanup and init, we may never get here: if the + * cleanup code determines that there are no init forks in this dbspace, + * it will return before we get to this point. + */ + if ((op & UNLOGGED_RELATION_INIT) != 0) + { + /* Scan the directory. */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int relnumchars; + char relnumbuf[OIDCHARS + 1]; + char srcpath[MAXPGPATH * 2]; + char dstpath[MAXPGPATH]; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* Construct source pathname. */ + snprintf(srcpath, sizeof(srcpath), "%s/%s", + dbspacedirname, de->d_name); + + /* Construct destination pathname. */ + memcpy(relnumbuf, de->d_name, relnumchars); + relnumbuf[relnumchars] = '\0'; + snprintf(dstpath, sizeof(dstpath), "%s/%s%s", + dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 + + strlen(forkNames[INIT_FORKNUM])); + + /* OK, we're ready to perform the actual copy. 
*/ + elog(DEBUG2, "copying %s to %s", srcpath, dstpath); + copy_file(srcpath, dstpath); + } + + FreeDir(dbspace_dir); + + /* + * copy_file() above has already called pg_flush_data() on the files + * it created. Now we need to fsync those files, because a checkpoint + * won't do it for us while we're in recovery. We do this in a + * separate pass to allow the kernel to perform all the flushes + * (especially the metadata ones) at once. + */ + dbspace_dir = AllocateDir(dbspacedirname); + while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL) + { + ForkNumber forkNum; + int relnumchars; + char relnumbuf[OIDCHARS + 1]; + char mainpath[MAXPGPATH]; + + /* Skip anything that doesn't look like a relation data file. */ + if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars, + &forkNum)) + continue; + + /* Also skip it unless this is the init fork. */ + if (forkNum != INIT_FORKNUM) + continue; + + /* Construct main fork pathname. */ + memcpy(relnumbuf, de->d_name, relnumchars); + relnumbuf[relnumchars] = '\0'; + snprintf(mainpath, sizeof(mainpath), "%s/%s%s", + dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 + + strlen(forkNames[INIT_FORKNUM])); + + fsync_fname(mainpath, false); + } + + FreeDir(dbspace_dir); + + /* + * Lastly, fsync the database directory itself, ensuring the + * filesystem remembers the file creations and deletions we've done. + * We don't bother with this during a call that does only + * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we + * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step + * too at the next startup attempt. + */ + fsync_fname(dbspacedirname, true); + } +} + +/* + * Basic parsing of putative relation filenames. + * + * This function returns true if the file appears to be in the correct format + * for a non-temporary relation and false otherwise. + * + * NB: If this function returns true, the caller is entitled to assume that + * *relnumchars has been set to a value no more than OIDCHARS, and thus + * that a buffer of OIDCHARS+1 characters is sufficient to hold the + * RelFileNumber portion of the filename. This is critical to protect against + * a possible buffer overrun. + */ +bool +parse_filename_for_nontemp_relation(const char *name, int *relnumchars, + ForkNumber *fork) +{ + int pos; + + /* Look for a non-empty string of digits (that isn't too long). */ + for (pos = 0; isdigit((unsigned char) name[pos]); ++pos) + ; + if (pos == 0 || pos > OIDCHARS) + return false; + *relnumchars = pos; + + /* Check for a fork name. */ + if (name[pos] != '_') + *fork = MAIN_FORKNUM; + else + { + int forkchar; + + forkchar = forkname_chars(&name[pos + 1], fork); + if (forkchar <= 0) + return false; + pos += forkchar + 1; + } + + /* Check for a segment number. */ + if (name[pos] == '.') + { + int segchar; + + for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar) + ; + if (segchar <= 1) + return false; + pos += segchar; + } + + /* Now we should be at the end. */ + if (name[pos] != '\0') + return false; + return true; +} diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c new file mode 100644 index 0000000..a13c8ed --- /dev/null +++ b/src/backend/storage/file/sharedfileset.c @@ -0,0 +1,120 @@ +/*------------------------------------------------------------------------- + * + * sharedfileset.c + * Shared temporary file management. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/file/sharedfileset.c + * + * SharedFileSets provide a temporary namespace (think directory) so that + * files can be discovered by name, and a shared ownership semantics so that + * shared files survive until the last user detaches. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "catalog/pg_tablespace.h" +#include "commands/tablespace.h" +#include "common/hashfn.h" +#include "miscadmin.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/sharedfileset.h" +#include "utils/builtins.h" + +static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum); + +/* + * Initialize a space for temporary files that can be opened by other backends. + * Other backends must attach to it before accessing it. Associate this + * SharedFileSet with 'seg'. Any contained files will be deleted when the + * last backend detaches. + * + * Under the covers the set is one or more directories which will eventually + * be deleted. + */ +void +SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg) +{ + /* Initialize the shared fileset specific members. */ + SpinLockInit(&fileset->mutex); + fileset->refcnt = 1; + + /* Initialize the fileset. */ + FileSetInit(&fileset->fs); + + /* Register our cleanup callback. */ + if (seg) + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Attach to a set of directories that was created with SharedFileSetInit. + */ +void +SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg) +{ + bool success; + + SpinLockAcquire(&fileset->mutex); + if (fileset->refcnt == 0) + success = false; + else + { + ++fileset->refcnt; + success = true; + } + SpinLockRelease(&fileset->mutex); + + if (!success) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not attach to a SharedFileSet that is already destroyed"))); + + /* Register our cleanup callback. */ + on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset)); +} + +/* + * Delete all files in the set. + */ +void +SharedFileSetDeleteAll(SharedFileSet *fileset) +{ + FileSetDeleteAll(&fileset->fs); +} + +/* + * Callback function that will be invoked when this backend detaches from a + * DSM segment holding a SharedFileSet that it has created or attached to. If + * we are the last to detach, then try to remove the directories and + * everything in them. We can't raise an error on failures, because this runs + * in error cleanup paths. + */ +static void +SharedFileSetOnDetach(dsm_segment *segment, Datum datum) +{ + bool unlink_all = false; + SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum); + + SpinLockAcquire(&fileset->mutex); + Assert(fileset->refcnt > 0); + if (--fileset->refcnt == 0) + unlink_all = true; + SpinLockRelease(&fileset->mutex); + + /* + * If we are the last to detach, we delete the directory in all + * tablespaces. Note that we are still actually attached for the rest of + * this function so we can safely access its data. 
+ */ + if (unlink_all) + FileSetDeleteAll(&fileset->fs); +} diff --git a/src/backend/storage/freespace/Makefile b/src/backend/storage/freespace/Makefile new file mode 100644 index 0000000..ac0fa8b --- /dev/null +++ b/src/backend/storage/freespace/Makefile @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/freespace +# +# IDENTIFICATION +# src/backend/storage/freespace/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/freespace +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + freespace.o \ + fsmpage.o \ + indexfsm.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README new file mode 100644 index 0000000..e7ff23b --- /dev/null +++ b/src/backend/storage/freespace/README @@ -0,0 +1,196 @@ +src/backend/storage/freespace/README + +Free Space Map +-------------- + +The purpose of the free space map is to quickly locate a page with enough +free space to hold a tuple to be stored; or to determine that no such page +exists and the relation must be extended by one page. As of PostgreSQL 8.4 +each relation has its own, extensible free space map stored in a separate +"fork" of its relation. This eliminates the disadvantages of the former +fixed-size FSM. + +It is important to keep the map small so that it can be searched rapidly. +Therefore, we don't attempt to record the exact free space on a page. +We allocate one map byte to each page, allowing us to record free space +at a granularity of 1/256th of a page. Another way to say it is that +the stored value is the free space divided by BLCKSZ/256 (rounding down). +We assume that the free space must always be less than BLCKSZ, since +all pages have some overhead; so the maximum map value is 255. + +To assist in fast searching, the map isn't simply an array of per-page +entries, but has a tree structure above those entries. There is a tree +structure of pages, and a tree structure within each page, as described +below. + +FSM page structure +------------------ + +Within each FSM page, we use a binary tree structure where leaf nodes store +the amount of free space on heap pages (or lower level FSM pages, see +"Higher-level structure" below), with one leaf node per heap page. A non-leaf +node stores the max amount of free space on any of its children. + +For example: + + 4 + 4 2 +3 4 0 2 <- This level represents heap pages + +We need two basic operations: search and update. + +To search for a page with X amount of free space, traverse down the tree +along a path where n >= X, until you hit the bottom. If both children of a +node satisfy the condition, you can pick either one arbitrarily. + +To update the amount of free space on a page to X, first update the leaf node +corresponding to the heap page, then "bubble up" the change to upper nodes, +by walking up to each parent and recomputing its value as the max of its +two children. Repeat until reaching the root or a parent whose value +doesn't change. + +This data structure has a couple of nice properties: +- to discover that there is no page with X bytes of free space, you only + need to look at the root node +- by varying which child to traverse to in the search algorithm, when you have + a choice, we can implement various strategies, like preferring pages closer + to a given page, or spreading the load across the table. 
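+
+As a rough illustration (the real implementation is fsm_set_avail() in
+fsmpage.c), the update operation can be sketched in C over an array holding
+the tree, with the root at index 0 and the children of node i at 2i+1 and
+2i+2, assuming for simplicity a perfect tree with no missing nodes:
+
+    /* Sketch only: set leaf 'slot' to 'value' and bubble the change up. */
+    static void
+    fsm_update_sketch(uint8 *nodes, int nonleaf_nodes, int slot, uint8 value)
+    {
+        int     nodeno = nonleaf_nodes + slot;
+
+        nodes[nodeno] = value;
+        while (nodeno > 0)
+        {
+            int     parent = (nodeno - 1) / 2;
+            uint8   lval = nodes[2 * parent + 1];
+            uint8   rval = nodes[2 * parent + 2];
+            uint8   newval = (lval > rval) ? lval : rval;
+
+            if (nodes[parent] == newval)
+                break;          /* no further change to propagate */
+            nodes[parent] = newval;
+            nodeno = parent;
+        }
+    }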
+ +Higher-level routines that use FSM pages access them through the fsm_set_avail() +and fsm_search_avail() functions. The interface to those functions hides the +page's internal tree structure, treating the FSM page as a black box that has +a certain number of "slots" for storing free space information. (However, +the higher routines have to be aware of the tree structure of the whole map.) + +The binary tree is stored on each FSM page as an array. Because the page +header takes some space on a page, the binary tree isn't perfect. That is, +a few right-most leaf nodes are missing, and there are some useless non-leaf +nodes at the right. So the tree looks something like this: + + 0 + 1 2 + 3 4 5 6 +7 8 9 A B + +where the numbers denote each node's position in the array. Note that the +tree is guaranteed complete above the leaf level; only some leaf nodes are +missing. This is reflected in the number of usable "slots" per page not +being an exact power of 2. + +A FSM page also has a next slot pointer, fp_next_slot, that determines where +to start the next search for free space within that page. The reason for that +is to spread out the pages that are returned by FSM searches. When several +backends are concurrently inserting into a relation, contention can be avoided +by having them insert into different pages. But it is also desirable to fill +up pages in sequential order, to get the benefit of OS prefetching and batched +writes. The FSM is responsible for making that happen, and the next slot +pointer helps provide the desired behavior. + +Higher-level structure +---------------------- + +To scale up the data structure described above beyond a single page, we +maintain a similar tree-structure across pages. Leaf nodes in higher level +pages correspond to lower level FSM pages. The root node within each page +has the same value as the corresponding leaf node on its parent page. + +The root page is always stored at physical block 0. + +For example, assuming each FSM page can hold information about 4 pages (in +reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ), +we get a disk layout like this: + + 0 <-- page 0 at level 2 (root page) + 0 <-- page 0 at level 1 + 0 <-- page 0 at level 0 + 1 <-- page 1 at level 0 + 2 <-- ... + 3 + 1 <-- page 1 at level 1 + 4 + 5 + 6 + 7 + 2 + 8 + 9 + 10 + 11 + 3 + 12 + 13 + 14 + 15 + +where the numbers are page numbers *at that level*, starting from 0. + +To find the physical block # corresponding to leaf page n, we need to +count the number of leaf and upper-level pages preceding page n. +This turns out to be + +y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1 + +where F is the fanout (4 in the above example). The first term n is the number +of preceding leaf pages, the second term is the number of pages at level 1, +and so forth. + +To keep things simple, the tree is always constant height. To cover the +maximum relation size of 2^32-1 blocks, three levels is enough with the default +BLCKSZ (4000^3 > 2^32). + +Addressing +---------- + +The higher-level routines operate on "logical" addresses, consisting of +- level, +- logical page number, and +- slot (if applicable) + +Bottom level FSM pages have level of 0, the level above that 1, and root 2. +As in the diagram above, logical page number is the page number at that level, +starting from 0. + +Locking +------- + +When traversing down to search for free space, only one page is locked at a +time: the parent page is released before locking the child. 
If the child page +is concurrently modified, and there no longer is free space on the child page +when you land on it, you need to start from scratch (after correcting the +parent page, so that you don't get into an infinite loop). + +We use shared buffer locks when searching, but exclusive buffer lock when +updating a page. However, the next slot search pointer is updated during +searches even though we have only a shared lock. fp_next_slot is just a hint +and we can easily reset it if it gets corrupted; so it seems better to accept +some risk of that type than to pay the overhead of exclusive locking. + +Recovery +-------- + +The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of +self-correcting measures to repair possible corruption. As a result when +we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint() +rather than MarkBufferDirty(). + +First of all, whenever a value is set on an FSM page, the root node of the +page is compared against the new value after bubbling up the change is +finished. It should be greater than or equal to the value just set, or we +have a corrupted page, with a parent somewhere with too small a value. +Secondly, if we detect corrupted pages while we search, traversing down +the tree. That check will notice if a parent node is set to too high a value. +In both cases, the upper nodes on the page are immediately rebuilt, fixing +the corruption so far as that page is concerned. + +VACUUM updates all the bottom-level FSM pages with the correct amount of free +space on corresponding heap pages, as it proceeds through the heap. This +goes through fsm_set_avail(), so that the upper nodes on those pages are +immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range] +to propagate the new free-space info into the upper pages of the FSM tree. + +TODO +---- + +- fastroot to avoid traversing upper nodes with just 1 child +- use a different system for tables that fit into one FSM page, with a + mechanism to switch to the real thing as it grows. diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c new file mode 100644 index 0000000..fb9440f --- /dev/null +++ b/src/backend/storage/freespace/freespace.c @@ -0,0 +1,865 @@ +/*------------------------------------------------------------------------- + * + * freespace.c + * POSTGRES free space map for quickly finding free space in relations + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/freespace/freespace.c + * + * + * NOTES: + * + * Free Space Map keeps track of the amount of free space on pages, and + * allows quickly searching for a page with enough free space. The FSM is + * stored in a dedicated relation fork of all heap relations, and those + * index access methods that need it (see also indexfsm.c). See README for + * more information. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/xloginsert.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "storage/freespace.h" +#include "storage/fsm_internals.h" +#include "storage/lmgr.h" +#include "storage/smgr.h" + + +/* + * We use just one byte to store the amount of free space on a page, so we + * divide the amount of free space a page can have into 256 different + * categories. 
The highest category, 255, represents a page with at least + * MaxFSMRequestSize bytes of free space, and the second highest category + * represents the range from 254 * FSM_CAT_STEP, inclusive, to + * MaxFSMRequestSize, exclusive. + * + * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming + * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the + * categories look like this: + * + * + * Range Category + * 0 - 31 0 + * 32 - 63 1 + * ... ... ... + * 8096 - 8127 253 + * 8128 - 8163 254 + * 8164 - 8192 255 + * + * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize + * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize + * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize + * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a + * completely empty page, that would mean that we could never satisfy a + * request of exactly MaxFSMRequestSize bytes. + */ +#define FSM_CATEGORIES 256 +#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES) +#define MaxFSMRequestSize MaxHeapTupleSize + +/* + * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks, + * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise, + * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice, + * this means that 4096 bytes is the smallest BLCKSZ that we can get away + * with a 3-level tree, and 512 is the smallest we support. + */ +#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4) + +#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1) +#define FSM_BOTTOM_LEVEL 0 + +/* + * The internal FSM routines work on a logical addressing scheme. Each + * level of the tree can be thought of as a separately addressable file. + */ +typedef struct +{ + int level; /* level */ + int logpageno; /* page number within the level */ +} FSMAddress; + +/* Address of the root page. */ +static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0}; + +/* functions to navigate the tree */ +static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot); +static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot); +static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot); +static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot); +static BlockNumber fsm_logical_to_physical(FSMAddress addr); + +static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend); +static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks); + +/* functions to convert amount of free space to a FSM category */ +static uint8 fsm_space_avail_to_cat(Size avail); +static uint8 fsm_space_needed_to_cat(Size needed); +static Size fsm_space_cat_to_avail(uint8 cat); + +/* workhorse functions for various operations */ +static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, + uint8 newValue, uint8 minValue); +static BlockNumber fsm_search(Relation rel, uint8 min_cat); +static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr, + BlockNumber start, BlockNumber end, + bool *eof_p); + + +/******** Public API ********/ + +/* + * GetPageWithFreeSpace - try to find a page in the given relation with + * at least the specified amount of free space. + * + * If successful, return the block number; if not, return InvalidBlockNumber. + * + * The caller must be prepared for the possibility that the returned page + * will turn out to have too little space available by the time the caller + * gets a lock on it. 
In that case, the caller should report the actual + * amount of free space available on that page and then try again (see + * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned, + * extend the relation. + */ +BlockNumber +GetPageWithFreeSpace(Relation rel, Size spaceNeeded) +{ + uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded); + + return fsm_search(rel, min_cat); +} + +/* + * RecordAndGetPageWithFreeSpace - update info about a page and try again. + * + * We provide this combo form to save some locking overhead, compared to + * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's + * also some effort to return a page close to the old page; if there's a + * page with enough free space on the same FSM page where the old one page + * is located, it is preferred. + */ +BlockNumber +RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage, + Size oldSpaceAvail, Size spaceNeeded) +{ + int old_cat = fsm_space_avail_to_cat(oldSpaceAvail); + int search_cat = fsm_space_needed_to_cat(spaceNeeded); + FSMAddress addr; + uint16 slot; + int search_slot; + + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(oldPage, &slot); + + search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat); + + /* + * If fsm_set_and_search found a suitable new block, return that. + * Otherwise, search as usual. + */ + if (search_slot != -1) + return fsm_get_heap_blk(addr, search_slot); + else + return fsm_search(rel, search_cat); +} + +/* + * RecordPageWithFreeSpace - update info about a page. + * + * Note that if the new spaceAvail value is higher than the old value stored + * in the FSM, the space might not become visible to searchers until the next + * FreeSpaceMapVacuum call, which updates the upper level pages. + */ +void +RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail) +{ + int new_cat = fsm_space_avail_to_cat(spaceAvail); + FSMAddress addr; + uint16 slot; + + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(heapBlk, &slot); + + fsm_set_and_search(rel, addr, slot, new_cat, 0); +} + +/* + * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in + * WAL replay + */ +void +XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk, + Size spaceAvail) +{ + int new_cat = fsm_space_avail_to_cat(spaceAvail); + FSMAddress addr; + uint16 slot; + BlockNumber blkno; + Buffer buf; + Page page; + + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(heapBlk, &slot); + blkno = fsm_logical_to_physical(addr); + + /* If the page doesn't exist already, extend */ + buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno, + RBM_ZERO_ON_ERROR, InvalidBuffer); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buf); + if (PageIsNew(page)) + PageInit(page, BLCKSZ, 0); + + if (fsm_set_avail(page, slot, new_cat)) + MarkBufferDirtyHint(buf, false); + UnlockReleaseBuffer(buf); +} + +/* + * GetRecordedFreeSpace - return the amount of free space on a particular page, + * according to the FSM. 
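+ *
+ * The result is quantized: the stored category is converted back with
+ * fsm_space_cat_to_avail(), which returns the lower bound of the category's
+ * range.  For example, with the default 8 kB BLCKSZ the step is
+ * BLCKSZ / 256 = 32 bytes, so a page whose FSM byte is 10 is reported as
+ * having 320 bytes free even if it actually has anything up to 351.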
+ */ +Size +GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk) +{ + FSMAddress addr; + uint16 slot; + Buffer buf; + uint8 cat; + + /* Get the location of the FSM byte representing the heap block */ + addr = fsm_get_location(heapBlk, &slot); + + buf = fsm_readbuf(rel, addr, false); + if (!BufferIsValid(buf)) + return 0; + cat = fsm_get_avail(BufferGetPage(buf), slot); + ReleaseBuffer(buf); + + return fsm_space_cat_to_avail(cat); +} + +/* + * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation. + * + * nblocks is the new size of the heap. + * + * Return the number of blocks of new FSM. + * If it's InvalidBlockNumber, there is nothing to truncate; + * otherwise the caller is responsible for calling smgrtruncate() + * to truncate the FSM pages, and FreeSpaceMapVacuumRange() + * to update upper-level pages in the FSM. + */ +BlockNumber +FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks) +{ + BlockNumber new_nfsmblocks; + FSMAddress first_removed_address; + uint16 first_removed_slot; + Buffer buf; + + /* + * If no FSM has been created yet for this relation, there's nothing to + * truncate. + */ + if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM)) + return InvalidBlockNumber; + + /* Get the location in the FSM of the first removed heap block */ + first_removed_address = fsm_get_location(nblocks, &first_removed_slot); + + /* + * Zero out the tail of the last remaining FSM page. If the slot + * representing the first removed heap block is at a page boundary, as the + * first slot on the FSM page that first_removed_address points to, we can + * just truncate that page altogether. + */ + if (first_removed_slot > 0) + { + buf = fsm_readbuf(rel, first_removed_address, false); + if (!BufferIsValid(buf)) + return InvalidBlockNumber; /* nothing to do; the FSM was already + * smaller */ + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + /* NO EREPORT(ERROR) from here till changes are logged */ + START_CRIT_SECTION(); + + fsm_truncate_avail(BufferGetPage(buf), first_removed_slot); + + /* + * Truncation of a relation is WAL-logged at a higher-level, and we + * will be called at WAL replay. But if checksums are enabled, we need + * to still write a WAL record to protect against a torn page, if the + * page is flushed to disk before the truncation WAL record. We cannot + * use MarkBufferDirtyHint here, because that will not dirty the page + * during recovery. + */ + MarkBufferDirty(buf); + if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded()) + log_newpage_buffer(buf, false); + + END_CRIT_SECTION(); + + UnlockReleaseBuffer(buf); + + new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1; + } + else + { + new_nfsmblocks = fsm_logical_to_physical(first_removed_address); + if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks) + return InvalidBlockNumber; /* nothing to do; the FSM was already + * smaller */ + } + + return new_nfsmblocks; +} + +/* + * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM + * + * We assume that the bottom-level pages have already been updated with + * new free-space information. 
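+ *
+ * A typical caller therefore first records per-page free space and then
+ * propagates it upwards, roughly (space_on_page() is a hypothetical
+ * stand-in for however the caller measures free space):
+ *
+ *     for (blkno = start; blkno < end; blkno++)
+ *         RecordPageWithFreeSpace(rel, blkno, space_on_page(blkno));
+ *     FreeSpaceMapVacuumRange(rel, start, end);
+ *
+ * Until that second step runs, the new values may not be visible to
+ * searchers; see the note at RecordPageWithFreeSpace().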
+ */ +void +FreeSpaceMapVacuum(Relation rel) +{ + bool dummy; + + /* Recursively scan the tree, starting at the root */ + (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, + (BlockNumber) 0, InvalidBlockNumber, + &dummy); +} + +/* + * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM + * + * As above, but assume that only heap pages between start and end-1 inclusive + * have new free-space information, so update only the upper-level slots + * covering that block range. end == InvalidBlockNumber is equivalent to + * "all the rest of the relation". + */ +void +FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end) +{ + bool dummy; + + /* Recursively scan the tree, starting at the root */ + if (end > start) + (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy); +} + +/******** Internal routines ********/ + +/* + * Return category corresponding x bytes of free space + */ +static uint8 +fsm_space_avail_to_cat(Size avail) +{ + int cat; + + Assert(avail < BLCKSZ); + + if (avail >= MaxFSMRequestSize) + return 255; + + cat = avail / FSM_CAT_STEP; + + /* + * The highest category, 255, is reserved for MaxFSMRequestSize bytes or + * more. + */ + if (cat > 254) + cat = 254; + + return (uint8) cat; +} + +/* + * Return the lower bound of the range of free space represented by given + * category. + */ +static Size +fsm_space_cat_to_avail(uint8 cat) +{ + /* The highest category represents exactly MaxFSMRequestSize bytes. */ + if (cat == 255) + return MaxFSMRequestSize; + else + return cat * FSM_CAT_STEP; +} + +/* + * Which category does a page need to have, to accommodate x bytes of data? + * While fsm_space_avail_to_cat() rounds down, this needs to round up. + */ +static uint8 +fsm_space_needed_to_cat(Size needed) +{ + int cat; + + /* Can't ask for more space than the highest category represents */ + if (needed > MaxFSMRequestSize) + elog(ERROR, "invalid FSM request size %zu", needed); + + if (needed == 0) + return 1; + + cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP; + + if (cat > 255) + cat = 255; + + return (uint8) cat; +} + +/* + * Returns the physical block number of a FSM page + */ +static BlockNumber +fsm_logical_to_physical(FSMAddress addr) +{ + BlockNumber pages; + int leafno; + int l; + + /* + * Calculate the logical page number of the first leaf page below the + * given page. + */ + leafno = addr.logpageno; + for (l = 0; l < addr.level; l++) + leafno *= SlotsPerFSMPage; + + /* Count upper level nodes required to address the leaf page */ + pages = 0; + for (l = 0; l < FSM_TREE_DEPTH; l++) + { + pages += leafno + 1; + leafno /= SlotsPerFSMPage; + } + + /* + * If the page we were asked for wasn't at the bottom level, subtract the + * additional lower level pages we counted above. + */ + pages -= addr.level; + + /* Turn the page count into 0-based block number */ + return pages - 1; +} + +/* + * Return the FSM location corresponding to given heap block. + */ +static FSMAddress +fsm_get_location(BlockNumber heapblk, uint16 *slot) +{ + FSMAddress addr; + + addr.level = FSM_BOTTOM_LEVEL; + addr.logpageno = heapblk / SlotsPerFSMPage; + *slot = heapblk % SlotsPerFSMPage; + + return addr; +} + +/* + * Return the heap block number corresponding to given location in the FSM. 
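+ *
+ * This is the inverse of fsm_get_location().  For example, taking the
+ * README's round figure of 4000 slots per FSM page, heap block 5000 maps
+ * to level-0 page 1, slot 1000, and this function maps that back:
+ * 1 * 4000 + 1000 = 5000.  (Relatedly, fsm_logical_to_physical() above
+ * implements the README's counting formula; with the README's toy fanout
+ * of 4, leaf page 4 lands at physical block 4 + (4/4 + 1) + 1 = 7, matching
+ * the layout diagram there.)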
+ */ +static BlockNumber +fsm_get_heap_blk(FSMAddress addr, uint16 slot) +{ + Assert(addr.level == FSM_BOTTOM_LEVEL); + return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot; +} + +/* + * Given a logical address of a child page, get the logical page number of + * the parent, and the slot within the parent corresponding to the child. + */ +static FSMAddress +fsm_get_parent(FSMAddress child, uint16 *slot) +{ + FSMAddress parent; + + Assert(child.level < FSM_ROOT_LEVEL); + + parent.level = child.level + 1; + parent.logpageno = child.logpageno / SlotsPerFSMPage; + *slot = child.logpageno % SlotsPerFSMPage; + + return parent; +} + +/* + * Given a logical address of a parent page and a slot number, get the + * logical address of the corresponding child page. + */ +static FSMAddress +fsm_get_child(FSMAddress parent, uint16 slot) +{ + FSMAddress child; + + Assert(parent.level > FSM_BOTTOM_LEVEL); + + child.level = parent.level - 1; + child.logpageno = parent.logpageno * SlotsPerFSMPage + slot; + + return child; +} + +/* + * Read a FSM page. + * + * If the page doesn't exist, InvalidBuffer is returned, or if 'extend' is + * true, the FSM file is extended. + */ +static Buffer +fsm_readbuf(Relation rel, FSMAddress addr, bool extend) +{ + BlockNumber blkno = fsm_logical_to_physical(addr); + Buffer buf; + SMgrRelation reln; + + /* + * Caution: re-using this smgr pointer could fail if the relcache entry + * gets closed. It's safe as long as we only do smgr-level operations + * between here and the last use of the pointer. + */ + reln = RelationGetSmgr(rel); + + /* + * If we haven't cached the size of the FSM yet, check it first. Also + * recheck if the requested block seems to be past end, since our cached + * value might be stale. (We send smgr inval messages on truncation, but + * not on extension.) + */ + if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber || + blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM]) + { + /* Invalidate the cache so smgrnblocks asks the kernel. */ + reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber; + if (smgrexists(reln, FSM_FORKNUM)) + smgrnblocks(reln, FSM_FORKNUM); + else + reln->smgr_cached_nblocks[FSM_FORKNUM] = 0; + } + + /* + * For reading we use ZERO_ON_ERROR mode, and initialize the page if + * necessary. The FSM information is not accurate anyway, so it's better + * to clear corrupt pages than error out. Since the FSM changes are not + * WAL-logged, the so-called torn page problem on crash can lead to pages + * with corrupt headers, for example. + * + * We use the same path below to initialize pages when extending the + * relation, as a concurrent extension can end up with vm_extend() + * returning an already-initialized page. + */ + if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM]) + { + if (extend) + buf = fsm_extend(rel, blkno + 1); + else + return InvalidBuffer; + } + else + buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL); + + /* + * Initializing the page when needed is trickier than it looks, because of + * the possibility of multiple backends doing this concurrently, and our + * desire to not uselessly take the buffer lock in the normal path where + * the page is OK. We must take the lock to initialize the page, so + * recheck page newness after we have the lock, in case someone else + * already did it. 
Also, because we initially check PageIsNew with no + * lock, it's possible to fall through and return the buffer while someone + * else is still initializing the page (i.e., we might see pd_upper as set + * but other page header fields are still zeroes). This is harmless for + * callers that will take a buffer lock themselves, but some callers + * inspect the page without any lock at all. The latter is OK only so + * long as it doesn't depend on the page header having correct contents. + * Current usage is safe because PageGetContents() does not require that. + */ + if (PageIsNew(BufferGetPage(buf))) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + if (PageIsNew(BufferGetPage(buf))) + PageInit(BufferGetPage(buf), BLCKSZ, 0); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + return buf; +} + +/* + * Ensure that the FSM fork is at least fsm_nblocks long, extending + * it if necessary with empty pages. And by empty, I mean pages filled + * with zeros, meaning there's no free space. + */ +static Buffer +fsm_extend(Relation rel, BlockNumber fsm_nblocks) +{ + return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL, + EB_CREATE_FORK_IF_NEEDED | + EB_CLEAR_SIZE_CACHE, + fsm_nblocks, + RBM_ZERO_ON_ERROR); +} + +/* + * Set value in given FSM page and slot. + * + * If minValue > 0, the updated page is also searched for a page with at + * least minValue of free space. If one is found, its slot number is + * returned, -1 otherwise. + */ +static int +fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot, + uint8 newValue, uint8 minValue) +{ + Buffer buf; + Page page; + int newslot = -1; + + buf = fsm_readbuf(rel, addr, true); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buf); + + if (fsm_set_avail(page, slot, newValue)) + MarkBufferDirtyHint(buf, false); + + if (minValue != 0) + { + /* Search while we still hold the lock */ + newslot = fsm_search_avail(buf, minValue, + addr.level == FSM_BOTTOM_LEVEL, + true); + } + + UnlockReleaseBuffer(buf); + + return newslot; +} + +/* + * Search the tree for a heap page with at least min_cat of free space + */ +static BlockNumber +fsm_search(Relation rel, uint8 min_cat) +{ + int restarts = 0; + FSMAddress addr = FSM_ROOT_ADDRESS; + + for (;;) + { + int slot; + Buffer buf; + uint8 max_avail = 0; + + /* Read the FSM page. */ + buf = fsm_readbuf(rel, addr, false); + + /* Search within the page */ + if (BufferIsValid(buf)) + { + LockBuffer(buf, BUFFER_LOCK_SHARE); + slot = fsm_search_avail(buf, min_cat, + (addr.level == FSM_BOTTOM_LEVEL), + false); + if (slot == -1) + max_avail = fsm_get_max_avail(BufferGetPage(buf)); + UnlockReleaseBuffer(buf); + } + else + slot = -1; + + if (slot != -1) + { + /* + * Descend the tree, or return the found block if we're at the + * bottom. + */ + if (addr.level == FSM_BOTTOM_LEVEL) + return fsm_get_heap_blk(addr, slot); + + addr = fsm_get_child(addr, slot); + } + else if (addr.level == FSM_ROOT_LEVEL) + { + /* + * At the root, failure means there's no page with enough free + * space in the FSM. Give up. + */ + return InvalidBlockNumber; + } + else + { + uint16 parentslot; + FSMAddress parent; + + /* + * At lower level, failure can happen if the value in the upper- + * level node didn't reflect the value on the lower page. Update + * the upper node, to avoid falling into the same trap again, and + * start over. + * + * There's a race condition here, if another backend updates this + * page right after we release it, and gets the lock on the parent + * page before us. 
We'll then update the parent page with the now + * stale information we had. It's OK, because it should happen + * rarely, and will be fixed by the next vacuum. + */ + parent = fsm_get_parent(addr, &parentslot); + fsm_set_and_search(rel, parent, parentslot, max_avail, 0); + + /* + * If the upper pages are badly out of date, we might need to loop + * quite a few times, updating them as we go. Any inconsistencies + * should eventually be corrected and the loop should end. Looping + * indefinitely is nevertheless scary, so provide an emergency + * valve. + */ + if (restarts++ > 10000) + return InvalidBlockNumber; + + /* Start search all over from the root */ + addr = FSM_ROOT_ADDRESS; + } + } +} + + +/* + * Recursive guts of FreeSpaceMapVacuum + * + * Examine the FSM page indicated by addr, as well as its children, updating + * upper-level nodes that cover the heap block range from start to end-1. + * (It's okay if end is beyond the actual end of the map.) + * Return the maximum freespace value on this page. + * + * If addr is past the end of the FSM, set *eof_p to true and return 0. + * + * This traverses the tree in depth-first order. The tree is stored + * physically in depth-first order, so this should be pretty I/O efficient. + */ +static uint8 +fsm_vacuum_page(Relation rel, FSMAddress addr, + BlockNumber start, BlockNumber end, + bool *eof_p) +{ + Buffer buf; + Page page; + uint8 max_avail; + + /* Read the page if it exists, or return EOF */ + buf = fsm_readbuf(rel, addr, false); + if (!BufferIsValid(buf)) + { + *eof_p = true; + return 0; + } + else + *eof_p = false; + + page = BufferGetPage(buf); + + /* + * If we're above the bottom level, recurse into children, and fix the + * information stored about them at this level. + */ + if (addr.level > FSM_BOTTOM_LEVEL) + { + FSMAddress fsm_start, + fsm_end; + uint16 fsm_start_slot, + fsm_end_slot; + int slot, + start_slot, + end_slot; + bool eof = false; + + /* + * Compute the range of slots we need to update on this page, given + * the requested range of heap blocks to consider. The first slot to + * update is the one covering the "start" block, and the last slot is + * the one covering "end - 1". (Some of this work will be duplicated + * in each recursive call, but it's cheap enough to not worry about.) + */ + fsm_start = fsm_get_location(start, &fsm_start_slot); + fsm_end = fsm_get_location(end - 1, &fsm_end_slot); + + while (fsm_start.level < addr.level) + { + fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot); + fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot); + } + Assert(fsm_start.level == addr.level); + + if (fsm_start.logpageno == addr.logpageno) + start_slot = fsm_start_slot; + else if (fsm_start.logpageno > addr.logpageno) + start_slot = SlotsPerFSMPage; /* shouldn't get here... */ + else + start_slot = 0; + + if (fsm_end.logpageno == addr.logpageno) + end_slot = fsm_end_slot; + else if (fsm_end.logpageno > addr.logpageno) + end_slot = SlotsPerFSMPage - 1; + else + end_slot = -1; /* shouldn't get here... 
*/ + + for (slot = start_slot; slot <= end_slot; slot++) + { + int child_avail; + + CHECK_FOR_INTERRUPTS(); + + /* After we hit end-of-file, just clear the rest of the slots */ + if (!eof) + child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot), + start, end, + &eof); + else + child_avail = 0; + + /* Update information about the child */ + if (fsm_get_avail(page, slot) != child_avail) + { + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + fsm_set_avail(page, slot, child_avail); + MarkBufferDirtyHint(buf, false); + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + } + } + } + + /* Now get the maximum value on the page, to return to caller */ + max_avail = fsm_get_max_avail(page); + + /* + * Reset the next slot pointer. This encourages the use of low-numbered + * pages, increasing the chances that a later vacuum can truncate the + * relation. We don't bother with a lock here, nor with marking the page + * dirty if it wasn't already, since this is just a hint. + */ + ((FSMPage) PageGetContents(page))->fp_next_slot = 0; + + ReleaseBuffer(buf); + + return max_avail; +} diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c new file mode 100644 index 0000000..0cfb2ae --- /dev/null +++ b/src/backend/storage/freespace/fsmpage.c @@ -0,0 +1,374 @@ +/*------------------------------------------------------------------------- + * + * fsmpage.c + * routines to search and manipulate one FSM page. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/freespace/fsmpage.c + * + * NOTES: + * + * The public functions in this file form an API that hides the internal + * structure of a FSM page. This allows freespace.c to treat each FSM page + * as a black box with SlotsPerPage "slots". fsm_set_avail() and + * fsm_get_avail() let you get/set the value of a slot, and + * fsm_search_avail() lets you search for a slot with value >= X. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/bufmgr.h" +#include "storage/fsm_internals.h" + +/* Macros to navigate the tree within a page. Root has index zero. */ +#define leftchild(x) (2 * (x) + 1) +#define rightchild(x) (2 * (x) + 2) +#define parentof(x) (((x) - 1) / 2) + +/* + * Find right neighbor of x, wrapping around within the level + */ +static int +rightneighbor(int x) +{ + /* + * Move right. This might wrap around, stepping to the leftmost node at + * the next level. + */ + x++; + + /* + * Check if we stepped to the leftmost node at next level, and correct if + * so. The leftmost nodes at each level are numbered x = 2^level - 1, so + * check if (x + 1) is a power of two, using a standard + * twos-complement-arithmetic trick. + */ + if (((x + 1) & x) == 0) + x = parentof(x); + + return x; +} + +/* + * Sets the value of a slot on page. Returns true if the page was modified. + * + * The caller must hold an exclusive lock on the page. 
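+ *
+ * The typical calling pattern (see e.g. fsm_set_and_search() in
+ * freespace.c) is, roughly:
+ *
+ *     LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ *     if (fsm_set_avail(BufferGetPage(buf), slot, value))
+ *         MarkBufferDirtyHint(buf, false);
+ *     UnlockReleaseBuffer(buf);
+ *
+ * MarkBufferDirtyHint() rather than MarkBufferDirty() is used because FSM
+ * changes are not WAL-logged; see the Recovery notes in the README.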
+ */ +bool +fsm_set_avail(Page page, int slot, uint8 value) +{ + int nodeno = NonLeafNodesPerPage + slot; + FSMPage fsmpage = (FSMPage) PageGetContents(page); + uint8 oldvalue; + + Assert(slot < LeafNodesPerPage); + + oldvalue = fsmpage->fp_nodes[nodeno]; + + /* If the value hasn't changed, we don't need to do anything */ + if (oldvalue == value && value <= fsmpage->fp_nodes[0]) + return false; + + fsmpage->fp_nodes[nodeno] = value; + + /* + * Propagate up, until we hit the root or a node that doesn't need to be + * updated. + */ + do + { + uint8 newvalue = 0; + int lchild; + int rchild; + + nodeno = parentof(nodeno); + lchild = leftchild(nodeno); + rchild = lchild + 1; + + newvalue = fsmpage->fp_nodes[lchild]; + if (rchild < NodesPerPage) + newvalue = Max(newvalue, + fsmpage->fp_nodes[rchild]); + + oldvalue = fsmpage->fp_nodes[nodeno]; + if (oldvalue == newvalue) + break; + + fsmpage->fp_nodes[nodeno] = newvalue; + } while (nodeno > 0); + + /* + * sanity check: if the new value is (still) higher than the value at the + * top, the tree is corrupt. If so, rebuild. + */ + if (value > fsmpage->fp_nodes[0]) + fsm_rebuild_page(page); + + return true; +} + +/* + * Returns the value of given slot on page. + * + * Since this is just a read-only access of a single byte, the page doesn't + * need to be locked. + */ +uint8 +fsm_get_avail(Page page, int slot) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + + Assert(slot < LeafNodesPerPage); + + return fsmpage->fp_nodes[NonLeafNodesPerPage + slot]; +} + +/* + * Returns the value at the root of a page. + * + * Since this is just a read-only access of a single byte, the page doesn't + * need to be locked. + */ +uint8 +fsm_get_max_avail(Page page) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + + return fsmpage->fp_nodes[0]; +} + +/* + * Searches for a slot with category at least minvalue. + * Returns slot number, or -1 if none found. + * + * The caller must hold at least a shared lock on the page, and this + * function can unlock and lock the page again in exclusive mode if it + * needs to be updated. exclusive_lock_held should be set to true if the + * caller is already holding an exclusive lock, to avoid extra work. + * + * If advancenext is false, fp_next_slot is set to point to the returned + * slot, and if it's true, to the slot after the returned slot. + */ +int +fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext, + bool exclusive_lock_held) +{ + Page page = BufferGetPage(buf); + FSMPage fsmpage = (FSMPage) PageGetContents(page); + int nodeno; + int target; + uint16 slot; + +restart: + + /* + * Check the root first, and exit quickly if there's no leaf with enough + * free space + */ + if (fsmpage->fp_nodes[0] < minvalue) + return -1; + + /* + * Start search using fp_next_slot. It's just a hint, so check that it's + * sane. (This also handles wrapping around when the prior call returned + * the last slot on the page.) + */ + target = fsmpage->fp_next_slot; + if (target < 0 || target >= LeafNodesPerPage) + target = 0; + target += NonLeafNodesPerPage; + + /*---------- + * Start the search from the target slot. At every step, move one + * node to the right, then climb up to the parent. Stop when we reach + * a node with enough free space (as we must, since the root has enough + * space). + * + * The idea is to gradually expand our "search triangle", that is, all + * nodes covered by the current node, and to be sure we search to the + * right from the start point. At the first step, only the target slot + * is examined. 
When we move up from a left child to its parent, we are + * adding the right-hand subtree of that parent to the search triangle. + * When we move right then up from a right child, we are dropping the + * current search triangle (which we know doesn't contain any suitable + * page) and instead looking at the next-larger-size triangle to its + * right. So we never look left from our original start point, and at + * each step the size of the search triangle doubles, ensuring it takes + * only log2(N) work to search N pages. + * + * The "move right" operation will wrap around if it hits the right edge + * of the tree, so the behavior is still good if we start near the right. + * Note also that the move-and-climb behavior ensures that we can't end + * up on one of the missing nodes at the right of the leaf level. + * + * For example, consider this tree: + * + * 7 + * 7 6 + * 5 7 6 5 + * 4 5 5 7 2 6 5 2 + * T + * + * Assume that the target node is the node indicated by the letter T, + * and we're searching for a node with value of 6 or higher. The search + * begins at T. At the first iteration, we move to the right, then to the + * parent, arriving at the rightmost 5. At the second iteration, we move + * to the right, wrapping around, then climb up, arriving at the 7 on the + * third level. 7 satisfies our search, so we descend down to the bottom, + * following the path of sevens. This is in fact the first suitable page + * to the right of (allowing for wraparound) our start point. + *---------- + */ + nodeno = target; + while (nodeno > 0) + { + if (fsmpage->fp_nodes[nodeno] >= minvalue) + break; + + /* + * Move to the right, wrapping around on same level if necessary, then + * climb up. + */ + nodeno = parentof(rightneighbor(nodeno)); + } + + /* + * We're now at a node with enough free space, somewhere in the middle of + * the tree. Descend to the bottom, following a path with enough free + * space, preferring to move left if there's a choice. + */ + while (nodeno < NonLeafNodesPerPage) + { + int childnodeno = leftchild(nodeno); + + if (childnodeno < NodesPerPage && + fsmpage->fp_nodes[childnodeno] >= minvalue) + { + nodeno = childnodeno; + continue; + } + childnodeno++; /* point to right child */ + if (childnodeno < NodesPerPage && + fsmpage->fp_nodes[childnodeno] >= minvalue) + { + nodeno = childnodeno; + } + else + { + /* + * Oops. The parent node promised that either left or right child + * has enough space, but neither actually did. This can happen in + * case of a "torn page", IOW if we crashed earlier while writing + * the page to disk, and only part of the page made it to disk. + * + * Fix the corruption and restart. + */ + RelFileLocator rlocator; + ForkNumber forknum; + BlockNumber blknum; + + BufferGetTag(buf, &rlocator, &forknum, &blknum); + elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u", + blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber); + + /* make sure we hold an exclusive lock */ + if (!exclusive_lock_held) + { + LockBuffer(buf, BUFFER_LOCK_UNLOCK); + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + exclusive_lock_held = true; + } + fsm_rebuild_page(page); + MarkBufferDirtyHint(buf, false); + goto restart; + } + } + + /* We're now at the bottom level, at a node with enough space. */ + slot = nodeno - NonLeafNodesPerPage; + + /* + * Update the next-target pointer. 
Note that we do this even if we're only + * holding a shared lock, on the grounds that it's better to use a shared + * lock and get a garbled next pointer every now and then, than take the + * concurrency hit of an exclusive lock. + * + * Wrap-around is handled at the beginning of this function. + */ + fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0); + + return slot; +} + +/* + * Sets the available space to zero for all slots numbered >= nslots. + * Returns true if the page was modified. + */ +bool +fsm_truncate_avail(Page page, int nslots) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + uint8 *ptr; + bool changed = false; + + Assert(nslots >= 0 && nslots < LeafNodesPerPage); + + /* Clear all truncated leaf nodes */ + ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots]; + for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++) + { + if (*ptr != 0) + changed = true; + *ptr = 0; + } + + /* Fix upper nodes. */ + if (changed) + fsm_rebuild_page(page); + + return changed; +} + +/* + * Reconstructs the upper levels of a page. Returns true if the page + * was modified. + */ +bool +fsm_rebuild_page(Page page) +{ + FSMPage fsmpage = (FSMPage) PageGetContents(page); + bool changed = false; + int nodeno; + + /* + * Start from the lowest non-leaf level, at last node, working our way + * backwards, through all non-leaf nodes at all levels, up to the root. + */ + for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--) + { + int lchild = leftchild(nodeno); + int rchild = lchild + 1; + uint8 newvalue = 0; + + /* The first few nodes we examine might have zero or one child. */ + if (lchild < NodesPerPage) + newvalue = fsmpage->fp_nodes[lchild]; + + if (rchild < NodesPerPage) + newvalue = Max(newvalue, + fsmpage->fp_nodes[rchild]); + + if (fsmpage->fp_nodes[nodeno] != newvalue) + { + fsmpage->fp_nodes[nodeno] = newvalue; + changed = true; + } + } + + return changed; +} diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c new file mode 100644 index 0000000..fff8f4f --- /dev/null +++ b/src/backend/storage/freespace/indexfsm.c @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * indexfsm.c + * POSTGRES free space map for quickly finding free pages in relations + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/freespace/indexfsm.c + * + * + * NOTES: + * + * This is similar to the FSM used for heap, in freespace.c, but instead + * of tracking the amount of free space on pages, we only track whether + * pages are completely free or in-use. We use the same FSM implementation + * as for heaps, using BLCKSZ - 1 to denote used pages, and 0 for unused. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/freespace.h" +#include "storage/indexfsm.h" + +/* + * Exported routines + */ + +/* + * GetFreeIndexPage - return a free page from the FSM + * + * As a side effect, the page is marked as used in the FSM. 
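+ *
+ * Since index FSM use only ever records the two extreme values (BLCKSZ - 1
+ * via RecordFreeIndexPage(), 0 via RecordUsedIndexPage()), the BLCKSZ / 2
+ * request below can only be satisfied by pages previously recorded as free.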
+ */ +BlockNumber +GetFreeIndexPage(Relation rel) +{ + BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2); + + if (blkno != InvalidBlockNumber) + RecordUsedIndexPage(rel, blkno); + + return blkno; +} + +/* + * RecordFreeIndexPage - mark a page as free in the FSM + */ +void +RecordFreeIndexPage(Relation rel, BlockNumber freeBlock) +{ + RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1); +} + + +/* + * RecordUsedIndexPage - mark a page as used in the FSM + */ +void +RecordUsedIndexPage(Relation rel, BlockNumber usedBlock) +{ + RecordPageWithFreeSpace(rel, usedBlock, 0); +} + +/* + * IndexFreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM + */ +void +IndexFreeSpaceMapVacuum(Relation rel) +{ + FreeSpaceMapVacuum(rel); +} diff --git a/src/backend/storage/freespace/meson.build b/src/backend/storage/freespace/meson.build new file mode 100644 index 0000000..4dd8602 --- /dev/null +++ b/src/backend/storage/freespace/meson.build @@ -0,0 +1,7 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'freespace.c', + 'fsmpage.c', + 'indexfsm.c', +) diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile new file mode 100644 index 0000000..6d5b921 --- /dev/null +++ b/src/backend/storage/ipc/Makefile @@ -0,0 +1,29 @@ +# +# Makefile for storage/ipc +# +# src/backend/storage/ipc/Makefile +# + +subdir = src/backend/storage/ipc +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + barrier.o \ + dsm.o \ + dsm_impl.o \ + ipc.o \ + ipci.o \ + latch.o \ + pmsignal.o \ + procarray.o \ + procsignal.o \ + shm_mq.o \ + shm_toc.o \ + shmem.o \ + signalfuncs.o \ + sinval.o \ + sinvaladt.o \ + standby.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c new file mode 100644 index 0000000..4734dc6 --- /dev/null +++ b/src/backend/storage/ipc/barrier.c @@ -0,0 +1,333 @@ +/*------------------------------------------------------------------------- + * + * barrier.c + * Barriers for synchronizing cooperating processes. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * From Wikipedia[1]: "In parallel computing, a barrier is a type of + * synchronization method. A barrier for a group of threads or processes in + * the source code means any thread/process must stop at this point and cannot + * proceed until all other threads/processes reach this barrier." + * + * This implementation of barriers allows for static sets of participants + * known up front, or dynamic sets of participants which processes can join or + * leave at any time. In the dynamic case, a phase number can be used to + * track progress through a parallel algorithm, and may be necessary to + * synchronize with the current phase of a multi-phase algorithm when a new + * participant joins. In the static case, the phase number is used + * internally, but it isn't strictly necessary for client code to access it + * because the phase can only advance when the declared number of participants + * reaches the barrier, so client code should be in no doubt about the current + * phase of computation at all times. + * + * Consider a parallel algorithm that involves separate phases of computation + * A, B and C where the output of each phase is needed before the next phase + * can begin. 
+ * + * In the case of a static barrier initialized with 4 participants, each + * participant works on phase A, then calls BarrierArriveAndWait to wait until + * all 4 participants have reached that point. When BarrierArriveAndWait + * returns control, each participant can work on B, and so on. Because the + * barrier knows how many participants to expect, the phases of computation + * don't need labels or numbers, since each process's program counter implies + * the current phase. Even if some of the processes are slow to start up and + * begin running phase A, the other participants are expecting them and will + * patiently wait at the barrier. The code could be written as follows: + * + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * + * If the number of participants is not known up front, then a dynamic barrier + * is needed and the number should be set to zero at initialization. New + * complications arise because the number necessarily changes over time as + * participants attach and detach, and therefore phases B, C or even the end + * of processing may be reached before any given participant has started + * running and attached. Therefore the client code must perform an initial + * test of the phase number after attaching, because it needs to find out + * which phase of the algorithm has been reached by any participants that are + * already attached in order to synchronize with that work. Once the program + * counter or some other representation of current progress is synchronized + * with the barrier's phase, normal control flow can be used just as in the + * static case. Our example could be written using a switch statement with + * cases that fall-through, as follows: + * + * phase = BarrierAttach(&barrier); + * switch (phase) + * { + * case PHASE_A: + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_B: + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_C: + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * } + * BarrierDetach(&barrier); + * + * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic + * barriers behave similarly to Java's java.util.concurrent.Phaser. + * + * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science) + * + * IDENTIFICATION + * src/backend/storage/ipc/barrier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "storage/barrier.h" + +static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive); + +/* + * Initialize this barrier. To use a static party size, provide the number of + * participants to wait for at each phase indicating that that number of + * backends is implicitly attached. To use a dynamic party size, specify zero + * here and then use BarrierAttach() and + * BarrierDetach()/BarrierArriveAndDetach() to register and deregister + * participants explicitly. + */ +void +BarrierInit(Barrier *barrier, int participants) +{ + SpinLockInit(&barrier->mutex); + barrier->participants = participants; + barrier->arrived = 0; + barrier->phase = 0; + barrier->elected = 0; + barrier->static_party = participants > 0; + ConditionVariableInit(&barrier->condition_variable); +} + +/* + * Arrive at this barrier, wait for all other attached participants to arrive + * too and then return. Increments the current phase. The caller must be + * attached. 
+ * + * While waiting, pg_stat_activity shows a wait_event_type and wait_event + * controlled by the wait_event_info passed in, which should be a value from + * one of the WaitEventXXX enums defined in pgstat.h. + * + * Return true in one arbitrarily chosen participant. Return false in all + * others. The return code can be used to elect one participant to execute a + * phase of work that must be done serially while other participants wait. + */ +bool +BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info) +{ + bool release = false; + bool elected; + int start_phase; + int next_phase; + + SpinLockAcquire(&barrier->mutex); + start_phase = barrier->phase; + next_phase = start_phase + 1; + ++barrier->arrived; + if (barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + barrier->phase = next_phase; + barrier->elected = next_phase; + } + SpinLockRelease(&barrier->mutex); + + /* + * If we were the last expected participant to arrive, we can release our + * peers and return true to indicate that this backend has been elected to + * perform any serial work. + */ + if (release) + { + ConditionVariableBroadcast(&barrier->condition_variable); + + return true; + } + + /* + * Otherwise we have to wait for the last participant to arrive and + * advance the phase. + */ + elected = false; + ConditionVariablePrepareToSleep(&barrier->condition_variable); + for (;;) + { + /* + * We know that phase must either be start_phase, indicating that we + * need to keep waiting, or next_phase, indicating that the last + * participant that we were waiting for has either arrived or detached + * so that the next phase has begun. The phase cannot advance any + * further than that without this backend's participation, because + * this backend is attached. + */ + SpinLockAcquire(&barrier->mutex); + Assert(barrier->phase == start_phase || barrier->phase == next_phase); + release = barrier->phase == next_phase; + if (release && barrier->elected != next_phase) + { + /* + * Usually the backend that arrives last and releases the other + * backends is elected to return true (see above), so that it can + * begin processing serial work while it has a CPU timeslice. + * However, if the barrier advanced because someone detached, then + * one of the backends that is awoken will need to be elected. + */ + barrier->elected = barrier->phase; + elected = true; + } + SpinLockRelease(&barrier->mutex); + if (release) + break; + ConditionVariableSleep(&barrier->condition_variable, wait_event_info); + } + ConditionVariableCancelSleep(); + + return elected; +} + +/* + * Arrive at this barrier, but detach rather than waiting. Returns true if + * the caller was the last to detach. + */ +bool +BarrierArriveAndDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, true); +} + +/* + * Arrive at a barrier, and detach all but the last to arrive. Returns true if + * the caller was the last to arrive, and is therefore still attached. + */ +bool +BarrierArriveAndDetachExceptLast(Barrier *barrier) +{ + SpinLockAcquire(&barrier->mutex); + if (barrier->participants > 1) + { + --barrier->participants; + SpinLockRelease(&barrier->mutex); + + return false; + } + Assert(barrier->participants == 1); + ++barrier->phase; + SpinLockRelease(&barrier->mutex); + + return true; +} + +/* + * Attach to a barrier. All waiting participants will now wait for this + * participant to call BarrierArriveAndWait(), BarrierDetach() or + * BarrierArriveAndDetach(). Return the current phase. 
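+ *
+ * A newly attached participant typically synchronizes with the returned
+ * phase and then loops on BarrierArriveAndWait(), whose return value can be
+ * used to elect one backend for any serial step.  Roughly (the wait event
+ * and perform_serial_step() are hypothetical stand-ins):
+ *
+ *     phase = BarrierAttach(&barrier);
+ *     ...                     /* catch up with work already done in 'phase' */
+ *     if (BarrierArriveAndWait(&barrier, WAIT_EVENT_SOME_STEP))
+ *         perform_serial_step();      /* elected backend only */
+ *     BarrierDetach(&barrier);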
+ */ +int +BarrierAttach(Barrier *barrier) +{ + int phase; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + ++barrier->participants; + phase = barrier->phase; + SpinLockRelease(&barrier->mutex); + + return phase; +} + +/* + * Detach from a barrier. This may release other waiters from + * BarrierArriveAndWait() and advance the phase if they were only waiting for + * this backend. Return true if this participant was the last to detach. + */ +bool +BarrierDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, false); +} + +/* + * Return the current phase of a barrier. The caller must be attached. + */ +int +BarrierPhase(Barrier *barrier) +{ + /* + * It is OK to read barrier->phase without locking, because it can't + * change without us (we are attached to it), and we executed a memory + * barrier when we either attached or participated in changing it last + * time. + */ + return barrier->phase; +} + +/* + * Return an instantaneous snapshot of the number of participants currently + * attached to this barrier. For debugging purposes only. + */ +int +BarrierParticipants(Barrier *barrier) +{ + int participants; + + SpinLockAcquire(&barrier->mutex); + participants = barrier->participants; + SpinLockRelease(&barrier->mutex); + + return participants; +} + +/* + * Detach from a barrier. If 'arrive' is true then also increment the phase + * if there are no other participants. If there are other participants + * waiting, then the phase will be advanced and they'll be released if they + * were only waiting for the caller. Return true if this participant was the + * last to detach. + */ +static inline bool +BarrierDetachImpl(Barrier *barrier, bool arrive) +{ + bool release; + bool last; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + Assert(barrier->participants > 0); + --barrier->participants; + + /* + * If any other participants are waiting and we were the last participant + * waited for, release them. If no other participants are waiting, but + * this is a BarrierArriveAndDetach() call, then advance the phase too. + */ + if ((arrive || barrier->participants > 0) && + barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + ++barrier->phase; + } + else + release = false; + + last = barrier->participants == 0; + SpinLockRelease(&barrier->mutex); + + if (release) + ConditionVariableBroadcast(&barrier->condition_variable); + + return last; +} diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c new file mode 100644 index 0000000..7e4e278 --- /dev/null +++ b/src/backend/storage/ipc/dsm.c @@ -0,0 +1,1257 @@ +/*------------------------------------------------------------------------- + * + * dsm.c + * manage dynamic shared memory segments + * + * This file provides a set of services to make programming with dynamic + * shared memory segments more convenient. Unlike the low-level + * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments + * created using this module will be cleaned up automatically. Mappings + * will be removed when the resource owner under which they were created + * is cleaned up, unless dsm_pin_mapping() is used, in which case they + * have session lifespan. Segments will be removed when there are no + * remaining mappings, or at postmaster shutdown in any case. After a + * hard postmaster crash, remaining segments will be removed, if they + * still exist, at the next postmaster startup. 
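+ *
+ * A hedged sketch of the typical lifecycle (variable names invented; the
+ * comment above dsm_segment_handle() later in this file describes how the
+ * handle is usually passed around):
+ *
+ *     In the creating backend:
+ *         seg = dsm_create(size, 0);
+ *         handle = dsm_segment_handle(seg);
+ *         ... hand the handle to the cooperating backends ...
+ *
+ *     In each cooperating backend:
+ *         seg = dsm_attach(handle);
+ *         addr = dsm_segment_address(seg);
+ *         ... communicate through the mapping at addr ...
+ *         dsm_detach(seg);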
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#ifndef WIN32 +#include +#endif +#include + +#include "common/pg_prng.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "utils/freepage.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner_private.h" + +#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32 + +#define PG_DYNSHMEM_FIXED_SLOTS 64 +#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5 + +#define INVALID_CONTROL_SLOT ((uint32) -1) + +/* Backend-local tracking for on-detach callbacks. */ +typedef struct dsm_segment_detach_callback +{ + on_dsm_detach_callback function; + Datum arg; + slist_node node; +} dsm_segment_detach_callback; + +/* Backend-local state for a dynamic shared memory segment. */ +struct dsm_segment +{ + dlist_node node; /* List link in dsm_segment_list. */ + ResourceOwner resowner; /* Resource owner. */ + dsm_handle handle; /* Segment name. */ + uint32 control_slot; /* Slot in control segment. */ + void *impl_private; /* Implementation-specific private data. */ + void *mapped_address; /* Mapping address, or NULL if unmapped. */ + Size mapped_size; /* Size of our mapping. */ + slist_head on_detach; /* On-detach callbacks. */ +}; + +/* Shared-memory state for a dynamic shared memory segment. */ +typedef struct dsm_control_item +{ + dsm_handle handle; + uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ + size_t first_page; + size_t npages; + void *impl_private_pm_handle; /* only needed on Windows */ + bool pinned; +} dsm_control_item; + +/* Layout of the dynamic shared memory control segment. */ +typedef struct dsm_control_header +{ + uint32 magic; + uint32 nitems; + uint32 maxitems; + dsm_control_item item[FLEXIBLE_ARRAY_MEMBER]; +} dsm_control_header; + +static void dsm_cleanup_for_mmap(void); +static void dsm_postmaster_shutdown(int code, Datum arg); +static dsm_segment *dsm_create_descriptor(void); +static bool dsm_control_segment_sane(dsm_control_header *control, + Size mapped_size); +static uint64 dsm_control_bytes_needed(uint32 nitems); +static inline dsm_handle make_main_region_dsm_handle(int slot); +static inline bool is_main_region_dsm_handle(dsm_handle handle); + +/* Has this backend initialized the dynamic shared memory system yet? */ +static bool dsm_init_done = false; + +/* Preallocated DSM space in the main shared memory region. */ +static void *dsm_main_space_begin = NULL; + +/* + * List of dynamic shared memory segments used by this backend. + * + * At process exit time, we must decrement the reference count of each + * segment we have attached; this list makes it possible to find all such + * segments. + * + * This list should always be empty in the postmaster. We could probably + * allow the postmaster to map dynamic shared memory segments before it + * begins to start child processes, provided that each process adjusted + * the reference counts for those segments in the control segment at + * startup time, but there's no obvious need for such a facility, which + * would also be complex to handle in the EXEC_BACKEND case. 
Once the + * postmaster has begun spawning children, there's an additional problem: + * each new mapping would require an update to the control segment, + * which requires locking, in which the postmaster must not be involved. + */ +static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list); + +/* + * Control segment information. + * + * Unlike ordinary shared memory segments, the control segment is not + * reference counted; instead, it lasts for the postmaster's entire + * life cycle. For simplicity, it doesn't have a dsm_segment object either. + */ +static dsm_handle dsm_control_handle; +static dsm_control_header *dsm_control; +static Size dsm_control_mapped_size = 0; +static void *dsm_control_impl_private = NULL; + +/* + * Start up the dynamic shared memory system. + * + * This is called just once during each cluster lifetime, at postmaster + * startup time. + */ +void +dsm_postmaster_startup(PGShmemHeader *shim) +{ + void *dsm_control_address = NULL; + uint32 maxitems; + Size segsize; + + Assert(!IsUnderPostmaster); + + /* + * If we're using the mmap implementations, clean up any leftovers. + * Cleanup isn't needed on Windows, and happens earlier in startup for + * POSIX and System V shared memory, via a direct call to + * dsm_cleanup_using_control_segment. + */ + if (dynamic_shared_memory_type == DSM_IMPL_MMAP) + dsm_cleanup_for_mmap(); + + /* Determine size for new control segment. */ + maxitems = PG_DYNSHMEM_FIXED_SLOTS + + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends; + elog(DEBUG2, "dynamic shared memory system will support %u segments", + maxitems); + segsize = dsm_control_bytes_needed(maxitems); + + /* + * Loop until we find an unused identifier for the new control segment. We + * sometimes use DSM_HANDLE_INVALID as a sentinel value indicating "no + * control segment", so avoid generating that value for a real handle. + */ + for (;;) + { + Assert(dsm_control_address == NULL); + Assert(dsm_control_mapped_size == 0); + /* Use even numbers only */ + dsm_control_handle = pg_prng_uint32(&pg_global_prng_state) << 1; + if (dsm_control_handle == DSM_HANDLE_INVALID) + continue; + if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, ERROR)) + break; + } + dsm_control = dsm_control_address; + on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim)); + elog(DEBUG2, + "created dynamic shared memory control segment %u (%zu bytes)", + dsm_control_handle, segsize); + shim->dsm_control = dsm_control_handle; + + /* Initialize control segment. */ + dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC; + dsm_control->nitems = 0; + dsm_control->maxitems = maxitems; +} + +/* + * Determine whether the control segment from the previous postmaster + * invocation still exists. If so, remove the dynamic shared memory + * segments to which it refers, and then the control segment itself. + */ +void +dsm_cleanup_using_control_segment(dsm_handle old_control_handle) +{ + void *mapped_address = NULL; + void *junk_mapped_address = NULL; + void *impl_private = NULL; + void *junk_impl_private = NULL; + Size mapped_size = 0; + Size junk_mapped_size = 0; + uint32 nitems; + uint32 i; + dsm_control_header *old_control; + + /* + * Try to attach the segment. If this fails, it probably just means that + * the operating system has been rebooted and the segment no longer + * exists, or an unrelated process has used the same shm ID. So just fall + * out quietly. 
+ */ + if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + return; + + /* + * We've managed to reattach it, but the contents might not be sane. If + * they aren't, we disregard the segment after all. + */ + old_control = (dsm_control_header *) mapped_address; + if (!dsm_control_segment_sane(old_control, mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); + return; + } + + /* + * OK, the control segment looks basically valid, so we can use it to get + * a list of segments that need to be removed. + */ + nitems = old_control->nitems; + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + uint32 refcnt; + + /* If the reference count is 0, the slot is actually unused. */ + refcnt = old_control->item[i].refcnt; + if (refcnt == 0) + continue; + + /* If it was using the main shmem area, there is nothing to do. */ + handle = old_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)", + handle, refcnt); + + /* Destroy the referenced segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Destroy the old control segment, too. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + old_control_handle); + dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); +} + +/* + * When we're using the mmap shared memory implementation, "shared memory" + * segments might even manage to survive an operating system reboot. + * But there's no guarantee as to exactly what will survive: some segments + * may survive, and others may not, and the contents of some may be out + * of date. In particular, the control segment may be out of date, so we + * can't rely on it to figure out what to remove. However, since we know + * what directory contains the files we used as shared memory, we can simply + * scan the directory and blow everything away that shouldn't be there. + */ +static void +dsm_cleanup_for_mmap(void) +{ + DIR *dir; + struct dirent *dent; + + /* Scan the directory for something with a name of the correct format. */ + dir = AllocateDir(PG_DYNSHMEM_DIR); + + while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL) + { + if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX, + strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0) + { + char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)]; + + snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name); + + elog(DEBUG2, "removing file \"%s\"", buf); + + /* We found a matching file; so remove it. */ + if (unlink(buf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", buf))); + } + } + + /* Cleanup complete. */ + FreeDir(dir); +} + +/* + * At shutdown time, we iterate over the control segment and remove all + * remaining dynamic shared memory segments. We avoid throwing errors here; + * the postmaster is shutting down either way, and this is just non-critical + * resource cleanup. 
+ */ +static void +dsm_postmaster_shutdown(int code, Datum arg) +{ + uint32 nitems; + uint32 i; + void *dsm_control_address; + void *junk_mapped_address = NULL; + void *junk_impl_private = NULL; + Size junk_mapped_size = 0; + PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg); + + /* + * If some other backend exited uncleanly, it might have corrupted the + * control segment while it was dying. In that case, we warn and ignore + * the contents of the control segment. This may end up leaving behind + * stray shared memory segments, but there's not much we can do about that + * if the metadata is gone. + */ + nitems = dsm_control->nitems; + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + ereport(LOG, + (errmsg("dynamic shared memory control segment is corrupt"))); + return; + } + + /* Remove any remaining segments. */ + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + + /* If the reference count is 0, the slot is actually unused. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + handle = dsm_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u", + handle); + + /* Destroy the segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Remove the control segment itself. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + dsm_control_handle); + dsm_control_address = dsm_control; + dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, LOG); + dsm_control = dsm_control_address; + shim->dsm_control = 0; +} + +/* + * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND, + * we must reread the state file and map the control segment; in other cases, + * we'll have inherited the postmaster's mapping and global variables. + */ +static void +dsm_backend_startup(void) +{ +#ifdef EXEC_BACKEND + if (IsUnderPostmaster) + { + void *control_address = NULL; + + /* Attach control segment. */ + Assert(dsm_control_handle != 0); + dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); + dsm_control = control_address; + /* If control segment doesn't look sane, something is badly wrong. */ + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, WARNING); + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("dynamic shared memory control segment is not valid"))); + } + } +#endif + + dsm_init_done = true; +} + +#ifdef EXEC_BACKEND +/* + * When running under EXEC_BACKEND, we get a callback here when the main + * shared memory segment is re-attached, so that we can record the control + * handle retrieved from it. + */ +void +dsm_set_control_handle(dsm_handle h) +{ + Assert(dsm_control_handle == 0 && h != 0); + dsm_control_handle = h; +} +#endif + +/* + * Reserve some space in the main shared memory segment for DSM segments. + */ +size_t +dsm_estimate_size(void) +{ + return 1024 * 1024 * (size_t) min_dynamic_shared_memory; +} + +/* + * Initialize space in the main shared memory segment for DSM segments. 
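+ *
+ * As a worked example (assuming the GUC is, as usual, specified in
+ * megabytes): min_dynamic_shared_memory = 256 makes dsm_estimate_size()
+ * above report 256 * 1024 * 1024 = 268435456 bytes, all of which is set up
+ * here and handed to a FreePageManager, less the few pages occupied by the
+ * FreePageManager itself.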
+ */ +void +dsm_shmem_init(void) +{ + size_t size = dsm_estimate_size(); + bool found; + + if (size == 0) + return; + + dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found); + if (!found) + { + FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin; + size_t first_page = 0; + size_t pages; + + /* Reserve space for the FreePageManager. */ + while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager)) + ++first_page; + + /* Initialize it and give it all the rest of the space. */ + FreePageManagerInitialize(fpm, dsm_main_space_begin); + pages = (size / FPM_PAGE_SIZE) - first_page; + FreePageManagerPut(fpm, first_page, pages); + } +} + +/* + * Create a new dynamic shared memory segment. + * + * If there is a non-NULL CurrentResourceOwner, the new segment is associated + * with it and must be detached before the resource owner releases, or a + * warning will be logged. If CurrentResourceOwner is NULL, the segment + * remains attached until explicitly detached or the session ends. + * Creating with a NULL CurrentResourceOwner is equivalent to creating + * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping. + */ +dsm_segment * +dsm_create(Size size, int flags) +{ + dsm_segment *seg; + uint32 i; + uint32 nitems; + size_t npages = 0; + size_t first_page = 0; + FreePageManager *dsm_main_space_fpm = dsm_main_space_begin; + bool using_main_dsm_region = false; + + /* + * Unsafe in postmaster. It might seem pointless to allow use of dsm in + * single user mode, but otherwise some subsystems will need dedicated + * single user mode code paths. + */ + Assert(IsUnderPostmaster || !IsPostmasterEnvironment); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + + /* + * Lock the control segment while we try to allocate from the main shared + * memory area, if configured. + */ + if (dsm_main_space_fpm) + { + npages = size / FPM_PAGE_SIZE; + if (size % FPM_PAGE_SIZE > 0) + ++npages; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page)) + { + /* We can carve out a piece of the main shared memory segment. */ + seg->mapped_address = (char *) dsm_main_space_begin + + first_page * FPM_PAGE_SIZE; + seg->mapped_size = npages * FPM_PAGE_SIZE; + using_main_dsm_region = true; + /* We'll choose a handle below. */ + } + } + + if (!using_main_dsm_region) + { + /* + * We need to create a new memory segment. Loop until we find an + * unused segment identifier. + */ + if (dsm_main_space_fpm) + LWLockRelease(DynamicSharedMemoryControlLock); + for (;;) + { + Assert(seg->mapped_address == NULL && seg->mapped_size == 0); + /* Use even numbers only */ + seg->handle = pg_prng_uint32(&pg_global_prng_state) << 1; + if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */ + continue; + if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR)) + break; + } + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + } + + /* Search the control segment for an unused slot. 
*/ + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + if (dsm_control->item[i].refcnt == 0) + { + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(i); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + else + Assert(!is_main_region_dsm_handle(seg->handle)); + dsm_control->item[i].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[i].refcnt = 2; + dsm_control->item[i].impl_private_pm_handle = NULL; + dsm_control->item[i].pinned = false; + seg->control_slot = i; + LWLockRelease(DynamicSharedMemoryControlLock); + return seg; + } + } + + /* Verify that we can support an additional mapping. */ + if (nitems >= dsm_control->maxitems) + { + if (using_main_dsm_region) + FreePageManagerPut(dsm_main_space_fpm, first_page, npages); + LWLockRelease(DynamicSharedMemoryControlLock); + if (!using_main_dsm_region) + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); + + if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) + return NULL; + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("too many dynamic shared memory segments"))); + } + + /* Enter the handle into a new array slot. */ + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(nitems); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + dsm_control->item[nitems].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[nitems].refcnt = 2; + dsm_control->item[nitems].impl_private_pm_handle = NULL; + dsm_control->item[nitems].pinned = false; + seg->control_slot = nitems; + dsm_control->nitems++; + LWLockRelease(DynamicSharedMemoryControlLock); + + return seg; +} + +/* + * Attach a dynamic shared memory segment. + * + * See comments for dsm_segment_handle() for an explanation of how this + * is intended to be used. + * + * This function will return NULL if the segment isn't known to the system. + * This can happen if we're asked to attach the segment, but then everyone + * else detaches it (causing it to be destroyed) before we get around to + * attaching it. + * + * If there is a non-NULL CurrentResourceOwner, the attached segment is + * associated with it and must be detached before the resource owner releases, + * or a warning will be logged. Otherwise the segment remains attached until + * explicitly detached or the session ends. See the note atop dsm_create(). + */ +dsm_segment * +dsm_attach(dsm_handle h) +{ + dsm_segment *seg; + dlist_iter iter; + uint32 i; + uint32 nitems; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* + * Since this is just a debugging cross-check, we could leave it out + * altogether, or include it only in assert-enabled builds. But since the + * list of attached segments should normally be very short, let's include + * it always for right now. + * + * If you're hitting this error, you probably want to attempt to find an + * existing mapping via dsm_find_mapping() before calling dsm_attach() to + * create a new one. 
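+	 *
+	 * That is, roughly (sketch only):
+	 *
+	 *     seg = dsm_find_mapping(handle);
+	 *     if (seg == NULL)
+	 *         seg = dsm_attach(handle);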
+ */ + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + elog(ERROR, "can't attach the same segment more than once"); + } + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + seg->handle = h; + + /* Bump reference count for this segment in shared memory. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + /* + * If the reference count is 0, the slot is actually unused. If the + * reference count is 1, the slot is still in use, but the segment is + * in the process of going away; even if the handle matches, another + * slot may already have started using the same handle value by + * coincidence so we have to keep searching. + */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If the handle doesn't match, it's not the slot we want. */ + if (dsm_control->item[i].handle != seg->handle) + continue; + + /* Otherwise we've found a match. */ + dsm_control->item[i].refcnt++; + seg->control_slot = i; + if (is_main_region_dsm_handle(seg->handle)) + { + seg->mapped_address = (char *) dsm_main_space_begin + + dsm_control->item[i].first_page * FPM_PAGE_SIZE; + seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE; + } + break; + } + LWLockRelease(DynamicSharedMemoryControlLock); + + /* + * If we didn't find the handle we're looking for in the control segment, + * it probably means that everyone else who had it mapped, including the + * original creator, died before we got to this point. It's up to the + * caller to decide what to do about that. + */ + if (seg->control_slot == INVALID_CONTROL_SLOT) + { + dsm_detach(seg); + return NULL; + } + + /* Here's where we actually try to map the segment. */ + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + + return seg; +} + +/* + * At backend shutdown time, detach any segments that are still attached. + * (This is similar to dsm_detach_all, except that there's no reason to + * unmap the control segment before exiting, so we don't bother.) + */ +void +dsm_backend_shutdown(void) +{ + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } +} + +/* + * Detach all shared memory segments, including the control segments. This + * should be called, along with PGSharedMemoryDetach, in processes that + * might inherit mappings but are not intended to be connected to dynamic + * shared memory. + */ +void +dsm_detach_all(void) +{ + void *control_address = dsm_control; + + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } + + if (control_address != NULL) + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); +} + +/* + * Detach from a shared memory segment, destroying the segment if we + * remove the last reference. + * + * This function should never fail. It will often be invoked when aborting + * a transaction, and a further error won't serve any purpose. It's not a + * complete disaster if we fail to unmap or destroy the segment; it means a + * resource leak, but that doesn't necessarily preclude further operations. 
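+ *
+ * (Illustrative note, with invented names: a callback registered earlier via
+ *
+ *     on_dsm_detach(seg, cleanup_my_state, PointerGetDatum(my_state));
+ *
+ * is invoked here, most recently registered first, before the mapping is
+ * removed.)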
+ */ +void +dsm_detach(dsm_segment *seg) +{ + /* + * Invoke registered callbacks. Just in case one of those callbacks + * throws a further error that brings us back here, pop the callback + * before invoking it, to avoid infinite error recursion. Don't allow + * interrupts while running the individual callbacks in non-error code + * paths, to avoid leaving cleanup work unfinished if we're interrupted by + * a statement timeout or similar. + */ + HOLD_INTERRUPTS(); + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + on_dsm_detach_callback function; + Datum arg; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + function = cb->function; + arg = cb->arg; + pfree(cb); + + function(seg, arg); + } + RESUME_INTERRUPTS(); + + /* + * Try to remove the mapping, if one exists. Normally, there will be, but + * maybe not, if we failed partway through a create or attach operation. + * We remove the mapping before decrementing the reference count so that + * the process that sees a zero reference count can be certain that no + * remaining mappings exist. Even if this fails, we pretend that it + * works, because retrying is likely to fail in the same way. + */ + if (seg->mapped_address != NULL) + { + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + } + + /* Reduce reference count, if we previously increased it. */ + if (seg->control_slot != INVALID_CONTROL_SLOT) + { + uint32 refcnt; + uint32 control_slot = seg->control_slot; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt > 1); + refcnt = --dsm_control->item[control_slot].refcnt; + seg->control_slot = INVALID_CONTROL_SLOT; + LWLockRelease(DynamicSharedMemoryControlLock); + + /* If new reference count is 1, try to destroy the segment. */ + if (refcnt == 1) + { + /* A pinned segment should never reach 1. */ + Assert(!dsm_control->item[control_slot].pinned); + + /* + * If we fail to destroy the segment here, or are killed before we + * finish doing so, the reference count will remain at 1, which + * will mean that nobody else can attach to the segment. At + * postmaster shutdown time, or when a new postmaster is started + * after a hard kill, another attempt will be made to remove the + * segment. + * + * The main case we're worried about here is being killed by a + * signal before we can finish removing the segment. In that + * case, it's important to be sure that the segment still gets + * removed. If we actually fail to remove the segment for some + * other reason, the postmaster may not have any better luck than + * we did. There's not much we can do about that, though. 
+ */ + if (is_main_region_dsm_handle(seg->handle) || + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(seg->handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } + } + + /* Clean up our remaining backend-private data structures. */ + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); +} + +/* + * Keep a dynamic shared memory mapping until end of session. + * + * By default, mappings are owned by the current resource owner, which + * typically means they stick around for the duration of the current query + * only. + */ +void +dsm_pin_mapping(dsm_segment *seg) +{ + if (seg->resowner != NULL) + { + ResourceOwnerForgetDSM(seg->resowner, seg); + seg->resowner = NULL; + } +} + +/* + * Arrange to remove a dynamic shared memory mapping at cleanup time. + * + * dsm_pin_mapping() can be used to preserve a mapping for the entire + * lifetime of a process; this function reverses that decision, making + * the segment owned by the current resource owner. This may be useful + * just before performing some operation that will invalidate the segment + * for future use by this backend. + */ +void +dsm_unpin_mapping(dsm_segment *seg) +{ + Assert(seg->resowner == NULL); + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + seg->resowner = CurrentResourceOwner; + ResourceOwnerRememberDSM(seg->resowner, seg); +} + +/* + * Keep a dynamic shared memory segment until postmaster shutdown, or until + * dsm_unpin_segment is called. + * + * This function should not be called more than once per segment, unless the + * segment is explicitly unpinned with dsm_unpin_segment in between calls. + * + * Note that this function does not arrange for the current process to + * keep the segment mapped indefinitely; if that behavior is desired, + * dsm_pin_mapping() should be used from each process that needs to + * retain the mapping. + */ +void +dsm_pin_segment(dsm_segment *seg) +{ + void *handle = NULL; + + /* + * Bump reference count for this segment in shared memory. This will + * ensure that even if there is no session which is attached to this + * segment, it will remain until postmaster shutdown or an explicit call + * to unpin. + */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (dsm_control->item[seg->control_slot].pinned) + elog(ERROR, "cannot pin a segment that is already pinned"); + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle); + dsm_control->item[seg->control_slot].pinned = true; + dsm_control->item[seg->control_slot].refcnt++; + dsm_control->item[seg->control_slot].impl_private_pm_handle = handle; + LWLockRelease(DynamicSharedMemoryControlLock); +} + +/* + * Unpin a dynamic shared memory segment that was previously pinned with + * dsm_pin_segment. This function should not be called unless dsm_pin_segment + * was previously called for this segment. + * + * The argument is a dsm_handle rather than a dsm_segment in case you want + * to unpin a segment to which you haven't attached. 
This turns out to be + * useful if, for example, a reference to one shared memory segment is stored + * within another shared memory segment. You might want to unpin the + * referenced segment before destroying the referencing segment. + */ +void +dsm_unpin_segment(dsm_handle handle) +{ + uint32 control_slot = INVALID_CONTROL_SLOT; + bool destroy = false; + uint32 i; + + /* Find the control slot for the given handle. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + for (i = 0; i < dsm_control->nitems; ++i) + { + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If we've found our handle, we can stop searching. */ + if (dsm_control->item[i].handle == handle) + { + control_slot = i; + break; + } + } + + /* + * We should definitely have found the slot, and it should not already be + * in the process of going away, because this function should only be + * called on a segment which is pinned. + */ + if (control_slot == INVALID_CONTROL_SLOT) + elog(ERROR, "cannot unpin unknown segment handle"); + if (!dsm_control->item[control_slot].pinned) + elog(ERROR, "cannot unpin a segment that is not pinned"); + Assert(dsm_control->item[control_slot].refcnt > 1); + + /* + * Allow implementation-specific code to run. We have to do this before + * releasing the lock, because impl_private_pm_handle may get modified by + * dsm_impl_unpin_segment. + */ + if (!is_main_region_dsm_handle(handle)) + dsm_impl_unpin_segment(handle, + &dsm_control->item[control_slot].impl_private_pm_handle); + + /* Note that 1 means no references (0 means unused slot). */ + if (--dsm_control->item[control_slot].refcnt == 1) + destroy = true; + dsm_control->item[control_slot].pinned = false; + + /* Now we can release the lock. */ + LWLockRelease(DynamicSharedMemoryControlLock); + + /* Clean up resources if that was the last reference. */ + if (destroy) + { + void *junk_impl_private = NULL; + void *junk_mapped_address = NULL; + Size junk_mapped_size = 0; + + /* + * For an explanation of how error handling works in this case, see + * comments in dsm_detach. Note that if we reach this point, the + * current process certainly does not have the segment mapped, because + * if it did, the reference count would have still been greater than 1 + * even after releasing the reference count held by the pin. The fact + * that there can't be a dsm_segment for this handle makes it OK to + * pass the mapped size, mapped address, and private data as NULL + * here. + */ + if (is_main_region_dsm_handle(handle) || + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } +} + +/* + * Find an existing mapping for a shared memory segment, if there is one. 
+ */ +dsm_segment * +dsm_find_mapping(dsm_handle handle) +{ + dlist_iter iter; + dsm_segment *seg; + + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == handle) + return seg; + } + + return NULL; +} + +/* + * Get the address at which a dynamic shared memory segment is mapped. + */ +void * +dsm_segment_address(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_address; +} + +/* + * Get the size of a mapping. + */ +Size +dsm_segment_map_length(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_size; +} + +/* + * Get a handle for a mapping. + * + * To establish communication via dynamic shared memory between two backends, + * one of them should first call dsm_create() to establish a new shared + * memory mapping. That process should then call dsm_segment_handle() to + * obtain a handle for the mapping, and pass that handle to the + * coordinating backend via some means (e.g. bgw_main_arg, or via the + * main shared memory segment). The recipient, once in possession of the + * handle, should call dsm_attach(). + */ +dsm_handle +dsm_segment_handle(dsm_segment *seg) +{ + return seg->handle; +} + +/* + * Register an on-detach callback for a dynamic shared memory segment. + */ +void +on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg) +{ + dsm_segment_detach_callback *cb; + + cb = MemoryContextAlloc(TopMemoryContext, + sizeof(dsm_segment_detach_callback)); + cb->function = function; + cb->arg = arg; + slist_push_head(&seg->on_detach, &cb->node); +} + +/* + * Unregister an on-detach callback for a dynamic shared memory segment. + */ +void +cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, + Datum arg) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &seg->on_detach) + { + dsm_segment_detach_callback *cb; + + cb = slist_container(dsm_segment_detach_callback, node, iter.cur); + if (cb->function == function && cb->arg == arg) + { + slist_delete_current(&iter); + pfree(cb); + break; + } + } +} + +/* + * Discard all registered on-detach callbacks without executing them. + */ +void +reset_on_dsm_detach(void) +{ + dlist_iter iter; + + dlist_foreach(iter, &dsm_segment_list) + { + dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur); + + /* Throw away explicit on-detach actions one by one. */ + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + pfree(cb); + } + + /* + * Decrementing the reference count is a sort of implicit on-detach + * action; make sure we don't do that, either. + */ + seg->control_slot = INVALID_CONTROL_SLOT; + } +} + +/* + * Create a segment descriptor. + */ +static dsm_segment * +dsm_create_descriptor(void) +{ + dsm_segment *seg; + + if (CurrentResourceOwner) + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + + seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment)); + dlist_push_head(&dsm_segment_list, &seg->node); + + /* seg->handle must be initialized by the caller */ + seg->control_slot = INVALID_CONTROL_SLOT; + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + + seg->resowner = CurrentResourceOwner; + if (CurrentResourceOwner) + ResourceOwnerRememberDSM(CurrentResourceOwner, seg); + + slist_init(&seg->on_detach); + + return seg; +} + +/* + * Sanity check a control segment. 
+ * + * The goal here isn't to detect everything that could possibly be wrong with + * the control segment; there's not enough information for that. Rather, the + * goal is to make sure that someone can iterate over the items in the segment + * without overrunning the end of the mapping and crashing. We also check + * the magic number since, if that's messed up, this may not even be one of + * our segments at all. + */ +static bool +dsm_control_segment_sane(dsm_control_header *control, Size mapped_size) +{ + if (mapped_size < offsetof(dsm_control_header, item)) + return false; /* Mapped size too short to read header. */ + if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC) + return false; /* Magic number doesn't match. */ + if (dsm_control_bytes_needed(control->maxitems) > mapped_size) + return false; /* Max item count won't fit in map. */ + if (control->nitems > control->maxitems) + return false; /* Overfull. */ + return true; +} + +/* + * Compute the number of control-segment bytes needed to store a given + * number of items. + */ +static uint64 +dsm_control_bytes_needed(uint32 nitems) +{ + return offsetof(dsm_control_header, item) + + sizeof(dsm_control_item) * (uint64) nitems; +} + +static inline dsm_handle +make_main_region_dsm_handle(int slot) +{ + dsm_handle handle; + + /* + * We need to create a handle that doesn't collide with any existing extra + * segment created by dsm_impl_op(), so we'll make it odd. It also + * mustn't collide with any other main area pseudo-segment, so we'll + * include the slot number in some of the bits. We also want to make an + * effort to avoid newly created and recently destroyed handles from being + * confused, so we'll make the rest of the bits random. + */ + handle = 1; + handle |= slot << 1; + handle |= pg_prng_uint32(&pg_global_prng_state) << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1); + return handle; +} + +static inline bool +is_main_region_dsm_handle(dsm_handle handle) +{ + return handle & 1; +} diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c new file mode 100644 index 0000000..6399fa2 --- /dev/null +++ b/src/backend/storage/ipc/dsm_impl.c @@ -0,0 +1,1053 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.c + * manage dynamic shared memory segments + * + * This file provides low-level APIs for creating and destroying shared + * memory segments using several different possible techniques. We refer + * to these segments as dynamic because they can be created, altered, and + * destroyed at any point during the server life cycle. This is unlike + * the main shared memory segment, of which there is always exactly one + * and which is always mapped at a fixed address in every PostgreSQL + * background process. + * + * Because not all systems provide the same primitives in this area, nor + * do all primitives behave the same way on all systems, we provide + * several implementations of this facility. Many systems implement + * POSIX shared memory (shm_open etc.), which is well-suited to our needs + * in this area, with the exception that shared memory identifiers live + * in a flat system-wide namespace, raising the uncomfortable prospect of + * name collisions with other processes (including other copies of + * PostgreSQL) running on the same system. Some systems only support + * the older System V shared memory interface (shmget etc.) 
which is + * also usable; however, the default allocation limits are often quite + * small, and the namespace is even more restricted. + * + * We also provide an mmap-based shared memory implementation. This may + * be useful on systems that provide shared memory via a special-purpose + * filesystem; by opting for this implementation, the user can even + * control precisely where their shared memory segments are placed. It + * can also be used as a fallback for systems where shm_open and shmget + * are not available or can't be used for some reason. Of course, + * mapping a file residing on an actual spinning disk is a fairly poor + * approximation for shared memory because writeback may hurt performance + * substantially, but there should be few systems where we must make do + * with such poor tools. + * + * As ever, Windows requires its own implementation. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm_impl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include +#include +#include +#ifndef WIN32 +#include +#include +#include +#include +#endif + +#include "common/file_perm.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/mem.h" +#include "postmaster/postmaster.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#ifdef USE_DSM_POSIX +static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +static int dsm_impl_posix_resize(int fd, off_t size); +#endif +#ifdef USE_DSM_SYSV +static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_WINDOWS +static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_MMAP +static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +static int errcode_for_dynamic_shared_memory(void); + +const struct config_enum_entry dynamic_shared_memory_options[] = { +#ifdef USE_DSM_POSIX + {"posix", DSM_IMPL_POSIX, false}, +#endif +#ifdef USE_DSM_SYSV + {"sysv", DSM_IMPL_SYSV, false}, +#endif +#ifdef USE_DSM_WINDOWS + {"windows", DSM_IMPL_WINDOWS, false}, +#endif +#ifdef USE_DSM_MMAP + {"mmap", DSM_IMPL_MMAP, false}, +#endif + {NULL, 0, false} +}; + +/* Implementation selector. */ +int dynamic_shared_memory_type = DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE; + +/* Amount of space reserved for DSM segments in the main area. */ +int min_dynamic_shared_memory; + +/* Size of buffer to be used for zero-filling. */ +#define ZBUFFER_SIZE 8192 + +#define SEGMENT_NAME_PREFIX "Global/PostgreSQL" + +/*------ + * Perform a low-level shared memory operation in a platform-specific way, + * as dictated by the selected implementation. Each implementation is + * required to implement the following primitives. + * + * DSM_OP_CREATE. Create a segment whose size is the request_size and + * map it. + * + * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. + * + * DSM_OP_DETACH. Unmap the segment. 
+ * + * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the + * segment. + * + * Arguments: + * op: The operation to be performed. + * handle: The handle of an existing object, or for DSM_OP_CREATE, the + * a new handle the caller wants created. + * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. + * impl_private: Private, implementation-specific data. Will be a pointer + * to NULL for the first operation on a shared memory segment within this + * backend; thereafter, it will point to the value to which it was set + * on the previous call. + * mapped_address: Pointer to start of current mapping; pointer to NULL + * if none. Updated with new mapping address. + * mapped_size: Pointer to size of current mapping; pointer to 0 if none. + * Updated with new mapped size. + * elevel: Level at which to log errors. + * + * Return value: true on success, false on failure. When false is returned, + * a message should first be logged at the specified elevel, except in the + * case where DSM_OP_CREATE experiences a name collision, which should + * silently return false. + *----- + */ +bool +dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + Assert(op == DSM_OP_CREATE || request_size == 0); + Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || + (*mapped_address == NULL && *mapped_size == 0)); + + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_POSIX + case DSM_IMPL_POSIX: + return dsm_impl_posix(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_SYSV + case DSM_IMPL_SYSV: + return dsm_impl_sysv(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + return dsm_impl_windows(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_MMAP + case DSM_IMPL_MMAP: + return dsm_impl_mmap(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif + default: + elog(ERROR, "unexpected dynamic shared memory type: %d", + dynamic_shared_memory_type); + return false; + } +} + +#ifdef USE_DSM_POSIX +/* + * Operating system primitives to support POSIX shared memory. + * + * POSIX shared memory segments are created and attached using shm_open() + * and shm_unlink(); other operations, such as sizing or mapping the + * segment, are performed as if the shared memory segments were files. + * + * Indeed, on some platforms, they may be implemented that way. While + * POSIX shared memory segments seem intended to exist in a flat namespace, + * some operating systems may implement them as files, even going so far + * to treat a request for /xyz as a request to create a file by that name + * in the root directory. Users of such broken platforms should select + * a different shared memory implementation. + */ +static bool +dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, "/PostgreSQL.%u", handle); + + /* Handle teardown cases. 
*/ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* + * Create new segment or open an existing one for attach. + * + * Even though we will close the FD before returning, it seems desirable + * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE + * failure. The fact that we won't hold the FD open long justifies using + * ReserveExternalFD rather than AcquireExternalFD, though. + */ + ReserveExternalFD(); + + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) + { + ReleaseExternalFD(); + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else if (dsm_impl_posix_resize(fd, request_size) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + close(fd); + ReleaseExternalFD(); + + return true; +} + +/* + * Set the size of a virtual memory region associated with a file descriptor. + * If necessary, also ensure that virtual memory is actually allocated by the + * operating system, to avoid nasty surprises later. + * + * Returns non-zero if either truncation or allocation fails, and sets errno. + */ +static int +dsm_impl_posix_resize(int fd, off_t size) +{ + int rc; + int save_errno; + sigset_t save_sigmask; + + /* + * Block all blockable signals, except SIGQUIT. posix_fallocate() can run + * for quite a long time, and is an all-or-nothing operation. 
If we + * allowed SIGUSR1 to interrupt us repeatedly (for example, due to + * recovery conflicts), the retry loop might never succeed. + */ + if (IsUnderPostmaster) + sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask); + + pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE); +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + + /* + * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use + * ftruncate, the file would contain a hole. Accessing memory backed by a + * hole causes tmpfs to allocate pages, which fails with SIGBUS if there + * is no more tmpfs space available. So we ask tmpfs to allocate pages + * here, so we can fail gracefully with ENOSPC now rather than risking + * SIGBUS later. + * + * We still use a traditional EINTR retry loop to handle SIGCONT. + * posix_fallocate() doesn't restart automatically, and we don't want this + * to fail if you attach a debugger. + */ + do + { + rc = posix_fallocate(fd, 0, size); + } while (rc == EINTR); + + /* + * The caller expects errno to be set, but posix_fallocate() doesn't set + * it. Instead it returns error numbers directly. So set errno, even + * though we'll also return rc to indicate success or failure. + */ + errno = rc; +#else + /* Extend the file to the requested size. */ + do + { + rc = ftruncate(fd, size); + } while (rc < 0 && errno == EINTR); +#endif + pgstat_report_wait_end(); + + if (IsUnderPostmaster) + { + save_errno = errno; + sigprocmask(SIG_SETMASK, &save_sigmask, NULL); + errno = save_errno; + } + + return rc; +} + +#endif /* USE_DSM_POSIX */ + +#ifdef USE_DSM_SYSV +/* + * Operating system primitives to support System V shared memory. + * + * System V shared memory segments are manipulated using shmget(), shmat(), + * shmdt(), and shmctl(). As the default allocation limits for System V + * shared memory are usually quite low, the POSIX facilities may be + * preferable; but those are not supported everywhere. + */ +static bool +dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + key_t key; + int ident; + char *address; + char name[64]; + int *ident_cache; + + /* + * POSIX shared memory and mmap-based shared memory identify segments with + * names. To avoid needless error message variation, we use the handle as + * the name. + */ + snprintf(name, 64, "%u", handle); + + /* + * The System V shared memory namespace is very restricted; names are of + * type key_t, which is expected to be some sort of integer data type, but + * not necessarily the same one as dsm_handle. Since we use dsm_handle to + * identify shared memory segments across processes, this might seem like + * a problem, but it's really not. If dsm_handle is bigger than key_t, + * the cast below might truncate away some bits from the handle the + * user-provided, but it'll truncate exactly the same bits away in exactly + * the same fashion every time we use that handle, which is all that + * really matters. Conversely, if dsm_handle is smaller than key_t, we + * won't use the full range of available key space, but that's no big deal + * either. + * + * We do make sure that the key isn't negative, because that might not be + * portable. + */ + key = (key_t) handle; + if (key < 1) /* avoid compiler warning if type is unsigned */ + key = -key; + + /* + * There's one special key, IPC_PRIVATE, which can't be used. If we end + * up with that value by chance during a create operation, just pretend it + * already exists, so that caller will retry. 
If we run into it anywhere + * else, the caller has passed a handle that doesn't correspond to + * anything we ever created, which should not happen. + */ + if (key == IPC_PRIVATE) + { + if (op != DSM_OP_CREATE) + elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); + errno = EEXIST; + return false; + } + + /* + * Before we can do anything with a shared memory segment, we have to map + * the shared memory key to a shared memory identifier using shmget(). To + * avoid repeated lookups, we store the key using impl_private. + */ + if (*impl_private != NULL) + { + ident_cache = *impl_private; + ident = *ident_cache; + } + else + { + int flags = IPCProtection; + size_t segsize; + + /* + * Allocate the memory BEFORE acquiring the resource, so that we don't + * leak the resource if memory allocation fails. + */ + ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); + + /* + * When using shmget to find an existing segment, we must pass the + * size as 0. Passing a non-zero size which is greater than the + * actual size will result in EINVAL. + */ + segsize = 0; + + if (op == DSM_OP_CREATE) + { + flags |= IPC_CREAT | IPC_EXCL; + segsize = request_size; + } + + if ((ident = shmget(key, segsize, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + { + int save_errno = errno; + + pfree(ident_cache); + errno = save_errno; + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not get shared memory segment: %m"))); + } + return false; + } + + *ident_cache = ident; + *impl_private = ident_cache; + } + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + pfree(ident_cache); + *impl_private = NULL; + if (*mapped_address != NULL && shmdt(*mapped_address) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* If we're attaching it, we must use IPC_STAT to determine the size. */ + if (op == DSM_OP_ATTACH) + { + struct shmid_ds shm; + + if (shmctl(ident, IPC_STAT, &shm) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = shm.shm_segsz; + } + + /* Map it. */ + address = shmat(ident, NULL, PG_SHMAT_FLAGS); + if (address == (void *) -1) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + return true; +} +#endif + +#ifdef USE_DSM_WINDOWS +/* + * Operating system primitives to support Windows shared memory. + * + * Windows shared memory implementation is done using file mapping + * which can be backed by either physical file or system paging file. + * Current implementation uses system paging file as other effects + * like performance are not clear for physical file and it is used in similar + * way for main shared memory in windows. 
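+ *
+ * Illustration only (hypothetical handle value, error checking omitted):
+ * because segments are identified purely by name, a cooperating backend
+ * that knows the dsm_handle can rebuild the mapping-object name and
+ * attach with the same calls used in the code below, roughly
+ *
+ *		char	name[64];
+ *		HANDLE	hmap;
+ *		void   *address;
+ *
+ *		snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+ *		hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, FALSE, name);
+ *		address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, 0, 0, 0);
+ *
+ * The real code below adds the failure handling and backout steps.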
+ * + * A memory mapping object is a kernel object - they always get deleted when + * the last reference to them goes away, either explicitly via a CloseHandle or + * when the process containing the reference exits. + */ +static bool +dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel) +{ + char *address; + HANDLE hmap; + char name[64]; + MEMORY_BASIC_INFORMATION info; + + /* + * Storing the shared memory segment in the Global\ namespace, can allow + * any process running in any session to access that file mapping object + * provided that the caller has the required access rights. But to avoid + * issues faced in main shared memory, we are using the naming convention + * similar to main shared memory. We can change here once issue mentioned + * in GetSharedMemName is resolved. + */ + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + + /* + * Handle teardown cases. Since Windows automatically destroys the object + * when no references remain, we can treat it the same as detach. + */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && UnmapViewOfFile(*mapped_address) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + if (*impl_private != NULL + && CloseHandle(*impl_private) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + + *impl_private = NULL; + *mapped_address = NULL; + *mapped_size = 0; + return true; + } + + /* Create new segment or open an existing one for attach. */ + if (op == DSM_OP_CREATE) + { + DWORD size_high; + DWORD size_low; + DWORD errcode; + + /* Shifts >= the width of the type are undefined. */ +#ifdef _WIN64 + size_high = request_size >> 32; +#else + size_high = 0; +#endif + size_low = (DWORD) request_size; + + /* CreateFileMapping might not clear the error code on success */ + SetLastError(0); + + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + PAGE_READWRITE, /* Memory is read/write */ + size_high, /* Upper 32 bits of size */ + size_low, /* Lower 32 bits of size */ + name); + + errcode = GetLastError(); + if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) + { + /* + * On Windows, when the segment already exists, a handle for the + * existing segment is returned. We must close it before + * returning. However, if the existing segment is created by a + * service, then it returns ERROR_ACCESS_DENIED. We don't do + * _dosmaperr here, so errno won't be modified. + */ + if (hmap) + CloseHandle(hmap); + return false; + } + + if (!hmap) + { + _dosmaperr(errcode); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not create shared memory segment \"%s\": %m", + name))); + return false; + } + } + else + { + hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, + FALSE, /* do not inherit the name */ + name); /* name of mapping object */ + if (!hmap) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + } + + /* Map it. 
*/ + address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, + 0, 0, 0); + if (!address) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * VirtualQuery gives size in page_size units, which is 4K for Windows. We + * need size only when we are attaching, but it's better to get the size + * when creating new segment to keep size consistent both for + * DSM_OP_CREATE and DSM_OP_ATTACH. + */ + if (VirtualQuery(address, &info, sizeof(info)) == 0) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + UnmapViewOfFile(address); + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + + *mapped_address = address; + *mapped_size = info.RegionSize; + *impl_private = hmap; + + return true; +} +#endif + +#ifdef USE_DSM_MMAP +/* + * Operating system primitives to support mmap-based shared memory. + * + * Calling this "shared memory" is somewhat of a misnomer, because what + * we're really doing is creating a bunch of files and mapping them into + * our address space. The operating system may feel obliged to + * synchronize the contents to disk even if nothing is being paged out, + * which will not serve us well. The user can relocate the pg_dynshmem + * directory to a ramdisk to avoid this problem, if available. + */ +static bool +dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u", + handle); + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* Create new segment or open an existing one for attach. */ + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = OpenTransientFile(name, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. 
*/ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else + { + /* + * Allocate a buffer full of zeros. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ + char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); + uint32 remaining = request_size; + bool success = true; + + /* + * Zero-fill the file. We have to do this the hard way to ensure that + * all the file space has really been allocated, so that we don't + * later seg fault when accessing the memory mapping. This is pretty + * pessimal. + */ + while (success && remaining > 0) + { + Size goal = remaining; + + if (goal > ZBUFFER_SIZE) + goal = ZBUFFER_SIZE; + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + if (write(fd, zbuffer, goal) == goal) + remaining -= goal; + else + success = false; + pgstat_report_wait_end(); + } + + if (!success) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + unlink(name); + errno = save_errno ? save_errno : ENOSPC; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + if (op == DSM_OP_CREATE) + unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close shared memory segment \"%s\": %m", + name))); + return false; + } + + return true; +} +#endif + +/* + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. + * + * Except on Windows, we don't need to do anything at all. But since Windows + * cleans up segments automatically when no references remain, we duplicate + * the segment handle into the postmaster process. The postmaster needn't + * do anything to receive the handle; Windows transfers it automatically. + */ +void +dsm_impl_pin_segment(dsm_handle handle, void *impl_private, + void **impl_private_pm_handle) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + if (IsUnderPostmaster) + { + HANDLE hmap; + + if (!DuplicateHandle(GetCurrentProcess(), impl_private, + PostmasterHandle, &hmap, 0, FALSE, + DUPLICATE_SAME_ACCESS)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + /* + * Here, we remember the handle that we created in the + * postmaster process. This handle isn't actually usable in + * any process other than the postmaster, but that doesn't + * matter. 
We're just holding onto it so that, if the segment + * is unpinned, dsm_impl_unpin_segment can close it. + */ + *impl_private_pm_handle = hmap; + } + break; +#endif + default: + break; + } +} + +/* + * Implementation-specific actions that must be performed when a segment is no + * longer to be preserved, so that it will be cleaned up when all backends + * have detached from it. + * + * Except on Windows, we don't need to do anything at all. For Windows, we + * close the extra handle that dsm_impl_pin_segment created in the + * postmaster's process space. + */ +void +dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + if (IsUnderPostmaster) + { + if (*impl_private && + !DuplicateHandle(PostmasterHandle, *impl_private, + NULL, NULL, 0, FALSE, + DUPLICATE_CLOSE_SOURCE)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + *impl_private = NULL; + } + break; +#endif + default: + break; + } +} + +static int +errcode_for_dynamic_shared_memory(void) +{ + if (errno == EFBIG || errno == ENOMEM) + return errcode(ERRCODE_OUT_OF_MEMORY); + else + return errcode_for_file_access(); +} diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c new file mode 100644 index 0000000..6591b5d --- /dev/null +++ b/src/backend/storage/ipc/ipc.c @@ -0,0 +1,439 @@ +/*------------------------------------------------------------------------- + * + * ipc.c + * POSTGRES inter-process communication definitions. + * + * This file is misnamed, as it no longer has much of anything directly + * to do with IPC. The functionality here is concerned with managing + * exit-time cleanup for either a postmaster or a backend. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#include "miscadmin.h" +#ifdef PROFILE_PID_DIR +#include "postmaster/autovacuum.h" +#endif +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "tcop/tcopprot.h" + + +/* + * This flag is set during proc_exit() to change ereport()'s behavior, + * so that an ereport() from an on_proc_exit routine cannot get us out + * of the exit procedure. We do NOT want to go back to the idle loop... + */ +bool proc_exit_inprogress = false; + +/* + * Set when shmem_exit() is in progress. + */ +bool shmem_exit_inprogress = false; + +/* + * This flag tracks whether we've called atexit() in the current process + * (or in the parent postmaster). + */ +static bool atexit_callback_setup = false; + +/* local functions */ +static void proc_exit_prepare(int code); + + +/* ---------------------------------------------------------------- + * exit() handling stuff + * + * These functions are in generally the same spirit as atexit(), + * but provide some additional features we need --- in particular, + * we want to register callbacks to invoke when we are disconnecting + * from a broken shared-memory context but not exiting the postmaster. 
+ * + * Callback functions can take zero, one, or two args: the first passed + * arg is the integer exitcode, the second is the Datum supplied when + * the callback was registered. + * ---------------------------------------------------------------- + */ + +#define MAX_ON_EXITS 20 + +struct ONEXIT +{ + pg_on_exit_callback function; + Datum arg; +}; + +static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS]; +static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS]; +static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS]; + +static int on_proc_exit_index, + on_shmem_exit_index, + before_shmem_exit_index; + + +/* ---------------------------------------------------------------- + * proc_exit + * + * this function calls all the callbacks registered + * for it (to free resources) and then calls exit. + * + * This should be the only function to call exit(). + * -cim 2/6/90 + * + * Unfortunately, we can't really guarantee that add-on code + * obeys the rule of not calling exit() directly. So, while + * this is the preferred way out of the system, we also register + * an atexit callback that will make sure cleanup happens. + * ---------------------------------------------------------------- + */ +void +proc_exit(int code) +{ + /* not safe if forked by system(), etc. */ + if (MyProcPid != (int) getpid()) + elog(PANIC, "proc_exit() called in child process"); + + /* Clean up everything that must be cleaned up */ + proc_exit_prepare(code); + +#ifdef PROFILE_PID_DIR + { + /* + * If we are profiling ourself then gprof's mcleanup() is about to + * write out a profile to ./gmon.out. Since mcleanup() always uses a + * fixed file name, each backend will overwrite earlier profiles. To + * fix that, we create a separate subdirectory for each backend + * (./gprof/pid) and 'cd' to that subdirectory before we exit() - that + * forces mcleanup() to write each profile into its own directory. We + * end up with something like: $PGDATA/gprof/8829/gmon.out + * $PGDATA/gprof/8845/gmon.out ... + * + * To avoid undesirable disk space bloat, autovacuum workers are + * discriminated against: all their gmon.out files go into the same + * subdirectory. Without this, an installation that is "just sitting + * there" nonetheless eats megabytes of disk space every few seconds. + * + * Note that we do this here instead of in an on_proc_exit() callback + * because we want to ensure that this code executes last - we don't + * want to interfere with any other on_proc_exit() callback. For the + * same reason, we do not include it in proc_exit_prepare ... so if + * you are exiting in the "wrong way" you won't drop your profile in a + * nice place. + */ + char gprofDirName[32]; + + if (IsAutoVacuumWorkerProcess()) + snprintf(gprofDirName, 32, "gprof/avworker"); + else + snprintf(gprofDirName, 32, "gprof/%d", (int) getpid()); + + /* + * Use mkdir() instead of MakePGDirectory() since we aren't making a + * PG directory here. + */ + mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO); + mkdir(gprofDirName, S_IRWXU | S_IRWXG | S_IRWXO); + chdir(gprofDirName); + } +#endif + + elog(DEBUG3, "exit(%d)", code); + + exit(code); +} + +/* + * Code shared between proc_exit and the atexit handler. Note that in + * normal exit through proc_exit, this will actually be called twice ... + * but the second call will have nothing to do. + */ +static void +proc_exit_prepare(int code) +{ + /* + * Once we set this flag, we are committed to exit. Any ereport() will + * NOT send control back to the main loop, but right back here. 
+ */ + proc_exit_inprogress = true; + + /* + * Forget any pending cancel or die requests; we're doing our best to + * close up shop already. Note that the signal handlers will not set + * these flags again, now that proc_exit_inprogress is set. + */ + InterruptPending = false; + ProcDiePending = false; + QueryCancelPending = false; + InterruptHoldoffCount = 1; + CritSectionCount = 0; + + /* + * Also clear the error context stack, to prevent error callbacks from + * being invoked by any elog/ereport calls made during proc_exit. Whatever + * context they might want to offer is probably not relevant, and in any + * case they are likely to fail outright after we've done things like + * aborting any open transaction. (In normal exit scenarios the context + * stack should be empty anyway, but it might not be in the case of + * elog(FATAL) for example.) + */ + error_context_stack = NULL; + /* For the same reason, reset debug_query_string before it's clobbered */ + debug_query_string = NULL; + + /* do our shared memory exits first */ + shmem_exit(code); + + elog(DEBUG3, "proc_exit(%d): %d callbacks to make", + code, on_proc_exit_index); + + /* + * call all the registered callbacks. + * + * Note that since we decrement on_proc_exit_index each time, if a + * callback calls ereport(ERROR) or ereport(FATAL) then it won't be + * invoked again when control comes back here (nor will the + * previously-completed callbacks). So, an infinite loop should not be + * possible. + */ + while (--on_proc_exit_index >= 0) + on_proc_exit_list[on_proc_exit_index].function(code, + on_proc_exit_list[on_proc_exit_index].arg); + + on_proc_exit_index = 0; +} + +/* ------------------ + * Run all of the on_shmem_exit routines --- but don't actually exit. + * This is used by the postmaster to re-initialize shared memory and + * semaphores after a backend dies horribly. As with proc_exit(), we + * remove each callback from the list before calling it, to avoid + * infinite loop in case of error. + * ------------------ + */ +void +shmem_exit(int code) +{ + shmem_exit_inprogress = true; + + /* + * Call before_shmem_exit callbacks. + * + * These should be things that need most of the system to still be up and + * working, such as cleanup of temp relations, which requires catalog + * access; or things that need to be completed because later cleanup steps + * depend on them, such as releasing lwlocks. + */ + elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make", + code, before_shmem_exit_index); + while (--before_shmem_exit_index >= 0) + before_shmem_exit_list[before_shmem_exit_index].function(code, + before_shmem_exit_list[before_shmem_exit_index].arg); + before_shmem_exit_index = 0; + + /* + * Call dynamic shared memory callbacks. + * + * These serve the same purpose as late callbacks, but for dynamic shared + * memory segments rather than the main shared memory segment. + * dsm_backend_shutdown() has the same kind of progressive logic we use + * for the main shared memory segment; namely, it unregisters each + * callback before invoking it, so that we don't get stuck in an infinite + * loop if one of those callbacks itself throws an ERROR or FATAL. + * + * Note that explicitly calling this function here is quite different from + * registering it as an on_shmem_exit callback for precisely this reason: + * if one dynamic shared memory callback errors out, the remaining + * callbacks will still be invoked. Thus, hard-coding this call puts it + * equal footing with callbacks for the main shared memory segment. 
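+ *
+ * (For reference, the per-segment callbacks run at this point are the
+ * ones registered against a mapped segment with on_dsm_detach(), e.g.
+ *
+ *		on_dsm_detach(seg, my_detach_callback, PointerGetDatum(state));
+ *
+ * where my_detach_callback has the shape
+ *		void my_detach_callback(dsm_segment *seg, Datum arg)
+ * and the names above are only illustrative.)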
+ */ + dsm_backend_shutdown(); + + /* + * Call on_shmem_exit callbacks. + * + * These are generally releasing low-level shared memory resources. In + * some cases, this is a backstop against the possibility that the early + * callbacks might themselves fail, leading to re-entry to this routine; + * in other cases, it's cleanup that only happens at process exit. + */ + elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make", + code, on_shmem_exit_index); + while (--on_shmem_exit_index >= 0) + on_shmem_exit_list[on_shmem_exit_index].function(code, + on_shmem_exit_list[on_shmem_exit_index].arg); + on_shmem_exit_index = 0; + + shmem_exit_inprogress = false; +} + +/* ---------------------------------------------------------------- + * atexit_callback + * + * Backstop to ensure that direct calls of exit() don't mess us up. + * + * Somebody who was being really uncooperative could call _exit(), + * but for that case we have a "dead man switch" that will make the + * postmaster treat it as a crash --- see pmsignal.c. + * ---------------------------------------------------------------- + */ +static void +atexit_callback(void) +{ + /* Clean up everything that must be cleaned up */ + /* ... too bad we don't know the real exit code ... */ + proc_exit_prepare(-1); +} + +/* ---------------------------------------------------------------- + * on_proc_exit + * + * this function adds a callback function to the list of + * functions invoked by proc_exit(). -cim 2/6/90 + * ---------------------------------------------------------------- + */ +void +on_proc_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_proc_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_proc_exit slots"))); + + on_proc_exit_list[on_proc_exit_index].function = function; + on_proc_exit_list[on_proc_exit_index].arg = arg; + + ++on_proc_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * before_shmem_exit + * + * Register early callback to perform user-level cleanup, + * e.g. transaction abort, before we begin shutting down + * low-level subsystems. + * ---------------------------------------------------------------- + */ +void +before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of before_shmem_exit slots"))); + + before_shmem_exit_list[before_shmem_exit_index].function = function; + before_shmem_exit_list[before_shmem_exit_index].arg = arg; + + ++before_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * on_shmem_exit + * + * Register ordinary callback to perform low-level shutdown + * (e.g. releasing our PGPROC); run after before_shmem_exit + * callbacks and before on_proc_exit callbacks. 
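+ *
+ *		A representative registration, with purely illustrative names,
+ *		looks like
+ *
+ *			static void
+ *			my_shutdown_cb(int code, Datum arg)
+ *			{
+ *				MyState    *state = (MyState *) DatumGetPointer(arg);
+ *
+ *				release_low_level_resources(state);
+ *			}
+ *
+ *			on_shmem_exit(my_shutdown_cb, PointerGetDatum(my_state));
+ *
+ *		The same callback shape is used by on_proc_exit and
+ *		before_shmem_exit; only the point during shutdown at which
+ *		each list is run differs.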
+ * ---------------------------------------------------------------- + */ +void +on_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_shmem_exit slots"))); + + on_shmem_exit_list[on_shmem_exit_index].function = function; + on_shmem_exit_list[on_shmem_exit_index].arg = arg; + + ++on_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * cancel_before_shmem_exit + * + * this function removes a previously-registered before_shmem_exit + * callback. We only look at the latest entry for removal, as we + * expect callers to add and remove temporary before_shmem_exit + * callbacks in strict LIFO order. + * ---------------------------------------------------------------- + */ +void +cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index > 0 && + before_shmem_exit_list[before_shmem_exit_index - 1].function + == function && + before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg) + --before_shmem_exit_index; + else + elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry", + function, (long long) arg); +} + +/* ---------------------------------------------------------------- + * on_exit_reset + * + * this function clears all on_proc_exit() and on_shmem_exit() + * registered functions. This is used just after forking a backend, + * so that the backend doesn't believe it should call the postmaster's + * on-exit routines when it exits... + * ---------------------------------------------------------------- + */ +void +on_exit_reset(void) +{ + before_shmem_exit_index = 0; + on_shmem_exit_index = 0; + on_proc_exit_index = 0; + reset_on_dsm_detach(); +} + +/* ---------------------------------------------------------------- + * check_on_shmem_exit_lists_are_empty + * + * Debugging check that no shmem cleanup handlers have been registered + * prematurely in the current process. + * ---------------------------------------------------------------- + */ +void +check_on_shmem_exit_lists_are_empty(void) +{ + if (before_shmem_exit_index) + elog(FATAL, "before_shmem_exit has been called prematurely"); + if (on_shmem_exit_index) + elog(FATAL, "on_shmem_exit has been called prematurely"); + /* Checking DSM detach state seems unnecessary given the above */ +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c new file mode 100644 index 0000000..8f1ded7 --- /dev/null +++ b/src/backend/storage/ipc/ipci.c @@ -0,0 +1,354 @@ +/*------------------------------------------------------------------------- + * + * ipci.c + * POSTGRES inter-process communication initialization code. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipci.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/nbtree.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/twophase.h" +#include "access/xlogprefetcher.h" +#include "access/xlogrecovery.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/bgwriter.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "utils/guc.h" +#include "utils/snapmgr.h" + +/* GUCs */ +int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; + +shmem_startup_hook_type shmem_startup_hook = NULL; + +static Size total_addin_request = 0; + +/* + * RequestAddinShmemSpace + * Request that extra shmem space be allocated for use by + * a loadable module. + * + * This may only be called via the shmem_request_hook of a library that is + * loaded into the postmaster via shared_preload_libraries. Calls from + * elsewhere will fail. + */ +void +RequestAddinShmemSpace(Size size) +{ + if (!process_shmem_requests_in_progress) + elog(FATAL, "cannot request additional shared memory outside shmem_request_hook"); + total_addin_request = add_size(total_addin_request, size); +} + +/* + * CalculateShmemSize + * Calculates the amount of shared memory and number of semaphores needed. + * + * If num_semaphores is not NULL, it will be set to the number of semaphores + * required. + */ +Size +CalculateShmemSize(int *num_semaphores) +{ + Size size; + int numSemas; + + /* Compute number of semaphores we'll need */ + numSemas = ProcGlobalSemas(); + numSemas += SpinlockSemas(); + + /* Return the number of semaphores if requested by the caller */ + if (num_semaphores) + *num_semaphores = numSemas; + + /* + * Size of the Postgres shared-memory block is estimated via moderately- + * accurate estimates for the big hogs, plus 100K for the stuff that's too + * small to bother with estimating. + * + * We take some care to ensure that the total size request doesn't + * overflow size_t. If this gets through, we don't need to be so careful + * during the actual allocation phase. 
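+ *
+ * The add-in request folded in below is accumulated from loadable
+ * modules: a library named in shared_preload_libraries asks for its
+ * share from its shmem_request_hook, for instance (illustrative names
+ * only)
+ *
+ *		static void
+ *		my_shmem_request(void)
+ *		{
+ *			if (prev_shmem_request_hook)
+ *				prev_shmem_request_hook();
+ *			RequestAddinShmemSpace(MAXALIGN(sizeof(MySharedState)));
+ *			RequestNamedLWLockTranche("my_module", 1);
+ *		}
+ *
+ * and then carves that space out at startup in its shmem_startup_hook
+ * with ShmemInitStruct().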
+ */ + size = 100000; + size = add_size(size, PGSemaphoreShmemSize(numSemas)); + size = add_size(size, SpinlockSemaSize()); + size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE, + sizeof(ShmemIndexEnt))); + size = add_size(size, dsm_estimate_size()); + size = add_size(size, BufferShmemSize()); + size = add_size(size, LockShmemSize()); + size = add_size(size, PredicateLockShmemSize()); + size = add_size(size, ProcGlobalShmemSize()); + size = add_size(size, XLogPrefetchShmemSize()); + size = add_size(size, XLOGShmemSize()); + size = add_size(size, XLogRecoveryShmemSize()); + size = add_size(size, CLOGShmemSize()); + size = add_size(size, CommitTsShmemSize()); + size = add_size(size, SUBTRANSShmemSize()); + size = add_size(size, TwoPhaseShmemSize()); + size = add_size(size, BackgroundWorkerShmemSize()); + size = add_size(size, MultiXactShmemSize()); + size = add_size(size, LWLockShmemSize()); + size = add_size(size, ProcArrayShmemSize()); + size = add_size(size, BackendStatusShmemSize()); + size = add_size(size, SInvalShmemSize()); + size = add_size(size, PMSignalShmemSize()); + size = add_size(size, ProcSignalShmemSize()); + size = add_size(size, CheckpointerShmemSize()); + size = add_size(size, AutoVacuumShmemSize()); + size = add_size(size, ReplicationSlotsShmemSize()); + size = add_size(size, ReplicationOriginShmemSize()); + size = add_size(size, WalSndShmemSize()); + size = add_size(size, WalRcvShmemSize()); + size = add_size(size, PgArchShmemSize()); + size = add_size(size, ApplyLauncherShmemSize()); + size = add_size(size, SnapMgrShmemSize()); + size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); + size = add_size(size, AsyncShmemSize()); + size = add_size(size, StatsShmemSize()); +#ifdef EXEC_BACKEND + size = add_size(size, ShmemBackendArraySize()); +#endif + + /* include additional requested shmem from preload libraries */ + size = add_size(size, total_addin_request); + + /* might as well round it off to a multiple of a typical page size */ + size = add_size(size, 8192 - (size % 8192)); + + return size; +} + +/* + * CreateSharedMemoryAndSemaphores + * Creates and initializes shared memory and semaphores. + * + * This is called by the postmaster or by a standalone backend. + * It is also called by a backend forked from the postmaster in the + * EXEC_BACKEND case. In the latter case, the shared memory segment + * already exists and has been physically attached to, but we have to + * initialize pointers in local memory that reference the shared structures, + * because we didn't inherit the correct pointer values from the postmaster + * as we do in the fork() scenario. The easiest way to do that is to run + * through the same code as before. (Note that the called routines mostly + * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case. + * This is a bit code-wasteful and could be cleaned up.) + */ +void +CreateSharedMemoryAndSemaphores(void) +{ + PGShmemHeader *shim = NULL; + + if (!IsUnderPostmaster) + { + PGShmemHeader *seghdr; + Size size; + int numSemas; + + /* Compute the size of the shared-memory block */ + size = CalculateShmemSize(&numSemas); + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + + /* + * Create the shmem segment + */ + seghdr = PGSharedMemoryCreate(size, &shim); + + InitShmemAccess(seghdr); + + /* + * Create semaphores + */ + PGReserveSemaphores(numSemas); + + /* + * If spinlocks are disabled, initialize emulation layer (which + * depends on semaphores, so the order is important here). 
+ */ +#ifndef HAVE_SPINLOCKS + SpinlockSemaInit(); +#endif + } + else + { + /* + * We are reattaching to an existing shared memory segment. This + * should only be reached in the EXEC_BACKEND case. + */ +#ifndef EXEC_BACKEND + elog(PANIC, "should be attached to shared memory already"); +#endif + } + + /* + * Set up shared memory allocation mechanism + */ + if (!IsUnderPostmaster) + InitShmemAllocation(); + + /* + * Now initialize LWLocks, which do shared memory allocation and are + * needed for InitShmemIndex. + */ + CreateLWLocks(); + + /* + * Set up shmem.c index hashtable + */ + InitShmemIndex(); + + dsm_shmem_init(); + + /* + * Set up xlog, clog, and buffers + */ + XLOGShmemInit(); + XLogPrefetchShmemInit(); + XLogRecoveryShmemInit(); + CLOGShmemInit(); + CommitTsShmemInit(); + SUBTRANSShmemInit(); + MultiXactShmemInit(); + InitBufferPool(); + + /* + * Set up lock manager + */ + InitLocks(); + + /* + * Set up predicate lock manager + */ + InitPredicateLocks(); + + /* + * Set up process table + */ + if (!IsUnderPostmaster) + InitProcGlobal(); + CreateSharedProcArray(); + CreateSharedBackendStatus(); + TwoPhaseShmemInit(); + BackgroundWorkerShmemInit(); + + /* + * Set up shared-inval messaging + */ + CreateSharedInvalidationState(); + + /* + * Set up interprocess signaling mechanisms + */ + PMSignalShmemInit(); + ProcSignalShmemInit(); + CheckpointerShmemInit(); + AutoVacuumShmemInit(); + ReplicationSlotsShmemInit(); + ReplicationOriginShmemInit(); + WalSndShmemInit(); + WalRcvShmemInit(); + PgArchShmemInit(); + ApplyLauncherShmemInit(); + + /* + * Set up other modules that need some shared memory space + */ + SnapMgrInit(); + BTreeShmemInit(); + SyncScanShmemInit(); + AsyncShmemInit(); + StatsShmemInit(); + +#ifdef EXEC_BACKEND + + /* + * Alloc the win32 shared backend array + */ + if (!IsUnderPostmaster) + ShmemBackendArrayAllocation(); +#endif + + /* Initialize dynamic shared memory facilities. */ + if (!IsUnderPostmaster) + dsm_postmaster_startup(shim); + + /* + * Now give loadable modules a chance to set up their shmem allocations + */ + if (shmem_startup_hook) + shmem_startup_hook(); +} + +/* + * InitializeShmemGUCs + * + * This function initializes runtime-computed GUCs related to the amount of + * shared memory required for the current configuration. + */ +void +InitializeShmemGUCs(void) +{ + char buf[64]; + Size size_b; + Size size_mb; + Size hp_size; + + /* + * Calculate the shared memory size and round up to the nearest megabyte. + */ + size_b = CalculateShmemSize(NULL); + size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024); + sprintf(buf, "%zu", size_mb); + SetConfigOption("shared_memory_size", buf, + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + + /* + * Calculate the number of huge pages required. + */ + GetHugePageSize(&hp_size, NULL); + if (hp_size != 0) + { + Size hp_required; + + hp_required = add_size(size_b / hp_size, 1); + sprintf(buf, "%zu", hp_required); + SetConfigOption("shared_memory_size_in_huge_pages", buf, + PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT); + } +} diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c new file mode 100644 index 0000000..cdb95c1 --- /dev/null +++ b/src/backend/storage/ipc/latch.c @@ -0,0 +1,2268 @@ +/*------------------------------------------------------------------------- + * + * latch.c + * Routines for inter-process latches + * + * The poll() implementation uses the so-called self-pipe trick to overcome the + * race condition involved with poll() and setting a global flag in the signal + * handler. 
When a latch is set and the current process is waiting for it, the + * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe. + * A signal by itself doesn't interrupt poll() on all platforms, and even on + * platforms where it does, a signal that arrives just before the poll() call + * does not prevent poll() from entering sleep. An incoming byte on a pipe + * however reliably interrupts the sleep, and causes poll() to return + * immediately even if the signal arrives before poll() begins. + * + * The epoll() implementation overcomes the race with a different technique: it + * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We + * don't need to register a signal handler or create our own self-pipe. We + * assume that any system that has Linux epoll() also has Linux signalfd(). + * + * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL. + * + * The Windows implementation uses Windows events that are inherited by all + * postmaster child processes. There's no need for the self-pipe trick there. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/latch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include +#include +#ifdef HAVE_SYS_EPOLL_H +#include +#endif +#ifdef HAVE_SYS_EVENT_H +#include +#endif +#ifdef HAVE_SYS_SIGNALFD_H +#include +#endif +#ifdef HAVE_POLL_H +#include +#endif + +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "portability/instr_time.h" +#include "postmaster/postmaster.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "utils/memutils.h" + +/* + * Select the fd readiness primitive to use. Normally the "most modern" + * primitive supported by the OS will be used, but for testing it can be + * useful to manually specify the used primitive. If desired, just add a + * define somewhere before this block. + */ +#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \ + defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32) +/* don't overwrite manual choice */ +#elif defined(HAVE_SYS_EPOLL_H) +#define WAIT_USE_EPOLL +#elif defined(HAVE_KQUEUE) +#define WAIT_USE_KQUEUE +#elif defined(HAVE_POLL) +#define WAIT_USE_POLL +#elif WIN32 +#define WAIT_USE_WIN32 +#else +#error "no wait set implementation available" +#endif + +/* + * By default, we use a self-pipe with poll() and a signalfd with epoll(), if + * available. We avoid signalfd on illumos for now based on problem reports. + * For testing the choice can also be manually specified. + */ +#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +/* don't overwrite manual choice */ +#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \ + !defined(__illumos__) +#define WAIT_USE_SIGNALFD +#else +#define WAIT_USE_SELF_PIPE +#endif +#endif + +/* typedef in latch.h */ +struct WaitEventSet +{ + int nevents; /* number of registered events */ + int nevents_space; /* maximum number of events in this set */ + + /* + * Array, of nevents_space length, storing the definition of events this + * set is waiting for. 
+ */ + WaitEvent *events; + + /* + * If WL_LATCH_SET is specified in any wait event, latch is a pointer to + * said latch, and latch_pos the offset in the ->events array. This is + * useful because we check the state of the latch before performing doing + * syscalls related to waiting. + */ + Latch *latch; + int latch_pos; + + /* + * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag + * is set so that we'll exit immediately if postmaster death is detected, + * instead of returning. + */ + bool exit_on_postmaster_death; + +#if defined(WAIT_USE_EPOLL) + int epoll_fd; + /* epoll_wait returns events in a user provided arrays, allocate once */ + struct epoll_event *epoll_ret_events; +#elif defined(WAIT_USE_KQUEUE) + int kqueue_fd; + /* kevent returns events in a user provided arrays, allocate once */ + struct kevent *kqueue_ret_events; + bool report_postmaster_not_running; +#elif defined(WAIT_USE_POLL) + /* poll expects events to be waited on every poll() call, prepare once */ + struct pollfd *pollfds; +#elif defined(WAIT_USE_WIN32) + + /* + * Array of windows events. The first element always contains + * pgwin32_signal_event, so the remaining elements are offset by one (i.e. + * event->pos + 1). + */ + HANDLE *handles; +#endif +}; + +/* A common WaitEventSet used to implement WaitLatch() */ +static WaitEventSet *LatchWaitSet; + +/* The position of the latch in LatchWaitSet. */ +#define LatchWaitSetLatchPos 0 + +#ifndef WIN32 +/* Are we currently in WaitLatch? The signal handler would like to know. */ +static volatile sig_atomic_t waiting = false; +#endif + +#ifdef WAIT_USE_SIGNALFD +/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */ +static int signal_fd = -1; +#endif + +#ifdef WAIT_USE_SELF_PIPE +/* Read and write ends of the self-pipe */ +static int selfpipe_readfd = -1; +static int selfpipe_writefd = -1; + +/* Process owning the self-pipe --- needed for checking purposes */ +static int selfpipe_owner_pid = 0; + +/* Private function prototypes */ +static void latch_sigurg_handler(SIGNAL_ARGS); +static void sendSelfPipeByte(void); +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +static void drain(void); +#endif + +#if defined(WAIT_USE_EPOLL) +static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action); +#elif defined(WAIT_USE_KQUEUE) +static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events); +#elif defined(WAIT_USE_POLL) +static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event); +#elif defined(WAIT_USE_WIN32) +static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event); +#endif + +static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents); + +/* + * Initialize the process-local latch infrastructure. + * + * This must be called once during startup of any process that can wait on + * latches, before it issues any InitLatch() or OwnLatch() calls. + */ +void +InitializeLatchSupport(void) +{ +#if defined(WAIT_USE_SELF_PIPE) + int pipefd[2]; + + if (IsUnderPostmaster) + { + /* + * We might have inherited connections to a self-pipe created by the + * postmaster. It's critical that child processes create their own + * self-pipes, of course, and we really want them to close the + * inherited FDs for safety's sake. 
+ */ + if (selfpipe_owner_pid != 0) + { + /* Assert we go through here but once in a child process */ + Assert(selfpipe_owner_pid != MyProcPid); + /* Release postmaster's pipe FDs; ignore any error */ + (void) close(selfpipe_readfd); + (void) close(selfpipe_writefd); + /* Clean up, just for safety's sake; we'll set these below */ + selfpipe_readfd = selfpipe_writefd = -1; + selfpipe_owner_pid = 0; + /* Keep fd.c's accounting straight */ + ReleaseExternalFD(); + ReleaseExternalFD(); + } + else + { + /* + * Postmaster didn't create a self-pipe ... or else we're in an + * EXEC_BACKEND build, in which case it doesn't matter since the + * postmaster's pipe FDs were closed by the action of FD_CLOEXEC. + * fd.c won't have state to clean up, either. + */ + Assert(selfpipe_readfd == -1); + } + } + else + { + /* In postmaster or standalone backend, assert we do this but once */ + Assert(selfpipe_readfd == -1); + Assert(selfpipe_owner_pid == 0); + } + + /* + * Set up the self-pipe that allows a signal handler to wake up the + * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so + * that SetLatch won't block if the event has already been set many times + * filling the kernel buffer. Make the read-end non-blocking too, so that + * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK. + * Also, make both FDs close-on-exec, since we surely do not want any + * child processes messing with them. + */ + if (pipe(pipefd) < 0) + elog(FATAL, "pipe() failed: %m"); + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m"); + if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m"); + + selfpipe_readfd = pipefd[0]; + selfpipe_writefd = pipefd[1]; + selfpipe_owner_pid = MyProcPid; + + /* Tell fd.c about these two long-lived FDs */ + ReserveExternalFD(); + ReserveExternalFD(); + + pqsignal(SIGURG, latch_sigurg_handler); +#endif + +#ifdef WAIT_USE_SIGNALFD + sigset_t signalfd_mask; + + if (IsUnderPostmaster) + { + /* + * It would probably be safe to re-use the inherited signalfd since + * signalfds only see the current process's pending signals, but it + * seems less surprising to close it and create our own. + */ + if (signal_fd != -1) + { + /* Release postmaster's signal FD; ignore any error */ + (void) close(signal_fd); + signal_fd = -1; + ReleaseExternalFD(); + } + } + + /* Block SIGURG, because we'll receive it through a signalfd. */ + sigaddset(&UnBlockSig, SIGURG); + + /* Set up the signalfd to receive SIGURG notifications. */ + sigemptyset(&signalfd_mask); + sigaddset(&signalfd_mask, SIGURG); + signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC); + if (signal_fd < 0) + elog(FATAL, "signalfd() failed"); + ReserveExternalFD(); +#endif + +#ifdef WAIT_USE_KQUEUE + /* Ignore SIGURG, because we'll receive it via kqueue. */ + pqsignal(SIGURG, SIG_IGN); +#endif +} + +void +InitializeLatchWaitSet(void) +{ + int latch_pos PG_USED_FOR_ASSERTS_ONLY; + + Assert(LatchWaitSet == NULL); + + /* Set up the WaitEventSet used by WaitLatch(). 
*/ + LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2); + latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + if (IsUnderPostmaster) + AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH, + PGINVALID_SOCKET, NULL, NULL); + + Assert(latch_pos == LatchWaitSetLatchPos); +} + +void +ShutdownLatchSupport(void) +{ +#if defined(WAIT_USE_POLL) + pqsignal(SIGURG, SIG_IGN); +#endif + + if (LatchWaitSet) + { + FreeWaitEventSet(LatchWaitSet); + LatchWaitSet = NULL; + } + +#if defined(WAIT_USE_SELF_PIPE) + close(selfpipe_readfd); + close(selfpipe_writefd); + selfpipe_readfd = -1; + selfpipe_writefd = -1; + selfpipe_owner_pid = InvalidPid; +#endif + +#if defined(WAIT_USE_SIGNALFD) + close(signal_fd); + signal_fd = -1; +#endif +} + +/* + * Initialize a process-local latch. + */ +void +InitLatch(Latch *latch) +{ + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = MyProcPid; + latch->is_shared = false; + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#elif defined(WAIT_USE_WIN32) + latch->event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif /* WIN32 */ +} + +/* + * Initialize a shared latch that can be set from other processes. The latch + * is initially owned by no-one; use OwnLatch to associate it with the + * current process. + * + * InitSharedLatch needs to be called in postmaster before forking child + * processes, usually right after allocating the shared memory block + * containing the latch with ShmemInitStruct. (The Unix implementation + * doesn't actually require that, but the Windows one does.) Because of + * this restriction, we have no concurrency issues to worry about here. + * + * Note that other handles created in this module are never marked as + * inheritable. Thus we do not need to worry about cleaning up child + * process references to postmaster-private latches or WaitEventSets. + */ +void +InitSharedLatch(Latch *latch) +{ +#ifdef WIN32 + SECURITY_ATTRIBUTES sa; + + /* + * Set up security attributes to specify that the events are inherited. + */ + ZeroMemory(&sa, sizeof(sa)); + sa.nLength = sizeof(sa); + sa.bInheritHandle = TRUE; + + latch->event = CreateEvent(&sa, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif + + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = 0; + latch->is_shared = true; +} + +/* + * Associate a shared latch with the current process, allowing it to + * wait on the latch. + * + * Although there is a sanity check for latch-already-owned, we don't do + * any sort of locking here, meaning that we could fail to detect the error + * if two processes try to own the same latch at about the same time. If + * there is any risk of that, caller must provide an interlock to prevent it. 
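+ *
+ * The usual lifecycle of a shared latch is therefore, in sketch form
+ * (hypothetical shared structure):
+ *
+ *		InitSharedLatch(&shared->latch);	in the postmaster, at shmem init
+ *		OwnLatch(&shared->latch);			in the process that will wait on it
+ *		... ResetLatch()/WaitLatch() loop ...
+ *		DisownLatch(&shared->latch);		before another process takes it over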
+ */ +void +OwnLatch(Latch *latch) +{ + int owner_pid; + + /* Sanity checks */ + Assert(latch->is_shared); + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#endif + + owner_pid = latch->owner_pid; + if (owner_pid != 0) + elog(PANIC, "latch already owned by PID %d", owner_pid); + + latch->owner_pid = MyProcPid; +} + +/* + * Disown a shared latch currently owned by the current process. + */ +void +DisownLatch(Latch *latch) +{ + Assert(latch->is_shared); + Assert(latch->owner_pid == MyProcPid); + + latch->owner_pid = 0; +} + +/* + * Wait for a given latch to be set, or for postmaster death, or until timeout + * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events + * to wait for. If the latch is already set (and WL_LATCH_SET is given), the + * function returns immediately. + * + * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag + * is given. Although it is declared as "long", we don't actually support + * timeouts longer than INT_MAX milliseconds. Note that some extra overhead + * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible. + * + * The latch must be owned by the current process, ie. it must be a + * process-local latch initialized with InitLatch, or a shared latch + * associated with the current process by calling OwnLatch. + * + * Returns bit mask indicating which condition(s) caused the wake-up. Note + * that if multiple wake-up conditions are true, there is no guarantee that + * we return all of them in one call, but we will return at least one. + */ +int +WaitLatch(Latch *latch, int wakeEvents, long timeout, + uint32 wait_event_info) +{ + WaitEvent event; + + /* Postmaster-managed callers must handle postmaster death somehow. */ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + /* + * Some callers may have a latch other than MyLatch, or no latch at all, + * or want to handle postmaster death differently. It's cheap to assign + * those, so just do it every time. + */ + if (!(wakeEvents & WL_LATCH_SET)) + latch = NULL; + ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); + LatchWaitSet->exit_on_postmaster_death = + ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0); + + if (WaitEventSetWait(LatchWaitSet, + (wakeEvents & WL_TIMEOUT) ? timeout : -1, + &event, 1, + wait_event_info) == 0) + return WL_TIMEOUT; + else + return event.events; +} + +/* + * Like WaitLatch, but with an extra socket argument for WL_SOCKET_* + * conditions. + * + * When waiting on a socket, EOF and error conditions always cause the socket + * to be reported as readable/writable/connected, so that the caller can deal + * with the condition. + * + * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit + * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the + * return value if the postmaster dies. The latter is useful for rare cases + * where some behavior other than immediate exit is needed. + * + * NB: These days this is just a wrapper around the WaitEventSet API. When + * using a latch very frequently, consider creating a longer living + * WaitEventSet instead; that's more efficient. 
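+ *
+ * Such a long-lived set is built once and then reused, roughly like
+ * this (sketch only; 'sock' and the event handling are assumed):
+ *
+ *		WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
+ *		WaitEvent	event;
+ *
+ *		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+ *		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
+ *		AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
+ *
+ *		for (;;)
+ *		{
+ *			WaitEventSetWait(set, -1, &event, 1, PG_WAIT_EXTENSION);
+ *			... act on event.events ...
+ *		}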
+ */ +int +WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, + long timeout, uint32 wait_event_info) +{ + int ret = 0; + int rc; + WaitEvent event; + WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3); + + if (wakeEvents & WL_TIMEOUT) + Assert(timeout >= 0); + else + timeout = -1; + + if (wakeEvents & WL_LATCH_SET) + AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, + latch, NULL); + + /* Postmaster-managed callers must handle postmaster death somehow. */ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if (wakeEvents & WL_SOCKET_MASK) + { + int ev; + + ev = wakeEvents & WL_SOCKET_MASK; + AddWaitEventToSet(set, ev, sock, NULL, NULL); + } + + rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info); + + if (rc == 0) + ret |= WL_TIMEOUT; + else + { + ret |= event.events & (WL_LATCH_SET | + WL_POSTMASTER_DEATH | + WL_SOCKET_MASK); + } + + FreeWaitEventSet(set); + + return ret; +} + +/* + * Sets a latch and wakes up anyone waiting on it. + * + * This is cheap if the latch is already set, otherwise not so much. + * + * NB: when calling this in a signal handler, be sure to save and restore + * errno around it. (That's standard practice in most signal handlers, of + * course, but we used to omit it in handlers that only set a flag.) + * + * NB: this function is called from critical sections and signal handlers so + * throwing an error is not a good idea. + */ +void +SetLatch(Latch *latch) +{ +#ifndef WIN32 + pid_t owner_pid; +#else + HANDLE handle; +#endif + + /* + * The memory barrier has to be placed here to ensure that any flag + * variables possibly changed by this process have been flushed to main + * memory, before we check/set is_set. + */ + pg_memory_barrier(); + + /* Quick exit if already set */ + if (latch->is_set) + return; + + latch->is_set = true; + + pg_memory_barrier(); + if (!latch->maybe_sleeping) + return; + +#ifndef WIN32 + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. We use the self-pipe or SIGURG to ourselves + * to wake up WaitEventSetWaitBlock() without races in that case. If it's + * another process, send a signal. + * + * Fetch owner_pid only once, in case the latch is concurrently getting + * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't + * guaranteed to be true! In practice, the effective range of pid_t fits + * in a 32 bit integer, and so should be atomic. In the worst case, we + * might end up signaling the wrong process. Even then, you're very + * unlucky if a process with that bogus pid exists and belongs to + * Postgres; and PG database processes should handle excess SIGUSR1 + * interrupts without a problem anyhow. + * + * Another sort of race condition that's possible here is for a new + * process to own the latch immediately after we look, so we don't signal + * it. This is okay so long as all callers of ResetLatch/WaitLatch follow + * the standard coding convention of waiting at the bottom of their loops, + * not the top, so that they'll correctly process latch-setting events + * that happen before they enter the loop. 
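 *
 * In other words, the safe division of labor looks roughly like this (an
 * illustrative sketch; the "shared" struct and its work_pending flag are
 * assumptions, not something defined here):
 *
 *     signaling side (if in a signal handler, also save and restore errno):
 *         shared->work_pending = true;
 *         SetLatch(&shared->latch);
 *
 *     waiting side, inside its loop:
 *         ResetLatch(&shared->latch);
 *         if (shared->work_pending)
 *             ... handle the work ...
 *         WaitLatch(&shared->latch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
 *                   -1, PG_WAIT_EXTENSION);
 *
 * The memory barriers in SetLatch and ResetLatch ensure that each side sees
 * the other's flag update under that ordering.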
+ */ + owner_pid = latch->owner_pid; + if (owner_pid == 0) + return; + else if (owner_pid == MyProcPid) + { +#if defined(WAIT_USE_SELF_PIPE) + if (waiting) + sendSelfPipeByte(); +#else + if (waiting) + kill(MyProcPid, SIGURG); +#endif + } + else + kill(owner_pid, SIGURG); + +#else + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. + * + * Use a local variable here just in case somebody changes the event field + * concurrently (which really should not happen). + */ + handle = latch->event; + if (handle) + { + SetEvent(handle); + + /* + * Note that we silently ignore any errors. We might be in a signal + * handler or other critical path where it's not safe to call elog(). + */ + } +#endif +} + +/* + * Clear the latch. Calling WaitLatch after this will sleep, unless + * the latch is set again before the WaitLatch call. + */ +void +ResetLatch(Latch *latch) +{ + /* Only the owner should reset the latch */ + Assert(latch->owner_pid == MyProcPid); + Assert(latch->maybe_sleeping == false); + + latch->is_set = false; + + /* + * Ensure that the write to is_set gets flushed to main memory before we + * examine any flag variables. Otherwise a concurrent SetLatch might + * falsely conclude that it needn't signal us, even though we have missed + * seeing some flag updates that SetLatch was supposed to inform us of. + */ + pg_memory_barrier(); +} + +/* + * Create a WaitEventSet with space for nevents different events to wait for. + * + * These events can then be efficiently waited upon together, using + * WaitEventSetWait(). + */ +WaitEventSet * +CreateWaitEventSet(MemoryContext context, int nevents) +{ + WaitEventSet *set; + char *data; + Size sz = 0; + + /* + * Use MAXALIGN size/alignment to guarantee that later uses of memory are + * aligned correctly. E.g. epoll_event might need 8 byte alignment on some + * platforms, but earlier allocations like WaitEventSet and WaitEvent + * might not be sized to guarantee that when purely using sizeof(). 
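 *
 * (For example, with MAXIMUM_ALIGNOF 8, MAXALIGN(12) == 16; so each of the
 * sub-arrays carved out of "data" below starts on an 8-byte boundary no
 * matter what sizeof() the preceding piece happens to have.)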
+ */ + sz += MAXALIGN(sizeof(WaitEventSet)); + sz += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + sz += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + sz += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + sz += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + /* need space for the pgwin32_signal_event */ + sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1)); +#endif + + data = (char *) MemoryContextAllocZero(context, sz); + + set = (WaitEventSet *) data; + data += MAXALIGN(sizeof(WaitEventSet)); + + set->events = (WaitEvent *) data; + data += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + set->epoll_ret_events = (struct epoll_event *) data; + data += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + set->kqueue_ret_events = (struct kevent *) data; + data += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + set->pollfds = (struct pollfd *) data; + data += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + set->handles = (HANDLE) data; + data += MAXALIGN(sizeof(HANDLE) * nevents); +#endif + + set->latch = NULL; + set->nevents_space = nevents; + set->exit_on_postmaster_death = false; + +#if defined(WAIT_USE_EPOLL) + if (!AcquireExternalFD()) + { + /* treat this as though epoll_create1 itself returned EMFILE */ + elog(ERROR, "epoll_create1 failed: %m"); + } + set->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (set->epoll_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "epoll_create1 failed: %m"); + } +#elif defined(WAIT_USE_KQUEUE) + if (!AcquireExternalFD()) + { + /* treat this as though kqueue itself returned EMFILE */ + elog(ERROR, "kqueue failed: %m"); + } + set->kqueue_fd = kqueue(); + if (set->kqueue_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "kqueue failed: %m"); + } + if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1) + { + int save_errno = errno; + + close(set->kqueue_fd); + ReleaseExternalFD(); + errno = save_errno; + elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m"); + } + set->report_postmaster_not_running = false; +#elif defined(WAIT_USE_WIN32) + + /* + * To handle signals while waiting, we need to add a win32 specific event. + * We accounted for the additional event at the top of this routine. See + * port/win32/signal.c for more details. + * + * Note: pgwin32_signal_event should be first to ensure that it will be + * reported when multiple events are set. We want to guarantee that + * pending signals are serviced. + */ + set->handles[0] = pgwin32_signal_event; + StaticAssertStmt(WSA_INVALID_EVENT == NULL, ""); +#endif + + return set; +} + +/* + * Free a previously created WaitEventSet. + * + * Note: preferably, this shouldn't have to free any resources that could be + * inherited across an exec(). If it did, we'd likely leak those resources in + * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC + * when the FD is created. For the Windows case, we assume that the handles + * involved are non-inheritable. 
+ */ +void +FreeWaitEventSet(WaitEventSet *set) +{ +#if defined(WAIT_USE_EPOLL) + close(set->epoll_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_KQUEUE) + close(set->kqueue_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_WIN32) + WaitEvent *cur_event; + + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->events & WL_LATCH_SET) + { + /* uses the latch's HANDLE */ + } + else if (cur_event->events & WL_POSTMASTER_DEATH) + { + /* uses PostmasterHandle */ + } + else + { + /* Clean up the event object we created for the socket */ + WSAEventSelect(cur_event->fd, NULL, 0); + WSACloseEvent(set->handles[cur_event->pos + 1]); + } + } +#endif + + pfree(set); +} + +/* + * Free a previously created WaitEventSet in a child process after a fork(). + */ +void +FreeWaitEventSetAfterFork(WaitEventSet *set) +{ +#if defined(WAIT_USE_EPOLL) + close(set->epoll_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_KQUEUE) + /* kqueues are not normally inherited by child processes */ + ReleaseExternalFD(); +#endif + + pfree(set); +} + +/* --- + * Add an event to the set. Possible events are: + * - WL_LATCH_SET: Wait for the latch to be set + * - WL_POSTMASTER_DEATH: Wait for postmaster to die + * - WL_SOCKET_READABLE: Wait for socket to become readable, + * can be combined in one event with other WL_SOCKET_* events + * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable, + * can be combined with other WL_SOCKET_* events + * - WL_SOCKET_CONNECTED: Wait for socket connection to be established, + * can be combined with other WL_SOCKET_* events (on non-Windows + * platforms, this is the same as WL_SOCKET_WRITEABLE) + * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket, + * can be combined with other WL_SOCKET_* events (on non-Windows + * platforms, this is the same as WL_SOCKET_READABLE) + * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer. + * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies + * + * Returns the offset in WaitEventSet->events (starting from 0), which can be + * used to modify previously added wait events using ModifyWaitEvent(). + * + * In the WL_LATCH_SET case the latch must be owned by the current process, + * i.e. it must be a process-local latch initialized with InitLatch, or a + * shared latch associated with the current process by calling OwnLatch. + * + * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error + * conditions cause the socket to be reported as readable/writable/connected, + * so that the caller can deal with the condition. + * + * The user_data pointer specified here will be set for the events returned + * by WaitEventSetWait(), allowing to easily associate additional data with + * events. 
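 *
 * Putting the pieces together, a long-lived set might be built and used
 * roughly like this (an illustrative sketch; "sock" stands for some already
 * connected socket, and PG_WAIT_EXTENSION is just one possible
 * wait_event_info value):
 *
 *     WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
 *     WaitEvent    ev;
 *     int          sock_pos;
 *
 *     AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
 *     AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, NULL, NULL);
 *     sock_pos = AddWaitEventToSet(set, WL_SOCKET_READABLE, sock, NULL, NULL);
 *
 *     if (WaitEventSetWait(set, -1, &ev, 1, PG_WAIT_EXTENSION) > 0)
 *     {
 *         if (ev.events & WL_LATCH_SET)
 *             ResetLatch(MyLatch);
 *         if (ev.events & WL_SOCKET_READABLE)
 *             ... read from ev.fd ...
 *     }
 *
 *     To switch the socket entry to waiting for write readiness instead,
 *     reuse the position returned by AddWaitEventToSet:
 *
 *     ModifyWaitEvent(set, sock_pos, WL_SOCKET_WRITEABLE, NULL);
 *
 *     FreeWaitEventSet(set);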
+ */ +int +AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, + void *user_data) +{ + WaitEvent *event; + + /* not enough space */ + Assert(set->nevents < set->nevents_space); + + if (events == WL_EXIT_ON_PM_DEATH) + { + events = WL_POSTMASTER_DEATH; + set->exit_on_postmaster_death = true; + } + + if (latch) + { + if (latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + if (set->latch) + elog(ERROR, "cannot wait on more than one latch"); + if ((events & WL_LATCH_SET) != WL_LATCH_SET) + elog(ERROR, "latch events only support being set"); + } + else + { + if (events & WL_LATCH_SET) + elog(ERROR, "cannot wait on latch without a specified latch"); + } + + /* waiting for socket readiness without a socket indicates a bug */ + if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK)) + elog(ERROR, "cannot wait on socket event without a socket"); + + event = &set->events[set->nevents]; + event->pos = set->nevents++; + event->fd = fd; + event->events = events; + event->user_data = user_data; +#ifdef WIN32 + event->reset = false; +#endif + + if (events == WL_LATCH_SET) + { + set->latch = latch; + set->latch_pos = event->pos; +#if defined(WAIT_USE_SELF_PIPE) + event->fd = selfpipe_readfd; +#elif defined(WAIT_USE_SIGNALFD) + event->fd = signal_fd; +#else + event->fd = PGINVALID_SOCKET; +#ifdef WAIT_USE_EPOLL + return event->pos; +#endif +#endif + } + else if (events == WL_POSTMASTER_DEATH) + { +#ifndef WIN32 + event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH]; +#endif + } + + /* perform wait primitive specific initialization, if needed */ +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, 0); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif + + return event->pos; +} + +/* + * Change the event mask and, in the WL_LATCH_SET case, the latch associated + * with the WaitEvent. The latch may be changed to NULL to disable the latch + * temporarily, and then set back to a latch later. + * + * 'pos' is the id returned by AddWaitEventToSet. + */ +void +ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch) +{ + WaitEvent *event; +#if defined(WAIT_USE_KQUEUE) + int old_events; +#endif + + Assert(pos < set->nevents); + + event = &set->events[pos]; +#if defined(WAIT_USE_KQUEUE) + old_events = event->events; +#endif + + /* + * If neither the event mask nor the associated latch changes, return + * early. That's an important optimization for some sockets, where + * ModifyWaitEvent is frequently used to switch from waiting for reads to + * waiting on writes. + */ + if (events == event->events && + (!(event->events & WL_LATCH_SET) || set->latch == latch)) + return; + + if (event->events & WL_LATCH_SET && + events != event->events) + { + elog(ERROR, "cannot modify latch event"); + } + + if (event->events & WL_POSTMASTER_DEATH) + { + elog(ERROR, "cannot modify postmaster death event"); + } + + /* FIXME: validate event mask */ + event->events = events; + + if (events == WL_LATCH_SET) + { + if (latch && latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + set->latch = latch; + + /* + * On Unix, we don't need to modify the kernel object because the + * underlying pipe (if there is one) is the same for all latches so we + * can return immediately. 
On Windows, we need to update our array of + * handles, but we leave the old one in place and tolerate spurious + * wakeups if the latch is disabled. + */ +#if defined(WAIT_USE_WIN32) + if (!latch) + return; +#else + return; +#endif + } + +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, old_events); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif +} + +#if defined(WAIT_USE_EPOLL) +/* + * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL + */ +static void +WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action) +{ + struct epoll_event epoll_ev; + int rc; + + /* pointer to our event, returned by epoll_wait */ + epoll_ev.data.ptr = event; + /* always wait for errors */ + epoll_ev.events = EPOLLERR | EPOLLHUP; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + epoll_ev.events |= EPOLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + epoll_ev.events |= EPOLLIN; + } + else + { + Assert(event->fd != PGINVALID_SOCKET); + Assert(event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED)); + + if (event->events & WL_SOCKET_READABLE) + epoll_ev.events |= EPOLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + epoll_ev.events |= EPOLLOUT; + if (event->events & WL_SOCKET_CLOSED) + epoll_ev.events |= EPOLLRDHUP; + } + + /* + * Even though unused, we also pass epoll_ev as the data argument if + * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug + * requiring that, and actually it makes the code simpler... + */ + rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev); + + if (rc < 0) + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_ctl"))); +} +#endif + +#if defined(WAIT_USE_POLL) +static void +WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event) +{ + struct pollfd *pollfd = &set->pollfds[event->pos]; + + pollfd->revents = 0; + pollfd->fd = event->fd; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + pollfd->events = POLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + pollfd->events = POLLIN; + } + else + { + Assert(event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED)); + pollfd->events = 0; + if (event->events & WL_SOCKET_READABLE) + pollfd->events |= POLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + pollfd->events |= POLLOUT; +#ifdef POLLRDHUP + if (event->events & WL_SOCKET_CLOSED) + pollfd->events |= POLLRDHUP; +#endif + } + + Assert(event->fd != PGINVALID_SOCKET); +} +#endif + +#if defined(WAIT_USE_KQUEUE) + +/* + * On most BSD family systems, the udata member of struct kevent is of type + * void *, so we could directly convert to/from WaitEvent *. Unfortunately, + * NetBSD has it as intptr_t, so here we wallpaper over that difference with + * an lvalue cast. 
+ */ +#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata))) + +static inline void +WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action, + WaitEvent *event) +{ + k_ev->ident = event->fd; + k_ev->filter = filter; + k_ev->flags = action; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event) +{ + /* For now postmaster death can only be added, not removed. */ + k_ev->ident = PostmasterPid; + k_ev->filter = EVFILT_PROC; + k_ev->flags = EV_ADD; + k_ev->fflags = NOTE_EXIT; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event) +{ + /* For now latch can only be added, not removed. */ + k_ev->ident = SIGURG; + k_ev->filter = EVFILT_SIGNAL; + k_ev->flags = EV_ADD; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +/* + * old_events is the previous event mask, used to compute what has changed. + */ +static void +WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events) +{ + int rc; + struct kevent k_ev[2]; + int count = 0; + bool new_filt_read = false; + bool old_filt_read = false; + bool new_filt_write = false; + bool old_filt_write = false; + + if (old_events == event->events) + return; + + Assert(event->events != WL_LATCH_SET || set->latch != NULL); + Assert(event->events == WL_LATCH_SET || + event->events == WL_POSTMASTER_DEATH || + (event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED))); + + if (event->events == WL_POSTMASTER_DEATH) + { + /* + * Unlike all the other implementations, we detect postmaster death + * using process notification instead of waiting on the postmaster + * alive pipe. + */ + WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event); + } + else if (event->events == WL_LATCH_SET) + { + /* We detect latch wakeup using a signal event. */ + WaitEventAdjustKqueueAddLatch(&k_ev[count++], event); + } + else + { + /* + * We need to compute the adds and deletes required to get from the + * old event mask to the new event mask, since kevent treats readable + * and writable as separate events. + */ + if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED)) + old_filt_read = true; + if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED)) + new_filt_read = true; + if (old_events & WL_SOCKET_WRITEABLE) + old_filt_write = true; + if (event->events & WL_SOCKET_WRITEABLE) + new_filt_write = true; + if (old_filt_read && !new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE, + event); + else if (!old_filt_read && new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD, + event); + if (old_filt_write && !new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE, + event); + else if (!old_filt_write && new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD, + event); + } + + /* For WL_SOCKET_READ -> WL_SOCKET_CLOSED, no change needed. */ + if (count == 0) + return; + + Assert(count <= 2); + + rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL); + + /* + * When adding the postmaster's pid, we have to consider that it might + * already have exited and perhaps even been replaced by another process + * with the same pid. If so, we have to defer reporting this as an event + * until the next call to WaitEventSetWaitBlock(). 
+ */ + + if (rc < 0) + { + if (event->events == WL_POSTMASTER_DEATH && + (errno == ESRCH || errno == EACCES)) + set->report_postmaster_not_running = true; + else + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + else if (event->events == WL_POSTMASTER_DEATH && + PostmasterPid != getppid() && + !PostmasterIsAlive()) + { + /* + * The extra PostmasterIsAliveInternal() check prevents false alarms + * on systems that give a different value for getppid() while being + * traced by a debugger. + */ + set->report_postmaster_not_running = true; + } +} + +#endif + +#if defined(WAIT_USE_WIN32) +static void +WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) +{ + HANDLE *handle = &set->handles[event->pos + 1]; + + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + *handle = set->latch->event; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + *handle = PostmasterHandle; + } + else + { + int flags = FD_CLOSE; /* always check for errors/EOF */ + + if (event->events & WL_SOCKET_READABLE) + flags |= FD_READ; + if (event->events & WL_SOCKET_WRITEABLE) + flags |= FD_WRITE; + if (event->events & WL_SOCKET_CONNECTED) + flags |= FD_CONNECT; + if (event->events & WL_SOCKET_ACCEPT) + flags |= FD_ACCEPT; + + if (*handle == WSA_INVALID_EVENT) + { + *handle = WSACreateEvent(); + if (*handle == WSA_INVALID_EVENT) + elog(ERROR, "failed to create event for socket: error code %d", + WSAGetLastError()); + } + if (WSAEventSelect(event->fd, *handle, flags) != 0) + elog(ERROR, "failed to set up event for socket: error code %d", + WSAGetLastError()); + + Assert(event->fd != PGINVALID_SOCKET); + } +} +#endif + +/* + * Wait for events added to the set to happen, or until the timeout is + * reached. At most nevents occurred events are returned. + * + * If timeout = -1, block until an event occurs; if 0, check sockets for + * readiness, but don't block; if > 0, block for at most timeout milliseconds. + * + * Returns the number of events occurred, or 0 if the timeout was reached. + * + * Returned events will have the fd, pos, user_data fields set to the + * values associated with the registered event. + */ +int +WaitEventSetWait(WaitEventSet *set, long timeout, + WaitEvent *occurred_events, int nevents, + uint32 wait_event_info) +{ + int returned_events = 0; + instr_time start_time; + instr_time cur_time; + long cur_timeout = -1; + + Assert(nevents > 0); + + /* + * Initialize timeout if requested. We must record the current time so + * that we can determine the remaining timeout if interrupted. + */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + } + else + INSTR_TIME_SET_ZERO(start_time); + + pgstat_report_wait_start(wait_event_info); + +#ifndef WIN32 + waiting = true; +#else + /* Ensure that signals are serviced even if latch is already set */ + pgwin32_dispatch_queued_signals(); +#endif + while (returned_events == 0) + { + int rc; + + /* + * Check if the latch is set already. If so, leave the loop + * immediately, avoid blocking again. We don't attempt to report any + * other events that might also be satisfied. + * + * If someone sets the latch between this and the + * WaitEventSetWaitBlock() below, the setter will write a byte to the + * pipe (or signal us and the signal handler will do that), and the + * readiness routine will return immediately. + * + * On unix, If there's a pending byte in the self pipe, we'll notice + * whenever blocking. 
Only clearing the pipe in that case avoids + * having to drain it every time WaitLatchOrSocket() is used. Should + * the pipe-buffer fill up we're still ok, because the pipe is in + * nonblocking mode. It's unlikely for that to happen, because the + * self pipe isn't filled unless we're blocking (waiting = true), or + * from inside a signal handler in latch_sigurg_handler(). + * + * On windows, we'll also notice if there's a pending event for the + * latch when blocking, but there's no danger of anything filling up, + * as "Setting an event that is already set has no effect.". + * + * Note: we assume that the kernel calls involved in latch management + * will provide adequate synchronization on machines with weak memory + * ordering, so that we cannot miss seeing is_set if a notification + * has already been queued. + */ + if (set->latch && !set->latch->is_set) + { + /* about to sleep on a latch */ + set->latch->maybe_sleeping = true; + pg_memory_barrier(); + /* and recheck */ + } + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->pos = set->latch_pos; + occurred_events->user_data = + set->events[set->latch_pos].user_data; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + + /* could have been set above */ + set->latch->maybe_sleeping = false; + + break; + } + + /* + * Wait for events using the readiness primitive chosen at the top of + * this file. If -1 is returned, a timeout has occurred, if 0 we have + * to retry, everything >= 1 is the number of returned events. + */ + rc = WaitEventSetWaitBlock(set, cur_timeout, + occurred_events, nevents); + + if (set->latch) + { + Assert(set->latch->maybe_sleeping); + set->latch->maybe_sleeping = false; + } + + if (rc == -1) + break; /* timeout occurred */ + else + returned_events = rc; + + /* If we're not done, update cur_timeout for next iteration */ + if (returned_events == 0 && timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + if (cur_timeout <= 0) + break; + } + } +#ifndef WIN32 + waiting = false; +#endif + + pgstat_report_wait_end(); + + return returned_events; +} + + +#if defined(WAIT_USE_EPOLL) + +/* + * Wait using linux's epoll_wait(2). + * + * This is the preferable wait method, as several readiness notifications are + * delivered, without having to iterate through all of set->events. The return + * epoll_event struct contain a pointer to our events, making association + * easy. + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct epoll_event *cur_epoll_event; + + /* Sleep */ + rc = epoll_wait(set->epoll_fd, set->epoll_ret_events, + Min(nevents, set->nevents_space), cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_wait"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned epoll events + * until they're either all processed, or we've returned all the events + * the caller desired. 
+ */ + for (cur_epoll_event = set->epoll_ret_events; + cur_epoll_event < (set->epoll_ret_events + rc) && + returned_events < nevents; + cur_epoll_event++) + { + /* epoll's data pointer is set to the associated WaitEvent */ + cur_event = (WaitEvent *) cur_epoll_event->data.ptr; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* Drain the signalfd. */ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* + * We expect an EPOLLHUP when the remote end is closed, but + * because we don't expect the pipe to become readable or to have + * any errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED)) + { + Assert(cur_event->fd != PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP))) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if ((cur_event->events & WL_SOCKET_CLOSED) && + (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP))) + { + /* remote peer shut down, or error */ + occurred_events->events |= WL_SOCKET_CLOSED; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_KQUEUE) + +/* + * Wait using kevent(2) on BSD-family systems and macOS. + * + * For now this mirrors the epoll code, but in future it could modify the fd + * set in the same call to kevent as it uses for waiting instead of doing that + * with separate system calls. + */ +static int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct kevent *cur_kqueue_event; + struct timespec timeout; + struct timespec *timeout_p; + + if (cur_timeout < 0) + timeout_p = NULL; + else + { + timeout.tv_sec = cur_timeout / 1000; + timeout.tv_nsec = (cur_timeout % 1000) * 1000000; + timeout_p = &timeout; + } + + /* + * Report postmaster events discovered by WaitEventAdjustKqueue() or an + * earlier call to WaitEventSetWait(). 
+ */ + if (unlikely(set->report_postmaster_not_running)) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + return 1; + } + + /* Sleep */ + rc = kevent(set->kqueue_fd, NULL, 0, + set->kqueue_ret_events, + Min(nevents, set->nevents_space), + timeout_p); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned kqueue events + * until they're either all processed, or we've returned all the events + * the caller desired. + */ + for (cur_kqueue_event = set->kqueue_ret_events; + cur_kqueue_event < (set->kqueue_ret_events + rc) && + returned_events < nevents; + cur_kqueue_event++) + { + /* kevent's udata points to the associated WaitEvent */ + cur_event = AccessWaitEvent(cur_kqueue_event); + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_kqueue_event->filter == EVFILT_SIGNAL) + { + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_kqueue_event->filter == EVFILT_PROC && + (cur_kqueue_event->fflags & NOTE_EXIT) != 0) + { + /* + * The kernel will tell this kqueue object only once about the + * exit of the postmaster, so let's remember that for next time so + * that we provide level-triggered semantics. + */ + set->report_postmaster_not_running = true; + + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + else if (cur_event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED)) + { + Assert(cur_event->fd >= 0); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_kqueue_event->filter == EVFILT_READ)) + { + /* readable, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_CLOSED) && + (cur_kqueue_event->filter == EVFILT_READ) && + (cur_kqueue_event->flags & EV_EOF)) + { + /* the remote peer has shut down */ + occurred_events->events |= WL_SOCKET_CLOSED; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_kqueue_event->filter == EVFILT_WRITE)) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_POLL) + +/* + * Wait using poll(2). + * + * This allows to receive readiness notifications for several events at once, + * but requires iterating through all of set->pollfds. 
+ */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct pollfd *cur_pollfd; + + /* Sleep */ + rc = poll(set->pollfds, set->nevents, (int) cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "poll"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + for (cur_event = set->events, cur_pollfd = set->pollfds; + cur_event < (set->events + set->nevents) && + returned_events < nevents; + cur_event++, cur_pollfd++) + { + /* no activity on this FD, skip */ + if (cur_pollfd->revents == 0) + continue; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* There's data in the self-pipe, clear it. */ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* + * We expect an POLLHUP when the remote end is closed, but because + * we don't expect the pipe to become readable or to have any + * errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | + WL_SOCKET_WRITEABLE | + WL_SOCKET_CLOSED)) + { + int errflags = POLLHUP | POLLERR | POLLNVAL; + + Assert(cur_event->fd >= PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_pollfd->revents & (POLLIN | errflags))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_pollfd->revents & (POLLOUT | errflags))) + { + /* writeable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + +#ifdef POLLRDHUP + if ((cur_event->events & WL_SOCKET_CLOSED) && + (cur_pollfd->revents & (POLLRDHUP | errflags))) + { + /* remote peer closed, or error */ + occurred_events->events |= WL_SOCKET_CLOSED; + } +#endif + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + return returned_events; +} + +#elif defined(WAIT_USE_WIN32) + +/* + * Wait using Windows' WaitForMultipleObjects(). + * + * Unfortunately this will only ever return a single readiness notification at + * a time. 
Note that while the official documentation for + * WaitForMultipleObjects is ambiguous about multiple events being "consumed" + * with a single bWaitAll = FALSE call, + * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms + * that only one event is "consumed". + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + DWORD rc; + WaitEvent *cur_event; + + /* Reset any wait events that need it */ + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->reset) + { + WaitEventAdjustWin32(set, cur_event); + cur_event->reset = false; + } + + /* + * Windows does not guarantee to log an FD_WRITE network event + * indicating that more data can be sent unless the previous send() + * failed with WSAEWOULDBLOCK. While our caller might well have made + * such a call, we cannot assume that here. Therefore, if waiting for + * write-ready, force the issue by doing a dummy send(). If the dummy + * send() succeeds, assume that the socket is in fact write-ready, and + * return immediately. Also, if it fails with something other than + * WSAEWOULDBLOCK, return a write-ready indication to let our caller + * deal with the error condition. + */ + if (cur_event->events & WL_SOCKET_WRITEABLE) + { + char c; + WSABUF buf; + DWORD sent; + int r; + + buf.buf = &c; + buf.len = 0; + + r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL); + if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK) + { + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = WL_SOCKET_WRITEABLE; + occurred_events->fd = cur_event->fd; + return 1; + } + } + } + + /* + * Sleep. + * + * Need to wait for ->nevents + 1, because signal handle is in [0]. + */ + rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE, + cur_timeout); + + /* Check return code */ + if (rc == WAIT_FAILED) + elog(ERROR, "WaitForMultipleObjects() failed: error code %lu", + GetLastError()); + else if (rc == WAIT_TIMEOUT) + { + /* timeout exceeded */ + return -1; + } + + if (rc == WAIT_OBJECT_0) + { + /* Service newly-arrived signals */ + pgwin32_dispatch_queued_signals(); + return 0; /* retry */ + } + + /* + * With an offset of one, due to the always present pgwin32_signal_event, + * the handle offset directly corresponds to a wait event. + */ + cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1]; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET) + { + /* + * We cannot use set->latch->event to reset the fired event if we + * aren't waiting on this latch now. + */ + if (!ResetEvent(set->handles[cur_event->pos + 1])) + elog(ERROR, "ResetEvent failed: error code %lu", GetLastError()); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH) + { + /* + * Postmaster apparently died. Since the consequences of falsely + * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take + * the trouble to positively verify this with PostmasterIsAlive(), + * even though there is no known reason to think that the event could + * be falsely set on Windows. 
+ */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & WL_SOCKET_MASK) + { + WSANETWORKEVENTS resEvents; + HANDLE handle = set->handles[cur_event->pos + 1]; + + Assert(cur_event->fd); + + occurred_events->fd = cur_event->fd; + + ZeroMemory(&resEvents, sizeof(resEvents)); + if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0) + elog(ERROR, "failed to enumerate network events: error code %d", + WSAGetLastError()); + if ((cur_event->events & WL_SOCKET_READABLE) && + (resEvents.lNetworkEvents & FD_READ)) + { + /* data available in socket */ + occurred_events->events |= WL_SOCKET_READABLE; + + /*------ + * WaitForMultipleObjects doesn't guarantee that a read event will + * be returned if the latch is set at the same time. Even if it + * did, the caller might drop that event expecting it to reoccur + * on next call. So, we must force the event to be reset if this + * WaitEventSet is used again in order to avoid an indefinite + * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx + * for the behavior of socket events. + *------ + */ + cur_event->reset = true; + } + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (resEvents.lNetworkEvents & FD_WRITE)) + { + /* writeable */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + if ((cur_event->events & WL_SOCKET_CONNECTED) && + (resEvents.lNetworkEvents & FD_CONNECT)) + { + /* connected */ + occurred_events->events |= WL_SOCKET_CONNECTED; + } + if ((cur_event->events & WL_SOCKET_ACCEPT) && + (resEvents.lNetworkEvents & FD_ACCEPT)) + { + /* incoming connection could be accepted */ + occurred_events->events |= WL_SOCKET_ACCEPT; + } + if (resEvents.lNetworkEvents & FD_CLOSE) + { + /* EOF/error, so signal all caller-requested socket flags */ + occurred_events->events |= (cur_event->events & WL_SOCKET_MASK); + } + + if (occurred_events->events != 0) + { + occurred_events++; + returned_events++; + } + } + + return returned_events; +} +#endif + +/* + * Return whether the current build options can report WL_SOCKET_CLOSED. + */ +bool +WaitEventSetCanReportClosed(void) +{ +#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \ + defined(WAIT_USE_EPOLL) || \ + defined(WAIT_USE_KQUEUE) + return true; +#else + return false; +#endif +} + +/* + * Get the number of wait events registered in a given WaitEventSet. + */ +int +GetNumRegisteredWaitEvents(WaitEventSet *set) +{ + return set->nevents; +} + +#if defined(WAIT_USE_SELF_PIPE) + +/* + * SetLatch uses SIGURG to wake up the process waiting on the latch. + * + * Wake up WaitLatch, if we're waiting. + */ +static void +latch_sigurg_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (waiting) + sendSelfPipeByte(); + + errno = save_errno; +} + +/* Send one byte to the self-pipe, to wake up WaitLatch */ +static void +sendSelfPipeByte(void) +{ + int rc; + char dummy = 0; + +retry: + rc = write(selfpipe_writefd, &dummy, 1); + if (rc < 0) + { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + goto retry; + + /* + * If the pipe is full, we don't need to retry, the data that's there + * already is enough to wake up WaitLatch. + */ + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + + /* + * Oops, the write() failed for some other reason. We might be in a + * signal handler, so it's not safe to elog(). 
We have no choice but + * silently ignore the error. + */ + return; + } +} + +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) + +/* + * Read all available data from self-pipe or signalfd. + * + * Note: this is only called when waiting = true. If it fails and doesn't + * return, it must reset that flag first (though ideally, this will never + * happen). + */ +static void +drain(void) +{ + char buf[1024]; + int rc; + int fd; + +#ifdef WAIT_USE_SELF_PIPE + fd = selfpipe_readfd; +#else + fd = signal_fd; +#endif + + for (;;) + { + rc = read(fd, buf, sizeof(buf)); + if (rc < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; /* the descriptor is empty */ + else if (errno == EINTR) + continue; /* retry */ + else + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "read() on self-pipe failed: %m"); +#else + elog(ERROR, "read() on signalfd failed: %m"); +#endif + } + } + else if (rc == 0) + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "unexpected EOF on self-pipe"); +#else + elog(ERROR, "unexpected EOF on signalfd"); +#endif + } + else if (rc < sizeof(buf)) + { + /* we successfully drained the pipe; no need to read() again */ + break; + } + /* else buffer wasn't big enough, so read again */ + } +} + +#endif diff --git a/src/backend/storage/ipc/meson.build b/src/backend/storage/ipc/meson.build new file mode 100644 index 0000000..79a16d0 --- /dev/null +++ b/src/backend/storage/ipc/meson.build @@ -0,0 +1,21 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'barrier.c', + 'dsm.c', + 'dsm_impl.c', + 'ipc.c', + 'ipci.c', + 'latch.c', + 'pmsignal.c', + 'procarray.c', + 'procsignal.c', + 'shm_mq.c', + 'shm_toc.c', + 'shmem.c', + 'signalfuncs.c', + 'sinval.c', + 'sinvaladt.c', + 'standby.c', + +) diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c new file mode 100644 index 0000000..5dc2da6 --- /dev/null +++ b/src/backend/storage/ipc/pmsignal.c @@ -0,0 +1,462 @@ +/*------------------------------------------------------------------------- + * + * pmsignal.c + * routines for signaling between the postmaster and its child processes + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/pmsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#ifdef HAVE_SYS_PRCTL_H +#include +#endif + +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "replication/walsender.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "utils/memutils.h" + + +/* + * The postmaster is signaled by its children by sending SIGUSR1. The + * specific reason is communicated via flags in shared memory. We keep + * a boolean flag for each possible "reason", so that different reasons + * can be signaled by different backends at the same time. (However, + * if the same reason is signaled more than once simultaneously, the + * postmaster will observe it only once.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. 
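 *
 * Concretely, a child asks for an action and the postmaster picks it up
 * roughly like this (an illustrative sketch using one existing reason code;
 * the surrounding signal-handler plumbing is omitted):
 *
 *     in the child:
 *         SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
 *
 *     in the postmaster's SIGUSR1 handler:
 *         if (CheckPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER))
 *             ... start an autovacuum worker ...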
+ * + * In addition to the per-reason flags, we store a set of per-child-process + * flags that are currently used only for detecting whether a backend has + * exited without performing proper shutdown. The per-child-process flags + * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is + * available for assignment. An ASSIGNED slot is associated with a postmaster + * child process, but either the process has not touched shared memory yet, + * or it has successfully cleaned up after itself. A ACTIVE slot means the + * process is actively using shared memory. The slots are assigned to + * child processes at random, and postmaster.c is responsible for tracking + * which one goes with which PID. + * + * Actually there is a fourth state, WALSENDER. This is just like ACTIVE, + * but carries the extra information that the child is a WAL sender. + * WAL senders too start in ACTIVE state, but switch to WALSENDER once they + * start streaming the WAL (and they never go back to ACTIVE after that). + * + * We also have a shared-memory field that is used for communication in + * the opposite direction, from postmaster to children: it tells why the + * postmaster has broadcasted SIGQUIT signals, if indeed it has done so. + */ + +#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */ +#define PM_CHILD_ASSIGNED 1 +#define PM_CHILD_ACTIVE 2 +#define PM_CHILD_WALSENDER 3 + +/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */ +struct PMSignalData +{ + /* per-reason flags for signaling the postmaster */ + sig_atomic_t PMSignalFlags[NUM_PMSIGNALS]; + /* global flags for signals from postmaster to children */ + QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */ + /* per-child-process flags */ + int num_child_flags; /* # of entries in PMChildFlags[] */ + sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* PMSignalState pointer is valid in both postmaster and child processes */ +NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL; + +/* + * These static variables are valid only in the postmaster. We keep a + * duplicative private array so that we can trust its state even if some + * failing child has clobbered the PMSignalData struct in shared memory. + */ +static int num_child_inuse; /* # of entries in PMChildInUse[] */ +static int next_child_inuse; /* next slot to try to assign */ +static bool *PMChildInUse; /* true if i'th flag slot is assigned */ + +/* + * Signal handler to be notified if postmaster dies. + */ +#ifdef USE_POSTMASTER_DEATH_SIGNAL +volatile sig_atomic_t postmaster_possibly_dead = false; + +static void +postmaster_death_handler(SIGNAL_ARGS) +{ + postmaster_possibly_dead = true; +} + +/* + * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already + * used for other things, so choose another one. + * + * Currently, we assume that we can always find a signal to use. That + * seems like a reasonable assumption for all platforms that are modern + * enough to have a parent-death signaling mechanism. 
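 *
 * The cheap liveness test built on postmaster_possibly_dead (declared in
 * storage/pmsignal.h) then looks roughly like this sketch:
 *
 *     if (likely(!postmaster_possibly_dead))
 *         return true;                  (fast path, no system call)
 *     return PostmasterIsAliveInternal();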
+ */ +#if defined(SIGINFO) +#define POSTMASTER_DEATH_SIGNAL SIGINFO +#elif defined(SIGPWR) +#define POSTMASTER_DEATH_SIGNAL SIGPWR +#else +#error "cannot find a signal to use for postmaster death" +#endif + +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ + +/* + * PMSignalShmemSize + * Compute space needed for pmsignal.c's shared memory + */ +Size +PMSignalShmemSize(void) +{ + Size size; + + size = offsetof(PMSignalData, PMChildFlags); + size = add_size(size, mul_size(MaxLivePostmasterChildren(), + sizeof(sig_atomic_t))); + + return size; +} + +/* + * PMSignalShmemInit - initialize during shared-memory creation + */ +void +PMSignalShmemInit(void) +{ + bool found; + + PMSignalState = (PMSignalData *) + ShmemInitStruct("PMSignalState", PMSignalShmemSize(), &found); + + if (!found) + { + /* initialize all flags to zeroes */ + MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize()); + num_child_inuse = MaxLivePostmasterChildren(); + PMSignalState->num_child_flags = num_child_inuse; + + /* + * Also allocate postmaster's private PMChildInUse[] array. We + * might've already done that in a previous shared-memory creation + * cycle, in which case free the old array to avoid a leak. (Do it + * like this to support the possibility that MaxLivePostmasterChildren + * changed.) In a standalone backend, we do not need this. + */ + if (PostmasterContext != NULL) + { + if (PMChildInUse) + pfree(PMChildInUse); + PMChildInUse = (bool *) + MemoryContextAllocZero(PostmasterContext, + num_child_inuse * sizeof(bool)); + } + next_child_inuse = 0; + } +} + +/* + * SendPostmasterSignal - signal the postmaster from a child process + */ +void +SendPostmasterSignal(PMSignalReason reason) +{ + /* If called in a standalone backend, do nothing */ + if (!IsUnderPostmaster) + return; + /* Atomically set the proper flag */ + PMSignalState->PMSignalFlags[reason] = true; + /* Send signal to postmaster */ + kill(PostmasterPid, SIGUSR1); +} + +/* + * CheckPostmasterSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. Should be called by postmaster + * after receiving SIGUSR1. + */ +bool +CheckPostmasterSignal(PMSignalReason reason) +{ + /* Careful here --- don't clear flag if we haven't seen it set */ + if (PMSignalState->PMSignalFlags[reason]) + { + PMSignalState->PMSignalFlags[reason] = false; + return true; + } + return false; +} + +/* + * SetQuitSignalReason - broadcast the reason for a system shutdown. + * Should be called by postmaster before sending SIGQUIT to children. + * + * Note: in a crash-and-restart scenario, the "reason" field gets cleared + * as a part of rebuilding shared memory; the postmaster need not do it + * explicitly. + */ +void +SetQuitSignalReason(QuitSignalReason reason) +{ + PMSignalState->sigquit_reason = reason; +} + +/* + * GetQuitSignalReason - obtain the reason for a system shutdown. + * Called by child processes when they receive SIGQUIT. + * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT. + */ +QuitSignalReason +GetQuitSignalReason(void) +{ + /* This is called in signal handlers, so be extra paranoid. */ + if (!IsUnderPostmaster || PMSignalState == NULL) + return PMQUIT_NOT_SENT; + return PMSignalState->sigquit_reason; +} + + +/* + * AssignPostmasterChildSlot - select an unused slot for a new postmaster + * child process, and set its state to ASSIGNED. Returns a slot number + * (one to N). + * + * Only the postmaster is allowed to execute this routine, so we need no + * special locking. 
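 *
 * The overall slot lifecycle is roughly as follows (an illustrative sketch;
 * the child learns its slot number via MyPMChildSlot, which postmaster.c
 * sets up when forking):
 *
 *     in the postmaster, before forking:
 *         int slot = AssignPostmasterChildSlot();
 *
 *     in the child, once it starts touching shared memory:
 *         MarkPostmasterChildActive();
 *         ... and MarkPostmasterChildInactive() during normal shutdown ...
 *
 *     back in the postmaster, after the child exits:
 *         if (!ReleasePostmasterChildSlot(slot))
 *             ... treat it as a crash, since the child did not clean up ...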
+ */ +int +AssignPostmasterChildSlot(void) +{ + int slot = next_child_inuse; + int n; + + /* + * Scan for a free slot. Notice that we trust nothing about the contents + * of PMSignalState, but use only postmaster-local data for this decision. + * We track the last slot assigned so as not to waste time repeatedly + * rescanning low-numbered slots. + */ + for (n = num_child_inuse; n > 0; n--) + { + if (--slot < 0) + slot = num_child_inuse - 1; + if (!PMChildInUse[slot]) + { + PMChildInUse[slot] = true; + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; + next_child_inuse = slot; + return slot + 1; + } + } + + /* Out of slots ... should never happen, else postmaster.c messed up */ + elog(FATAL, "no free slots in PMChildFlags array"); + return 0; /* keep compiler quiet */ +} + +/* + * ReleasePostmasterChildSlot - release a slot after death of a postmaster + * child process. This must be called in the postmaster process. + * + * Returns true if the slot had been in ASSIGNED state (the expected case), + * false otherwise (implying that the child failed to clean itself up). + */ +bool +ReleasePostmasterChildSlot(int slot) +{ + bool result; + + Assert(slot > 0 && slot <= num_child_inuse); + slot--; + + /* + * Note: the slot state might already be unused, because the logic in + * postmaster.c is such that this might get called twice when a child + * crashes. So we don't try to Assert anything about the state. + */ + result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED; + PMChildInUse[slot] = false; + return result; +} + +/* + * IsPostmasterChildWalSender - check if given slot is in use by a + * walsender process. This is called only by the postmaster. + */ +bool +IsPostmasterChildWalSender(int slot) +{ + Assert(slot > 0 && slot <= num_child_inuse); + slot--; + + if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER) + return true; + else + return false; +} + +/* + * MarkPostmasterChildActive - mark a postmaster child as about to begin + * actively using shared memory. This is called in the child process. + */ +void +MarkPostmasterChildActive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE; +} + +/* + * MarkPostmasterChildWalSender - mark a postmaster child as a WAL sender + * process. This is called in the child process, sometime after marking the + * child as active. + */ +void +MarkPostmasterChildWalSender(void) +{ + int slot = MyPMChildSlot; + + Assert(am_walsender); + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE); + PMSignalState->PMChildFlags[slot] = PM_CHILD_WALSENDER; +} + +/* + * MarkPostmasterChildInactive - mark a postmaster child as done using + * shared memory. This is called in the child process. + */ +void +MarkPostmasterChildInactive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE || + PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; +} + + +/* + * PostmasterIsAliveInternal - check whether postmaster process is still alive + * + * This is the slow path of PostmasterIsAlive(), where the caller has already + * checked 'postmaster_possibly_dead'. 
(On platforms that don't support + * a signal for parent death, PostmasterIsAlive() is just an alias for this.) + */ +bool +PostmasterIsAliveInternal(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + /* + * Reset the flag before checking, so that we don't miss a signal if + * postmaster dies right after the check. If postmaster was indeed dead, + * we'll re-arm it before returning to caller. + */ + postmaster_possibly_dead = false; +#endif + +#ifndef WIN32 + { + char c; + ssize_t rc; + + rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1); + + /* + * In the usual case, the postmaster is still alive, and there is no + * data in the pipe. + */ + if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + return true; + else + { + /* + * Postmaster is dead, or something went wrong with the read() + * call. + */ + +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + + if (rc < 0) + elog(FATAL, "read on postmaster death monitoring pipe failed: %m"); + else if (rc > 0) + elog(FATAL, "unexpected data in postmaster death monitoring pipe"); + + return false; + } + } + +#else /* WIN32 */ + if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT) + return true; + else + { +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + return false; + } +#endif /* WIN32 */ +} + +/* + * PostmasterDeathSignalInit - request signal on postmaster death if possible + */ +void +PostmasterDeathSignalInit(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + int signum = POSTMASTER_DEATH_SIGNAL; + + /* Register our signal handler. */ + pqsignal(signum, postmaster_death_handler); + + /* Request a signal on parent exit. */ +#if defined(PR_SET_PDEATHSIG) + if (prctl(PR_SET_PDEATHSIG, signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#elif defined(PROC_PDEATHSIG_CTL) + if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#else +#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal" +#endif + + /* + * Just in case the parent was gone already and we missed it, we'd better + * check the slow way on the first call. + */ + postmaster_possibly_dead = true; +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c new file mode 100644 index 0000000..e630302 --- /dev/null +++ b/src/backend/storage/ipc/procarray.c @@ -0,0 +1,5224 @@ +/*------------------------------------------------------------------------- + * + * procarray.c + * POSTGRES process array code. + * + * + * This module maintains arrays of PGPROC substructures, as well as associated + * arrays in ProcGlobal, for all active backends. Although there are several + * uses for this, the principal one is as a means of determining the set of + * currently running transactions. + * + * Because of various subtle race conditions it is critical that a backend + * hold the correct locks while setting or clearing its xid (in + * ProcGlobal->xids[]/MyProc->xid). See notes in + * src/backend/access/transam/README. + * + * The process arrays now also include structures representing prepared + * transactions. The xid and subxids fields of these are valid, as are the + * myProcLocks lists. They can be distinguished from regular backend PGPROCs + * at need by checking for pid == 0. 
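The non-Windows branch of PostmasterIsAliveInternal() relies on a classic pipe trick: the parent keeps the write end of a pipe open and never writes to it, so a non-blocking read() on the read end returns EAGAIN while the parent lives and 0 (EOF) once every copy of the write end is gone. A small timing-based demo of the same idea (toy code, not the patch's implementation):

#include <errno.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static bool parent_is_alive(int watch_fd)
{
    char c;
    ssize_t rc = read(watch_fd, &c, 1);

    /* no data but the write end is still open somewhere: parent alive */
    if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
        return true;
    /* rc == 0 means EOF: every copy of the write end has been closed */
    return false;
}

int main(void)
{
    int fds[2];

    if (pipe(fds) != 0)
        return 1;

    if (fork() == 0)
    {
        /* child: keep only the non-blocking read end */
        close(fds[1]);
        fcntl(fds[0], F_SETFL, O_NONBLOCK);
        printf("parent alive? %d\n", parent_is_alive(fds[0]));  /* 1 */
        sleep(2);               /* let the parent exit */
        printf("parent alive? %d\n", parent_is_alive(fds[0]));  /* 0 */
        return 0;
    }

    /* parent: hold the write end open for a while, then exit */
    close(fds[0]);
    sleep(1);
    return 0;
}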
+ * + * During hot standby, we also keep a list of XIDs representing transactions + * that are known to be running on the primary (or more precisely, were running + * as of the current point in the WAL stream). This list is kept in the + * KnownAssignedXids array, and is updated by watching the sequence of + * arriving XIDs. This is necessary because if we leave those XIDs out of + * snapshots taken for standby queries, then they will appear to be already + * complete, leading to MVCC failures. Note that in hot standby, the PGPROC + * array represents standby processes, which by definition are not running + * transactions that have XIDs. + * + * It is perhaps possible for a backend on the primary to terminate without + * writing an abort record for its transaction. While that shouldn't really + * happen, it would tie up KnownAssignedXids indefinitely, so we protect + * ourselves by pruning the array when a valid list of running XIDs arrives. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/procarray.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlogutils.h" +#include "catalog/catalog.h" +#include "catalog/pg_authid.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/pg_lfind.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + +/* Our shared memory area */ +typedef struct ProcArrayStruct +{ + int numProcs; /* number of valid procs entries */ + int maxProcs; /* allocated size of procs array */ + + /* + * Known assigned XIDs handling + */ + int maxKnownAssignedXids; /* allocated size of array */ + int numKnownAssignedXids; /* current # of valid entries */ + int tailKnownAssignedXids; /* index of oldest valid element */ + int headKnownAssignedXids; /* index of newest element, + 1 */ + slock_t known_assigned_xids_lck; /* protects head/tail pointers */ + + /* + * Highest subxid that has been removed from KnownAssignedXids array to + * prevent overflow; or InvalidTransactionId if none. We track this for + * similar reasons to tracking overflowing cached subxids in PGPROC + * entries. Must hold exclusive ProcArrayLock to change this, and shared + * lock to read it. + */ + TransactionId lastOverflowedXid; + + /* oldest xmin of any replication slot */ + TransactionId replication_slot_xmin; + /* oldest catalog xmin of any replication slot */ + TransactionId replication_slot_catalog_xmin; + + /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */ + int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; +} ProcArrayStruct; + +/* + * State for the GlobalVisTest* family of functions. Those functions can + * e.g. be used to decide if a deleted row can be removed without violating + * MVCC semantics: If the deleted row's xmax is not considered to be running + * by anyone, the row can be removed. 
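UINT32_ACCESS_ONCE, defined above, forces a single read of a shared variable by going through a volatile-qualified pointer, so the compiler cannot re-read (or cache) the value within one inspection. A standalone sketch of the same idiom, using standard uint32_t in place of the backend's uint32 type:

#include <stdint.h>
#include <stdio.h>

#define UINT32_ACCESS_ONCE(var) ((uint32_t) (*((volatile uint32_t *) &(var))))

static uint32_t shared_xid = 42;    /* stands in for a shared xid slot */

int main(void)
{
    /* take one snapshot of the value and use that copy consistently */
    uint32_t xid = UINT32_ACCESS_ONCE(shared_xid);

    if (xid != 0)
        printf("observed xid %u\n", (unsigned) xid);
    return 0;
}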
+ * + * To avoid slowing down GetSnapshotData(), we don't calculate a precise + * cutoff XID while building a snapshot (looking at the frequently changing + * xmins scales badly). Instead we compute two boundaries while building the + * snapshot: + * + * 1) definitely_needed, indicating that rows deleted by XIDs >= + * definitely_needed are definitely still visible. + * + * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can + * definitely be removed + * + * When testing an XID that falls in between the two (i.e. XID >= maybe_needed + * && XID < definitely_needed), the boundaries can be recomputed (using + * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than + * maintaining an accurate value all the time. + * + * As it is not cheap to compute accurate boundaries, we limit the number of + * times that happens in short succession. See GlobalVisTestShouldUpdate(). + * + * + * There are three backend lifetime instances of this struct, optimized for + * different types of relations. As e.g. a normal user defined table in one + * database is inaccessible to backends connected to another database, a test + * specific to a relation can be more aggressive than a test for a shared + * relation. Currently we track four different states: + * + * 1) GlobalVisSharedRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in any database, nor a + * replication slot's xmin, nor a replication slot's catalog_xmin might + * still consider XID as running. + * + * 2) GlobalVisCatalogRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin, nor a replication slot's + * catalog_xmin might still consider XID as running. + * + * I.e. the difference to GlobalVisSharedRels is that + * snapshot in other databases are ignored. + * + * 3) GlobalVisDataRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin consider XID as running. + * + * I.e. the difference to GlobalVisCatalogRels is that + * replication slot's catalog_xmin is not taken into account. + * + * 4) GlobalVisTempRels, which only considers the current session, as temp + * tables are not visible to other sessions. + * + * GlobalVisTestFor(relation) returns the appropriate state + * for the relation. + * + * The boundaries are FullTransactionIds instead of TransactionIds to avoid + * wraparound dangers. There e.g. would otherwise exist no procarray state to + * prevent maybe_needed to become old enough after the GetSnapshotData() + * call. + * + * The typedef is in the header. + */ +struct GlobalVisState +{ + /* XIDs >= are considered running by some backend */ + FullTransactionId definitely_needed; + + /* XIDs < are not considered to be running by any backend */ + FullTransactionId maybe_needed; +}; + +/* + * Result of ComputeXidHorizons(). + */ +typedef struct ComputeXidHorizonsResult +{ + /* + * The value of ShmemVariableCache->latestCompletedXid when + * ComputeXidHorizons() held ProcArrayLock. + */ + FullTransactionId latest_completed; + + /* + * The same for procArray->replication_slot_xmin and. + * procArray->replication_slot_catalog_xmin. + */ + TransactionId slot_xmin; + TransactionId slot_catalog_xmin; + + /* + * Oldest xid that any backend might still consider running. 
This needs to + * include processes running VACUUM, in contrast to the normal visibility + * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when + * determining visibility, but doesn't care about rows above its xmin to + * be removed. + * + * This likely should only be needed to determine whether pg_subtrans can + * be truncated. It currently includes the effects of replication slots, + * for historical reasons. But that could likely be changed. + */ + TransactionId oldest_considered_running; + + /* + * Oldest xid for which deleted tuples need to be retained in shared + * tables. + * + * This includes the effects of replication slots. If that's not desired, + * look at shared_oldest_nonremovable_raw; + */ + TransactionId shared_oldest_nonremovable; + + /* + * Oldest xid that may be necessary to retain in shared tables. This is + * the same as shared_oldest_nonremovable, except that is not affected by + * replication slot's catalog_xmin. + * + * This is mainly useful to be able to send the catalog_xmin to upstream + * streaming replication servers via hot_standby_feedback, so they can + * apply the limit only when accessing catalog tables. + */ + TransactionId shared_oldest_nonremovable_raw; + + /* + * Oldest xid for which deleted tuples need to be retained in non-shared + * catalog tables. + */ + TransactionId catalog_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in normal user + * defined tables. + */ + TransactionId data_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in this + * session's temporary tables. + */ + TransactionId temp_oldest_nonremovable; +} ComputeXidHorizonsResult; + +/* + * Return value for GlobalVisHorizonKindForRel(). + */ +typedef enum GlobalVisHorizonKind +{ + VISHORIZON_SHARED, + VISHORIZON_CATALOG, + VISHORIZON_DATA, + VISHORIZON_TEMP +} GlobalVisHorizonKind; + +/* + * Reason codes for KnownAssignedXidsCompress(). + */ +typedef enum KAXCompressReason +{ + KAX_NO_SPACE, /* need to free up space at array end */ + KAX_PRUNE, /* we just pruned old entries */ + KAX_TRANSACTION_END, /* we just committed/removed some XIDs */ + KAX_STARTUP_PROCESS_IDLE /* startup process is about to sleep */ +} KAXCompressReason; + + +static ProcArrayStruct *procArray; + +static PGPROC *allProcs; + +/* + * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() + */ +static TransactionId cachedXidIsNotInProgress = InvalidTransactionId; + +/* + * Bookkeeping for tracking emulated transactions in recovery + */ +static TransactionId *KnownAssignedXids; +static bool *KnownAssignedXidsValid; +static TransactionId latestObservedXid = InvalidTransactionId; + +/* + * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is + * the highest xid that might still be running that we don't have in + * KnownAssignedXids. + */ +static TransactionId standbySnapshotPendingXmin; + +/* + * State for visibility checks on different types of relations. See struct + * GlobalVisState for details. As shared, catalog, normal and temporary + * relations can have different horizons, one such state exists for each. + */ +static GlobalVisState GlobalVisSharedRels; +static GlobalVisState GlobalVisCatalogRels; +static GlobalVisState GlobalVisDataRels; +static GlobalVisState GlobalVisTempRels; + +/* + * This backend's RecentXmin at the last time the accurate xmin horizon was + * recomputed, or InvalidTransactionId if it has not. 
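The GlobalVisState comments above describe a three-way test: XIDs below maybe_needed are certainly removable, XIDs at or above definitely_needed are certainly still needed, and anything in between would prompt a recomputation of the horizons. A toy version of that decision, with plain 64-bit integers standing in for FullTransactionIds and invented names:

#include <stdint.h>
#include <stdio.h>

typedef enum { DEFINITELY_REMOVABLE, DEFINITELY_NEEDED, NEED_RECOMPUTE } VisAnswer;

typedef struct
{
    uint64_t maybe_needed;      /* xids < this are removable */
    uint64_t definitely_needed; /* xids >= this are still needed */
} ToyVisState;

static VisAnswer toy_test_is_removable(const ToyVisState *state, uint64_t xid)
{
    if (xid < state->maybe_needed)
        return DEFINITELY_REMOVABLE;
    if (xid >= state->definitely_needed)
        return DEFINITELY_NEEDED;
    return NEED_RECOMPUTE;      /* caller would refresh the horizons here */
}

int main(void)
{
    ToyVisState state = {.maybe_needed = 100, .definitely_needed = 140};

    printf("%d %d %d\n",
           toy_test_is_removable(&state, 50),   /* removable */
           toy_test_is_removable(&state, 120),  /* recompute */
           toy_test_is_removable(&state, 200)); /* still needed */
    return 0;
}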
Used to limit how many + * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate(). + */ +static TransactionId ComputeXidHorizonsResultLastXmin; + +#ifdef XIDCACHE_DEBUG + +/* counters for XidCache measurement */ +static long xc_by_recent_xmin = 0; +static long xc_by_known_xact = 0; +static long xc_by_my_xact = 0; +static long xc_by_latest_xid = 0; +static long xc_by_main_xid = 0; +static long xc_by_child_xid = 0; +static long xc_by_known_assigned = 0; +static long xc_no_overflow = 0; +static long xc_slow_answer = 0; + +#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++) +#define xc_by_known_xact_inc() (xc_by_known_xact++) +#define xc_by_my_xact_inc() (xc_by_my_xact++) +#define xc_by_latest_xid_inc() (xc_by_latest_xid++) +#define xc_by_main_xid_inc() (xc_by_main_xid++) +#define xc_by_child_xid_inc() (xc_by_child_xid++) +#define xc_by_known_assigned_inc() (xc_by_known_assigned++) +#define xc_no_overflow_inc() (xc_no_overflow++) +#define xc_slow_answer_inc() (xc_slow_answer++) + +static void DisplayXidCache(void); +#else /* !XIDCACHE_DEBUG */ + +#define xc_by_recent_xmin_inc() ((void) 0) +#define xc_by_known_xact_inc() ((void) 0) +#define xc_by_my_xact_inc() ((void) 0) +#define xc_by_latest_xid_inc() ((void) 0) +#define xc_by_main_xid_inc() ((void) 0) +#define xc_by_child_xid_inc() ((void) 0) +#define xc_by_known_assigned_inc() ((void) 0) +#define xc_no_overflow_inc() ((void) 0) +#define xc_slow_answer_inc() ((void) 0) +#endif /* XIDCACHE_DEBUG */ + +/* Primitives for KnownAssignedXids array handling for standby */ +static void KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock); +static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock); +static bool KnownAssignedXidsSearch(TransactionId xid, bool remove); +static bool KnownAssignedXidExists(TransactionId xid); +static void KnownAssignedXidsRemove(TransactionId xid); +static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids); +static void KnownAssignedXidsRemovePreceding(TransactionId removeXid); +static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax); +static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, + TransactionId *xmin, + TransactionId xmax); +static TransactionId KnownAssignedXidsGetOldestXmin(void); +static void KnownAssignedXidsDisplay(int trace_level); +static void KnownAssignedXidsReset(void); +static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); +static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static void MaintainLatestCompletedXid(TransactionId latestXid); +static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); + +static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, + TransactionId xid); +static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); + +/* + * Report shared-memory space needed by CreateSharedProcArray. + */ +Size +ProcArrayShmemSize(void) +{ + Size size; + + /* Size of the ProcArray structure itself */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) + + size = offsetof(ProcArrayStruct, pgprocnos); + size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS)); + + /* + * During Hot Standby processing we have a data structure called + * KnownAssignedXids, created in shared memory. Local data structures are + * also created in various backends during GetSnapshotData(), + * TransactionIdIsInProgress() and GetRunningTransactionData(). 
All of the + * main structures created in those functions must be identically sized, + * since we may at times copy the whole of the data structures around. We + * refer to this size as TOTAL_MAX_CACHED_SUBXIDS. + * + * Ideally we'd only create this structure if we were actually doing hot + * standby in the current run, but we don't know that yet at the time + * shared memory is being set up. + */ +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + + if (EnableHotStandby) + { + size = add_size(size, + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS)); + size = add_size(size, + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); + } + + return size; +} + +/* + * Initialize the shared PGPROC array during postmaster startup. + */ +void +CreateSharedProcArray(void) +{ + bool found; + + /* Create or attach to the ProcArray shared structure */ + procArray = (ProcArrayStruct *) + ShmemInitStruct("Proc Array", + add_size(offsetof(ProcArrayStruct, pgprocnos), + mul_size(sizeof(int), + PROCARRAY_MAXPROCS)), + &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + procArray->numProcs = 0; + procArray->maxProcs = PROCARRAY_MAXPROCS; + procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS; + procArray->numKnownAssignedXids = 0; + procArray->tailKnownAssignedXids = 0; + procArray->headKnownAssignedXids = 0; + SpinLockInit(&procArray->known_assigned_xids_lck); + procArray->lastOverflowedXid = InvalidTransactionId; + procArray->replication_slot_xmin = InvalidTransactionId; + procArray->replication_slot_catalog_xmin = InvalidTransactionId; + ShmemVariableCache->xactCompletionCount = 1; + } + + allProcs = ProcGlobal->allProcs; + + /* Create or attach to the KnownAssignedXids arrays too, if needed */ + if (EnableHotStandby) + { + KnownAssignedXids = (TransactionId *) + ShmemInitStruct("KnownAssignedXids", + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS), + &found); + KnownAssignedXidsValid = (bool *) + ShmemInitStruct("KnownAssignedXidsValid", + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), + &found); + } +} + +/* + * Add the specified PGPROC to the shared array. + */ +void +ProcArrayAdd(PGPROC *proc) +{ + ProcArrayStruct *arrayP = procArray; + int index; + int movecount; + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + if (arrayP->numProcs >= arrayP->maxProcs) + { + /* + * Oops, no room. (This really shouldn't happen, since there is a + * fixed supply of PGPROC structs too, and so we should have failed + * earlier.) + */ + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + + /* + * Keep the procs array sorted by (PGPROC *) so that we can utilize + * locality of references much better. This is useful while traversing the + * ProcArray because there is an increased likelihood of finding the next + * PGPROC structure in the cache. 
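ProcArrayShmemSize() above builds its result with add_size()/mul_size(), the backend's overflow-checked size arithmetic (defined elsewhere in the tree). A standalone sketch of the same idea, where the toy versions simply abort instead of reporting an error:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static size_t toy_add_size(size_t a, size_t b)
{
    size_t result = a + b;

    if (result < a)                     /* wrapped around */
    {
        fprintf(stderr, "size overflow\n");
        abort();
    }
    return result;
}

static size_t toy_mul_size(size_t a, size_t b)
{
    if (a != 0 && b > SIZE_MAX / a)     /* product would wrap around */
    {
        fprintf(stderr, "size overflow\n");
        abort();
    }
    return a * b;
}

int main(void)
{
    size_t nslots = 1024;
    size_t size = toy_add_size(64, toy_mul_size(nslots, sizeof(int)));

    printf("need %zu bytes\n", size);
    return 0;
}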
+ * + * Since the occurrence of adding/removing a proc is much lower than the + * access to the ProcArray itself, the overhead should be marginal + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index); + + /* If we have found our right position in the array, break */ + if (arrayP->pgprocnos[index] > proc->pgprocno) + break; + } + + movecount = arrayP->numProcs - index; + memmove(&arrayP->pgprocnos[index + 1], + &arrayP->pgprocnos[index], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index + 1], + &ProcGlobal->xids[index], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index + 1], + &ProcGlobal->subxidStates[index], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[index + 1], + &ProcGlobal->statusFlags[index], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[index] = proc->pgprocno; + proc->pgxactoff = index; + ProcGlobal->xids[index] = proc->xid; + ProcGlobal->subxidStates[index] = proc->subxidStatus; + ProcGlobal->statusFlags[index] = proc->statusFlags; + + arrayP->numProcs++; + + /* adjust pgxactoff for all following PGPROCs */ + index++; + for (; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index - 1); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + +/* + * Remove the specified PGPROC from the shared array. + * + * When latestXid is a valid XID, we are removing a live 2PC gxact from the + * array, and thus causing it to appear as "not running" anymore. In this + * case we must advance latestCompletedXid. (This is essentially the same + * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take + * the ProcArrayLock only once, and don't damage the content of the PGPROC; + * twophase.c depends on the latter.) 
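ProcArrayAdd() above keeps pgprocnos[] sorted: it finds the insertion point, memmove()s the tail up by one, stores the new entry, and then fixes the back-pointer (pgxactoff) of every entry that moved. A toy reconstruction of that shape, with invented structures rather than the patch's code:

#include <stdio.h>
#include <string.h>

#define MAXPROCS 8

typedef struct { int offset; } ToyProc;

static int procnos[MAXPROCS];   /* sorted indexes into procs[] */
static int nprocs;
static ToyProc procs[MAXPROCS];

static void toy_array_add(int procno)
{
    int index;

    /* find the first slot whose entry sorts after the new one */
    for (index = 0; index < nprocs; index++)
        if (procnos[index] > procno)
            break;

    /* shift the tail up by one and insert */
    memmove(&procnos[index + 1], &procnos[index],
            (nprocs - index) * sizeof(procnos[0]));
    procnos[index] = procno;
    procs[procno].offset = index;
    nprocs++;

    /* everything after the insertion point moved by one slot */
    for (index++; index < nprocs; index++)
        procs[procnos[index]].offset = index;
}

int main(void)
{
    toy_array_add(5);
    toy_array_add(2);
    toy_array_add(7);
    for (int i = 0; i < nprocs; i++)
        printf("slot %d -> proc %d (offset %d)\n",
               i, procnos[i], procs[procnos[i]].offset);
    return 0;
}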
+ */ +void +ProcArrayRemove(PGPROC *proc, TransactionId latestXid) +{ + ProcArrayStruct *arrayP = procArray; + int myoff; + int movecount; + +#ifdef XIDCACHE_DEBUG + /* dump stats at backend shutdown, but not prepared-xact end */ + if (proc->pid != 0) + DisplayXidCache(); +#endif + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + myoff = proc->pgxactoff; + + Assert(myoff >= 0 && myoff < arrayP->numProcs); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + + if (TransactionIdIsValid(latestXid)) + { + Assert(TransactionIdIsValid(ProcGlobal->xids[myoff])); + + /* Advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + ProcGlobal->xids[myoff] = InvalidTransactionId; + ProcGlobal->subxidStates[myoff].overflowed = false; + ProcGlobal->subxidStates[myoff].count = 0; + } + else + { + /* Shouldn't be trying to remove a live transaction here */ + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + } + + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(ProcGlobal->subxidStates[myoff].count == 0); + Assert(ProcGlobal->subxidStates[myoff].overflowed == false); + + ProcGlobal->statusFlags[myoff] = 0; + + /* Keep the PGPROC array sorted. See notes above */ + movecount = arrayP->numProcs - myoff - 1; + memmove(&arrayP->pgprocnos[myoff], + &arrayP->pgprocnos[myoff + 1], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[myoff], + &ProcGlobal->xids[myoff + 1], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[myoff], + &ProcGlobal->subxidStates[myoff + 1], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[myoff], + &ProcGlobal->statusFlags[myoff + 1], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ + arrayP->numProcs--; + + /* + * Adjust pgxactoff of following procs for removed PGPROC (note that + * numProcs already has been decremented). + */ + for (int index = myoff; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff - 1 == index); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + + +/* + * ProcArrayEndTransaction -- mark a transaction as no longer running + * + * This is used interchangeably for commit and abort cases. The transaction + * commit/abort must already be reported to WAL and pg_xact. + * + * proc is currently always MyProc, but we pass it explicitly for flexibility. + * latestXid is the latest Xid among the transaction's main XID and + * subtransactions, or InvalidTransactionId if it has no XID. (We must ask + * the caller to pass latestXid, instead of computing it from the PGPROC's + * contents, because the subxid information in the PGPROC might be + * incomplete.) 
+ */ +void +ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) +{ + if (TransactionIdIsValid(latestXid)) + { + /* + * We must lock ProcArrayLock while clearing our advertised XID, so + * that we do not exit the set of "running" transactions while someone + * else is taking a snapshot. See discussion in + * src/backend/access/transam/README. + */ + Assert(TransactionIdIsValid(proc->xid)); + + /* + * If we can immediately acquire ProcArrayLock, we clear our own XID + * and release the lock. If not, use group XID clearing to improve + * efficiency. + */ + if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) + { + ProcArrayEndTransactionInternal(proc, latestXid); + LWLockRelease(ProcArrayLock); + } + else + ProcArrayGroupClearXid(proc, latestXid); + } + else + { + /* + * If we have no XID, we don't need to lock, since we won't affect + * anyone else's calculation of a snapshot. We might change their + * estimate of global xmin, but that's OK. + */ + Assert(!TransactionIdIsValid(proc->xid)); + Assert(proc->subxidStatus.count == 0); + Assert(!proc->subxidStatus.overflowed); + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure this is cleared in abort */ + proc->delayChkptFlags = 0; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + Assert(!LWLockHeldByMe(ProcArrayLock)); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]); + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + LWLockRelease(ProcArrayLock); + } + } +} + +/* + * Mark a write transaction as no longer running. + * + * We don't do any locking here; caller must handle that. + */ +static inline void +ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) +{ + int pgxactoff = proc->pgxactoff; + + /* + * Note: we need exclusive lock here because we're going to change other + * processes' PGPROC entries. 
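ProcArrayEndTransaction() above uses a "try the lock, otherwise take the batched slow path" shape: when LWLockConditionalAcquire() succeeds the backend clears its XID directly, and when it fails the backend joins the group-clearing list instead of queueing for the lock. A sketch of that shape with a pthread mutex standing in for the LWLock and invented names:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t toy_lock = PTHREAD_MUTEX_INITIALIZER;

static void clear_my_xid_locked(void)   /* stands in for the direct path */
{
    puts("cleared xid directly");
}

static void join_group_clear(void)      /* stands in for the group path */
{
    puts("queued for group clearing");
}

static void toy_end_transaction(void)
{
    if (pthread_mutex_trylock(&toy_lock) == 0)
    {
        clear_my_xid_locked();
        pthread_mutex_unlock(&toy_lock);
    }
    else
        join_group_clear();
}

int main(void)
{
    toy_end_transaction();              /* lock is free: fast path */

    pthread_mutex_lock(&toy_lock);      /* simulate contention */
    toy_end_transaction();              /* falls back to the group path */
    pthread_mutex_unlock(&toy_lock);
    return 0;
}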
+ */ + Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); + Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); + Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure this is cleared in abort */ + proc->delayChkptFlags = 0; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + } + + /* Clear the subtransaction-XID cache too while holding the lock */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; +} + +/* + * ProcArrayGroupClearXid -- group XID clearing + * + * When we cannot immediately acquire ProcArrayLock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * cleared. The first process to add itself to the list will acquire + * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal + * on behalf of all group members. This avoids a great deal of contention + * around ProcArrayLock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + */ +static void +ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) +{ + PROC_HDR *procglobal = ProcGlobal; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID to clear. */ + Assert(TransactionIdIsValid(proc->xid)); + + /* Add ourselves to the list of processes needing a group XID clear. */ + proc->procArrayGroupMember = true; + proc->procArrayGroupMemberXid = latestXid; + nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst); + while (true) + { + pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will clear our XID. It is + * impossible to have followers without a leader because the first process + * that has added itself to the list will always have nextidx as + * INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader clears our XID. */ + pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->procArrayGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return; + } + + /* We are the leader. 
Acquire the lock on behalf of everyone. */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID clearing, saving a pointer to the head of the list. Trying + * to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and clear all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[nextidx]; + + ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(ProcArrayLock); + + /* + * Now that we've released the lock, go back and wake everybody up. We + * don't do this under the lock so as to keep lock hold times to a + * minimum. The system calls we need to perform to wake other processes + * up are probably much slower than the simple memory writes we did while + * holding the lock. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + nextproc->procArrayGroupMember = false; + + if (nextproc != MyProc) + PGSemaphoreUnlock(nextproc->sem); + } +} + +/* + * ProcArrayClearTransaction -- clear the transaction fields + * + * This is used after successfully preparing a 2-phase transaction. We are + * not actually reporting the transaction's XID as no longer running --- it + * will still appear as running because the 2PC's gxact is in the ProcArray + * too. We just have to clear out our own PGPROC. + */ +void +ProcArrayClearTransaction(PGPROC *proc) +{ + int pgxactoff; + + /* + * Currently we need to lock ProcArrayLock exclusively here, as we + * increment xactCompletionCount below. We also need it at least in shared + * mode for pgproc->pgxactoff to stay the same below. + * + * We could however, as this action does not actually change anyone's view + * of the set of running XIDs (our entry is duplicate with the gxact that + * has already been inserted into the ProcArray), lower the lock level to + * shared if we were to make xactCompletionCount an atomic variable. But + * that doesn't seem worth it currently, as a 2PC commit is heavyweight + * enough for this not to be the bottleneck. If it ever becomes a + * bottleneck it may also be worth considering to combine this with the + * subsequent ProcArrayRemove() + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pgxactoff = proc->pgxactoff; + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + proc->recoveryConflictPending = false; + + Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); + Assert(!proc->delayChkptFlags); + + /* + * Need to increment completion count even though transaction hasn't + * really committed yet. The reason for that is that GetSnapshotData() + * omits the xid of the current transaction, thus without the increment we + * otherwise could end up reusing the snapshot later. 
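Joining the group-clear list above is a lock-free push: each backend publishes its "next" link and then tries to swing the shared list head to itself with a compare-and-swap, retrying if the head changed underneath it; the backend that observed the invalid (empty) head becomes the leader. A minimal C11-atomics sketch of that push, using an index-based list, invented names, and a single-threaded driver just to show the mechanics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define INVALID_SLOT UINT32_MAX
#define NSLOTS 4

static _Atomic uint32_t group_first = INVALID_SLOT; /* shared list head */
static _Atomic uint32_t group_next[NSLOTS];         /* per-member link */

static void toy_join_group(uint32_t myslot)
{
    uint32_t head = atomic_load(&group_first);

    do
    {
        /* publish our "next" link before trying to become the new head */
        atomic_store(&group_next[myslot], head);
    } while (!atomic_compare_exchange_weak(&group_first, &head, myslot));

    if (head == INVALID_SLOT)
        printf("slot %u is the leader\n", (unsigned) myslot);
    else
        printf("slot %u is a follower behind %u\n",
               (unsigned) myslot, (unsigned) head);
}

int main(void)
{
    toy_join_group(2);
    toy_join_group(0);
    toy_join_group(3);

    /* walk the list the way the leader would after swapping out the head */
    for (uint32_t s = atomic_load(&group_first); s != INVALID_SLOT;
         s = atomic_load(&group_next[s]))
        printf("member %u\n", (unsigned) s);
    return 0;
}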
Which would be bad, + * because it might not count the prepared transaction as running. + */ + ShmemVariableCache->xactCompletionCount++; + + /* Clear the subtransaction-XID cache too */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + LWLockRelease(ProcArrayLock); +} + +/* + * Update ShmemVariableCache->latestCompletedXid to point to latestXid if + * currently older. + */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + + Assert(FullTransactionIdIsValid(cur_latest)); + Assert(!RecoveryInProgress()); + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(cur_latest, latestXid); + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * Same as MaintainLatestCompletedXid, except for use during WAL replay. + */ +static void +MaintainLatestCompletedXidRecovery(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + FullTransactionId rel; + + Assert(AmStartupProcess() || !IsUnderPostmaster); + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Need a FullTransactionId to compare latestXid with. Can't rely on + * latestCompletedXid to be initialized in recovery. But in recovery it's + * safe to access nextXid without a lock for the startup process. + */ + rel = ShmemVariableCache->nextXid; + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + if (!FullTransactionIdIsValid(cur_latest) || + TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(rel, latestXid); + } + + Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * ProcArrayInitRecovery -- initialize recovery xid mgmt environment + * + * Remember up to where the startup process initialized the CLOG and subtrans + * so we can ensure it's initialized gaplessly up to the point where necessary + * while in recovery. + */ +void +ProcArrayInitRecovery(TransactionId initializedUptoXID) +{ + Assert(standbyState == STANDBY_INITIALIZED); + Assert(TransactionIdIsNormal(initializedUptoXID)); + + /* + * we set latestObservedXid to the xid SUBTRANS has been initialized up + * to, so we can extend it from that point onwards in + * RecordKnownAssignedTransactionIds, and when we get consistent in + * ProcArrayApplyRecoveryInfo(). + */ + latestObservedXid = initializedUptoXID; + TransactionIdRetreat(latestObservedXid); +} + +/* + * ProcArrayApplyRecoveryInfo -- apply recovery info about xids + * + * Takes us through 3 states: Initialized, Pending and Ready. + * Normal case is to go all the way to Ready straight away, though there + * are atypical cases where we need to take it in steps. + * + * Use the data about running transactions on the primary to create the initial + * state of KnownAssignedXids. 
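MaintainLatestCompletedXid() above only ever moves the shared latestCompletedXid forward. Reduced to its core, on a plain integer counter instead of a FullTransactionId:

#include <stdio.h>

static unsigned long long latest_completed = 100;

static void toy_maintain_latest(unsigned long long candidate)
{
    if (candidate > latest_completed)   /* only ever move forward */
        latest_completed = candidate;
}

int main(void)
{
    toy_maintain_latest(97);            /* older: ignored */
    toy_maintain_latest(105);           /* newer: advances */
    printf("latest completed: %llu\n", latest_completed);
    return 0;
}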
We also use these records to regularly prune + * KnownAssignedXids because we know it is possible that some transactions + * with FATAL errors fail to write abort records, which could cause eventual + * overflow. + * + * See comments for LogStandbySnapshot(). + */ +void +ProcArrayApplyRecoveryInfo(RunningTransactions running) +{ + TransactionId *xids; + int nxids; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(running->nextXid)); + Assert(TransactionIdIsValid(running->oldestRunningXid)); + Assert(TransactionIdIsNormal(running->latestCompletedXid)); + + /* + * Remove stale transactions, if any. + */ + ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); + + /* + * Remove stale locks, if any. + */ + StandbyReleaseOldLocks(running->oldestRunningXid); + + /* + * If our snapshot is already valid, nothing else to do... + */ + if (standbyState == STANDBY_SNAPSHOT_READY) + return; + + /* + * If our initial RunningTransactionsData had an overflowed snapshot then + * we knew we were missing some subxids from our snapshot. If we continue + * to see overflowed snapshots then we might never be able to start up, so + * we make another test to see if our snapshot is now valid. We know that + * the missing subxids are equal to or earlier than nextXid. After we + * initialise we continue to apply changes during recovery, so once the + * oldestRunningXid is later than the nextXid from the initial snapshot we + * know that we no longer have missing information and can mark the + * snapshot as valid. + */ + if (standbyState == STANDBY_SNAPSHOT_PENDING) + { + /* + * If the snapshot isn't overflowed or if its empty we can reset our + * pending state and use this snapshot instead. + */ + if (!running->subxid_overflow || running->xcnt == 0) + { + /* + * If we have already collected known assigned xids, we need to + * throw them away before we apply the recovery snapshot. + */ + KnownAssignedXidsReset(); + standbyState = STANDBY_INITIALIZED; + } + else + { + if (TransactionIdPrecedes(standbySnapshotPendingXmin, + running->oldestRunningXid)) + { + standbyState = STANDBY_SNAPSHOT_READY; + elog(trace_recovery(DEBUG1), + "recovery snapshots are now enabled"); + } + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); + return; + } + } + + Assert(standbyState == STANDBY_INITIALIZED); + + /* + * NB: this can be reached at least twice, so make sure new code can deal + * with that. + */ + + /* + * Nobody else is running yet, but take locks anyhow + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * KnownAssignedXids is sorted so we cannot just add the xids, we have to + * sort them first. + * + * Some of the new xids are top-level xids and some are subtransactions. + * We don't call SubTransSetParent because it doesn't matter yet. If we + * aren't overflowed then all xids will fit in snapshot and so we don't + * need subtrans. If we later overflow, an xid assignment record will add + * xids to subtrans. If RunningTransactionsData is overflowed then we + * don't have enough information to correctly update subtrans anyway. + */ + + /* + * Allocate a temporary array to avoid modifying the array passed as + * argument. + */ + xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); + + /* + * Add to the temp array any xids which have not already completed. 
+ */ + nxids = 0; + for (i = 0; i < running->xcnt + running->subxcnt; i++) + { + TransactionId xid = running->xids[i]; + + /* + * The running-xacts snapshot can contain xids that were still visible + * in the procarray when the snapshot was taken, but were already + * WAL-logged as completed. They're not running anymore, so ignore + * them. + */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + continue; + + xids[nxids++] = xid; + } + + if (nxids > 0) + { + if (procArray->numKnownAssignedXids != 0) + { + LWLockRelease(ProcArrayLock); + elog(ERROR, "KnownAssignedXids is not empty"); + } + + /* + * Sort the array so that we can add them safely into + * KnownAssignedXids. + * + * We have to sort them logically, because in KnownAssignedXidsAdd we + * call TransactionIdFollowsOrEquals and so on. But we know these XIDs + * come from RUNNING_XACTS, which means there are only normal XIDs + * from the same epoch, so this is safe. + */ + qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Add the sorted snapshot into KnownAssignedXids. The running-xacts + * snapshot may include duplicated xids because of prepared + * transactions, so ignore them. + */ + for (i = 0; i < nxids; i++) + { + if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i])) + { + elog(DEBUG1, + "found duplicated transaction %u for KnownAssignedXids insertion", + xids[i]); + continue; + } + KnownAssignedXidsAdd(xids[i], xids[i], true); + } + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + } + + pfree(xids); + + /* + * latestObservedXid is at least set to the point where SUBTRANS was + * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid + * RecordKnownAssignedTransactionIds() was called for. Initialize + * subtrans from thereon, up to nextXid - 1. + * + * We need to duplicate parts of RecordKnownAssignedTransactionId() here, + * because we've just added xids to the known assigned xids machinery that + * haven't gone through RecordKnownAssignedTransactionId(). + */ + Assert(TransactionIdIsNormal(latestObservedXid)); + TransactionIdAdvance(latestObservedXid); + while (TransactionIdPrecedes(latestObservedXid, running->nextXid)) + { + ExtendSUBTRANS(latestObservedXid); + TransactionIdAdvance(latestObservedXid); + } + TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */ + + /* ---------- + * Now we've got the running xids we need to set the global values that + * are used to track snapshots as they evolve further. + * + * - latestCompletedXid which will be the xmax for snapshots + * - lastOverflowedXid which shows whether snapshots overflow + * - nextXid + * + * If the snapshot overflowed, then we still initialise with what we know, + * but the recovery snapshot isn't fully valid yet because we know there + * are some subxids missing. We don't know the specific subxids that are + * missing, so conservatively assume the last one is latestObservedXid. + * ---------- + */ + if (running->subxid_overflow) + { + standbyState = STANDBY_SNAPSHOT_PENDING; + + standbySnapshotPendingXmin = latestObservedXid; + procArray->lastOverflowedXid = latestObservedXid; + } + else + { + standbyState = STANDBY_SNAPSHOT_READY; + + standbySnapshotPendingXmin = InvalidTransactionId; + } + + /* + * If a transaction wrote a commit record in the gap between taking and + * logging the snapshot then latestCompletedXid may already be higher than + * the value from the snapshot, so check before we use the incoming value. + * It also might not yet be set at all. 
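The snapshot-loading code above sorts the collected XIDs with qsort() and then skips adjacent duplicates (which can appear because of prepared transactions) while inserting. The same two steps on plain integers:

#include <stdio.h>
#include <stdlib.h>

static int cmp_int(const void *a, const void *b)
{
    int ia = *(const int *) a;
    int ib = *(const int *) b;

    return (ia > ib) - (ia < ib);
}

int main(void)
{
    int xids[] = {17, 5, 17, 9, 5, 23};
    int nxids = 6;

    qsort(xids, nxids, sizeof(int), cmp_int);

    for (int i = 0; i < nxids; i++)
    {
        if (i > 0 && xids[i - 1] == xids[i])
            continue;                   /* duplicate, e.g. a prepared xact */
        printf("add %d\n", xids[i]);
    }
    return 0;
}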
+ */ + MaintainLatestCompletedXidRecovery(running->latestCompletedXid); + + /* + * NB: No need to increment ShmemVariableCache->xactCompletionCount here, + * nobody can see it yet. + */ + + LWLockRelease(ProcArrayLock); + + /* ShmemVariableCache->nextXid must be beyond any observed xid. */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + if (standbyState == STANDBY_SNAPSHOT_READY) + elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); +} + +/* + * ProcArrayApplyXidAssignment + * Process an XLOG_XACT_ASSIGNMENT WAL record + */ +void +ProcArrayApplyXidAssignment(TransactionId topxid, + int nsubxids, TransactionId *subxids) +{ + TransactionId max_xid; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + + max_xid = TransactionIdLatest(topxid, nsubxids, subxids); + + /* + * Mark all the subtransactions as observed. + * + * NOTE: This will fail if the subxid contains too many previously + * unobserved xids to fit into known-assigned-xids. That shouldn't happen + * as the code stands, because xid-assignment records should never contain + * more than PGPROC_MAX_CACHED_SUBXIDS entries. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Notice that we update pg_subtrans with the top-level xid, rather than + * the parent xid. This is a difference between normal processing and + * recovery, yet is still correct in all cases. The reason is that + * subtransaction commit is not marked in clog until commit processing, so + * all aborted subtransactions have already been clearly marked in clog. + * As a result we are able to refer directly to the top-level + * transaction's state rather than skipping through all the intermediate + * states in the subtransaction tree. This should be the first time we + * have attempted to SubTransSetParent(). + */ + for (i = 0; i < nsubxids; i++) + SubTransSetParent(subxids[i], topxid); + + /* KnownAssignedXids isn't maintained yet, so we're done for now */ + if (standbyState == STANDBY_INITIALIZED) + return; + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Remove subxids from known-assigned-xacts. + */ + KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids); + + /* + * Advance lastOverflowedXid to be at least the last of these subxids. + */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid)) + procArray->lastOverflowedXid = max_xid; + + LWLockRelease(ProcArrayLock); +} + +/* + * TransactionIdIsInProgress -- is given transaction running in some backend + * + * Aside from some shortcuts such as checking RecentXmin and our own Xid, + * there are four possibilities for finding a running transaction: + * + * 1. The given Xid is a main transaction Id. We will find this out cheaply + * by looking at ProcGlobal->xids. + * + * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. + * We can find this out cheaply too. + * + * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see + * if the Xid is running on the primary. + * + * 4. 
Search the SubTrans tree to find the Xid's topmost parent, and then see + * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * This is the slowest way, but sadly it has to be done always if the others + * failed, unless we see that the cached subxact sets are complete (none have + * overflowed). + * + * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids + * while doing 1 and 3, we can release the ProcArrayLock while we do 4. + * This buys back some concurrency (and we can't retrieve the main Xids from + * ProcGlobal->xids[] again anyway; see GetNewTransactionId). + */ +bool +TransactionIdIsInProgress(TransactionId xid) +{ + static TransactionId *xids = NULL; + static TransactionId *other_xids; + XidCacheStatus *other_subxidstates; + int nxids = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId topxid; + TransactionId latestCompletedXid; + int mypgxactoff; + int numProcs; + int j; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. (Note: in particular, this guarantees that + * we reject InvalidTransactionId, FrozenTransactionId, etc as not + * running.) + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + { + xc_by_recent_xmin_inc(); + return false; + } + + /* + * We may have just checked the status of this transaction, so if it is + * already known to be completed, we can fall out without any access to + * shared memory. + */ + if (TransactionIdEquals(cachedXidIsNotInProgress, xid)) + { + xc_by_known_xact_inc(); + return false; + } + + /* + * Also, we can handle our own transaction (and subtransactions) without + * any access to shared memory. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + xc_by_my_xact_inc(); + return true; + } + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. + */ + if (xids == NULL) + { + /* + * In hot standby mode, reserve enough space to hold all xids in the + * known-assigned list. If we later finish recovery, we no longer need + * the bigger array, but we don't bother to shrink it. + */ + int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs; + + xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId)); + if (xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + other_xids = ProcGlobal->xids; + other_subxidstates = ProcGlobal->subxidStates; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Now that we have the lock, we can check latestCompletedXid; if the + * target Xid is after that, it's surely still running. 
+ */ + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + if (TransactionIdPrecedes(latestCompletedXid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_latest_xid_inc(); + return true; + } + + /* No shortcuts, gotta grovel through the array */ + mypgxactoff = MyProc->pgxactoff; + numProcs = arrayP->numProcs; + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + int pgprocno; + PGPROC *proc; + TransactionId pxid; + int pxids; + + /* Ignore ourselves --- dealt with it above */ + if (pgxactoff == mypgxactoff) + continue; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + + if (!TransactionIdIsValid(pxid)) + continue; + + /* + * Step 1: check the main Xid + */ + if (TransactionIdEquals(pxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_main_xid_inc(); + return true; + } + + /* + * We can ignore main Xids that are younger than the target Xid, since + * the target could not possibly be their child. + */ + if (TransactionIdPrecedes(xid, pxid)) + continue; + + /* + * Step 2: check the cached child-Xids arrays + */ + pxids = other_subxidstates[pgxactoff].count; + pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + proc = &allProcs[pgprocno]; + for (j = pxids - 1; j >= 0; j--) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]); + + if (TransactionIdEquals(cxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_child_xid_inc(); + return true; + } + } + + /* + * Save the main Xid for step 4. We only need to remember main Xids + * that have uncached children. (Note: there is no race condition + * here because the overflowed flag cannot be cleared, only set, while + * we hold ProcArrayLock. So we can't miss an Xid that we need to + * worry about.) + */ + if (other_subxidstates[pgxactoff].overflowed) + xids[nxids++] = pxid; + } + + /* + * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs + * in the list must be treated as running. + */ + if (RecoveryInProgress()) + { + /* none of the PGPROC entries should have XIDs in hot standby mode */ + Assert(nxids == 0); + + if (KnownAssignedXidExists(xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_known_assigned_inc(); + return true; + } + + /* + * If the KnownAssignedXids overflowed, we have to check pg_subtrans + * too. Fetch all xids from KnownAssignedXids that are lower than + * xid, since if xid is a subtransaction its parent will always have a + * lower value. Note we will collect both main and subXIDs here, but + * there's no help for it. + */ + if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid)) + nxids = KnownAssignedXidsGet(xids, xid); + } + + LWLockRelease(ProcArrayLock); + + /* + * If none of the relevant caches overflowed, we know the Xid is not + * running without even looking at pg_subtrans. + */ + if (nxids == 0) + { + xc_no_overflow_inc(); + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * Step 4: have to check pg_subtrans. + * + * At this point, we know it's either a subtransaction of one of the Xids + * in xids[], or it's not running. If it's an already-failed + * subtransaction, we want to say "not running" even though its parent may + * still be running. So first, check pg_xact to see if it's been aborted. 
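cachedXidIsNotInProgress, used above, is a one-entry negative cache: once an XID has been proven not in progress, an immediate re-check of the same XID is answered without touching shared memory. A toy version of that caching pattern, where slow_lookup() stands in for the real work:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t cached_not_in_progress = 0; /* 0 plays InvalidTransactionId */

static bool slow_lookup(uint32_t id)
{
    printf("slow lookup for %u\n", (unsigned) id);
    return false;                           /* pretend nothing is running */
}

static bool toy_is_in_progress(uint32_t id)
{
    if (id != 0 && id == cached_not_in_progress)
        return false;                       /* cheap repeated answer */

    if (slow_lookup(id))
        return true;

    cached_not_in_progress = id;            /* remember the negative result */
    return false;
}

int main(void)
{
    toy_is_in_progress(42);                 /* pays for the slow lookup */
    toy_is_in_progress(42);                 /* answered from the cache */
    return 0;
}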
+ */ + xc_slow_answer_inc(); + + if (TransactionIdDidAbort(xid)) + { + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * It isn't aborted, so check whether the transaction tree it belongs to + * is still running (or, more precisely, whether it was running when we + * held ProcArrayLock). + */ + topxid = SubTransGetTopmostTransaction(xid); + Assert(TransactionIdIsValid(topxid)); + if (!TransactionIdEquals(topxid, xid) && + pg_lfind32(topxid, xids, nxids)) + return true; + + cachedXidIsNotInProgress = xid; + return false; +} + +/* + * TransactionIdIsActive -- is xid the top-level XID of an active backend? + * + * This differs from TransactionIdIsInProgress in that it ignores prepared + * transactions, as well as transactions running on the primary if we're in + * hot standby. Also, we ignore subtransactions since that's not needed + * for current uses. + */ +bool +TransactionIdIsActive(TransactionId xid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int i; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + return false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < arrayP->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + TransactionId pxid; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[i]); + + if (!TransactionIdIsValid(pxid)) + continue; + + if (proc->pid == 0) + continue; /* ignore prepared transactions */ + + if (TransactionIdEquals(pxid, xid)) + { + result = true; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + + +/* + * Determine XID horizons. + * + * This is used by wrapper functions like GetOldestNonRemovableTransactionId() + * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as + * well as "internally" by GlobalVisUpdate() (see comment above struct + * GlobalVisState). + * + * See the definition of ComputeXidHorizonsResult for the various computed + * horizons. + * + * For VACUUM separate horizons (used to decide which deleted tuples must + * be preserved), for shared and non-shared tables are computed. For shared + * relations backends in all databases must be considered, but for non-shared + * relations that's not required, since only backends in my own database could + * ever see the tuples in them. Also, we can ignore concurrently running lazy + * VACUUMs because (a) they must be working on other tables, and (b) they + * don't need to do snapshot-based lookups. + * + * This also computes a horizon used to truncate pg_subtrans. For that + * backends in all databases have to be considered, and concurrently running + * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans + * accesses. + * + * Note: we include all currently running xids in the set of considered xids. + * This ensures that if a just-started xact has not yet set its snapshot, + * when it does set the snapshot it cannot set xmin less than what we compute. + * See notes in src/backend/access/transam/README. + * + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. 
For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedXid. If a transaction + * begins after that, its xmin will include in-progress transactions in other + * databases that started earlier, so another call will return a lower value. + * Nonetheless it is safe to vacuum a table in the current database with the + * first result. There are also replication-related effects: a walsender + * process can set its xmin based on transactions that are no longer running + * on the primary but are still being replayed on the standby, thus possibly + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. (The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) + * + * Note: the approximate horizons (see definition of GlobalVisState) are + * updated by the computations done here. That's currently required for + * correctness and a small optimization. Without doing so it's possible that + * heap vacuum's call to heap_page_prune() uses a more conservative horizon + * than later when deciding which tuples can be removed - which the code + * doesn't expect (breaking HOT). + */ +static void +ComputeXidHorizons(ComputeXidHorizonsResult *h) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); + TransactionId *other_xids = ProcGlobal->xids; + + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_nonremovable = InvalidTransactionId; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + h->latest_completed = ShmemVariableCache->latestCompletedXid; + + /* + * We initialize the MIN() calculation with latestCompletedXid + 1. This + * is a lower bound for the XIDs that might appear in the ProcArray later, + * and so protects us against overestimating the result due to future + * additions. + */ + { + TransactionId initial; + + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_nonremovable = initial; + h->data_oldest_nonremovable = initial; + + /* + * Only modifications made by this backend affect the horizon for + * temporary relations. Instead of a check in each iteration of the + * loop over all PGPROCs it is cheaper to just initialize to the + * current top-level xid any. + * + * Without an assigned xid we could use a horizon as aggressive as + * GetNewTransactionId(), but we can get away with the much cheaper + * latestCompletedXid + 1: If this backend has no xid there, by + * definition, can't be any newer changes in the temp table than + * latestCompletedXid. + */ + if (TransactionIdIsValid(MyProc->xid)) + h->temp_oldest_nonremovable = MyProc->xid; + else + h->temp_oldest_nonremovable = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. 
+ */ + h->slot_xmin = procArray->replication_slot_xmin; + h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + for (int index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int8 statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + TransactionId xmin; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + xmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * Consider both the transaction's Xmin, and its Xid. + * + * We must check both because a transaction might have an Xmin but not + * (yet) an Xid; conversely, if it has an Xid, that could determine + * some not-yet-set Xmin. + */ + xmin = TransactionIdOlder(xmin, xid); + + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) + continue; + + /* + * Don't ignore any procs when determining which transactions might be + * considered running. While slots should ensure logical decoding + * backends are protected even without this check, it can't hurt to + * include them here as well.. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, xmin); + + /* + * Skip over backends either vacuuming (which is ok with rows being + * removed, as long as pg_subtrans is not truncated) or doing logical + * decoding (which manages xmin separately, check below). + */ + if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + continue; + + /* shared tables need to take backends in all databases into account */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, xmin); + + /* + * Normally sessions in other databases are ignored for anything but + * the shared horizon. + * + * However, include them when MyDatabaseId is not (yet) set. A + * backend in the process of starting up must not compute a "too + * aggressive" horizon, otherwise we could end up using it to prune + * still-needed data away. If the current backend never connects to a + * database this is harmless, because data_oldest_nonremovable will + * never be utilized. + * + * Also, sessions marked with PROC_AFFECTS_ALL_HORIZONS should always + * be included. (This flag is used for hot standby feedback, which + * can't be tied to a specific database.) + * + * Also, while in recovery we cannot compute an accurate per-database + * horizon, as all xids are managed via the KnownAssignedXids + * machinery. + */ + if (proc->databaseId == MyDatabaseId || + MyDatabaseId == InvalidOid || + (statusFlags & PROC_AFFECTS_ALL_HORIZONS) || + in_recovery) + { + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, xmin); + } + } + + /* + * If in recovery fetch oldest xid in KnownAssignedXids, will be applied + * after lock is released. + */ + if (in_recovery) + kaxmin = KnownAssignedXidsGetOldestXmin(); + + /* + * No other information from shared state is needed, release the lock + * immediately. The rest of the computations can be done without a lock. 
+ */ + LWLockRelease(ProcArrayLock); + + if (in_recovery) + { + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, kaxmin); + /* temp relations cannot be accessed in recovery */ + } + + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + + /* + * Check whether there are replication slots requiring an older xmin. + */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin); + + /* + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. + */ + h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->slot_catalog_xmin); + + /* + * It's possible that slots backed up the horizons further than + * oldest_considered_running. Fix. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_nonremovable); + + /* + * shared horizons have to be at least as old as the oldest visible in + * current db + */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->catalog_oldest_nonremovable)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. + */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->temp_oldest_nonremovable)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); + + /* update approximate horizons with the computed horizons */ + GlobalVisUpdateApply(h); +} + +/* + * Determine what kind of visibility horizon needs to be used for a + * relation. If rel is NULL, the most conservative horizon is used. 
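+ *
+ * For illustration: a shared catalog such as pg_database maps to
+ * VISHORIZON_SHARED, a plain catalog such as pg_class to
+ * VISHORIZON_CATALOG, an ordinary user table to VISHORIZON_DATA, and a
+ * temporary table to VISHORIZON_TEMP.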
+ */ +static inline GlobalVisHorizonKind +GlobalVisHorizonKindForRel(Relation rel) +{ + /* + * Other relkinds currently don't contain xids, nor always the necessary + * logical decoding markers. + */ + Assert(!rel || + rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + + if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress()) + return VISHORIZON_SHARED; + else if (IsCatalogRelation(rel) || + RelationIsAccessibleInLogicalDecoding(rel)) + return VISHORIZON_CATALOG; + else if (!RELATION_IS_LOCAL(rel)) + return VISHORIZON_DATA; + else + return VISHORIZON_TEMP; +} + +/* + * Return the oldest XID for which deleted tuples must be preserved in the + * passed table. + * + * If rel is not NULL the horizon may be considerably more recent than + * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon + * that is correct (but not optimal) for all relations will be returned. + * + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. + */ +TransactionId +GetOldestNonRemovableTransactionId(Relation rel) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + return horizons.shared_oldest_nonremovable; + case VISHORIZON_CATALOG: + return horizons.catalog_oldest_nonremovable; + case VISHORIZON_DATA: + return horizons.data_oldest_nonremovable; + case VISHORIZON_TEMP: + return horizons.temp_oldest_nonremovable; + } + + /* just to prevent compiler warnings */ + return InvalidTransactionId; +} + +/* + * Return the oldest transaction id any currently running backend might still + * consider running. This should not be used for visibility / pruning + * determinations (see GetOldestNonRemovableTransactionId()), but for + * decisions like up to where pg_subtrans can be truncated. + */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* + * Don't want to use shared_oldest_nonremovable here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. + */ + *xmin = horizons.shared_oldest_nonremovable_raw; + *catalog_xmin = horizons.slot_catalog_xmin; +} + +/* + * GetMaxSnapshotXidCount -- get max size for snapshot XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotXidCount(void) +{ + return procArray->maxProcs; +} + +/* + * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotSubxidCount(void) +{ + return TOTAL_MAX_CACHED_SUBXIDS; +} + +/* + * Initialize old_snapshot_threshold specific parts of a newly build snapshot. + */ +static void +GetSnapshotDataInitOldSnapshot(Snapshot snapshot) +{ + if (!OldSnapshotThresholdActive()) + { + /* + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. 
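+ *
+ * (OldSnapshotThresholdActive() simply tests whether the
+ * old_snapshot_threshold GUC is >= 0; with the default setting of -1
+ * this cheap branch is always taken.)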
+ */ + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin); + } +} + +/* + * Helper function for GetSnapshotData() that checks if the bulk of the + * visibility information in the snapshot is still valid. If so, it updates + * the fields that need to change and returns true. Otherwise it returns + * false. + * + * This very likely can be evolved to not need ProcArrayLock held (at very + * least in the case we already hold a snapshot), but that's for another day. + */ +static bool +GetSnapshotDataReuse(Snapshot snapshot) +{ + uint64 curXactCompletionCount; + + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (unlikely(snapshot->snapXactCompletionCount == 0)) + return false; + + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + if (curXactCompletionCount != snapshot->snapXactCompletionCount) + return false; + + /* + * If the current xactCompletionCount is still the same as it was at the + * time the snapshot was built, we can be sure that rebuilding the + * contents of the snapshot the hard way would result in the same snapshot + * contents: + * + * As explained in transam/README, the set of xids considered running by + * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot + * contents only depend on transactions with xids and xactCompletionCount + * is incremented whenever a transaction with an xid finishes (while + * holding ProcArrayLock exclusively). Thus the xactCompletionCount check + * ensures we would detect if the snapshot would have changed. + * + * As the snapshot contents are the same as it was before, it is safe to + * re-enter the snapshot's xmin into the PGPROC array. None of the rows + * visible under the snapshot could already have been removed (that'd + * require the set of running transactions to change) and it fulfills the + * requirement that concurrent GetSnapshotData() calls yield the same + * xmin. + */ + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = snapshot->xmin; + + RecentXmin = snapshot->xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->curcid = GetCurrentCommandId(false); + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return true; +} + +/* + * GetSnapshotData -- returns information about running transactions. + * + * The returned snapshot includes xmin (lowest still-running xact ID), + * xmax (highest completed xact ID + 1), and a list of running xact IDs + * in the range xmin <= xid < xmax. It is used as follows: + * All xact IDs < xmin are considered finished. + * All xact IDs >= xmax are considered still running. + * For an xact ID xmin <= xid < xmax, consult list to see whether + * it is considered running or not. + * This ensures that the set of transactions seen as "running" by the + * current xact will not change after it takes the snapshot. + * + * All running top-level XIDs are included in the snapshot, except for lazy + * VACUUM processes. We also try to include running subtransaction XIDs, + * but since PGPROC has only a limited cache area for subxact XIDs, full + * information may not be available. 
If we find any overflowed subxid arrays, + * we have to mark the snapshot's subxid data as overflowed, and extra work + * *may* need to be done to determine what's running (see XidInMVCCSnapshot()). + * + * We also update the following backend-global variables: + * TransactionXmin: the oldest xmin of any snapshot in use in the + * current transaction (this is the same as MyProc->xmin). + * RecentXmin: the xmin computed for the most recent snapshot. XIDs + * older than this are known not running any more. + * + * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels + * for the benefit of the GlobalVisTest* family of functions. + * + * Note: this function should probably not be called with an argument that's + * not statically allocated (see xip allocation below). + */ +Snapshot +GetSnapshotData(Snapshot snapshot) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId xmin; + TransactionId xmax; + int count = 0; + int subcount = 0; + bool suboverflowed = false; + FullTransactionId latest_completed; + TransactionId oldestxid; + int mypgxactoff; + TransactionId myxid; + uint64 curXactCompletionCount; + + TransactionId replication_slot_xmin = InvalidTransactionId; + TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + Assert(snapshot != NULL); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * This does open a possibility for avoiding repeated malloc/free: since + * maxProcs does not change at runtime, we can simply reuse the previous + * xip arrays if any. (This relies on the fact that all callers pass + * static SnapshotData structs.) + */ + if (snapshot->xip == NULL) + { + /* + * First call for this snapshot. Snapshot is same size whether or not + * we are in recovery, see later comments. + */ + snapshot->xip = (TransactionId *) + malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId)); + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* + * It is sufficient to get shared lock on ProcArrayLock, even if we are + * going to set MyProc->xmin. 
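+ *
+ * Usage sketch, assuming the usual entry points in snapmgr.c: callers
+ * normally reach this function indirectly, e.g.
+ *
+ *     Snapshot snap = GetTransactionSnapshot();
+ *
+ * which passes one of snapmgr.c's statically allocated SnapshotData
+ * structs, satisfying the allocation note above.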
+ */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (GetSnapshotDataReuse(snapshot)) + { + LWLockRelease(ProcArrayLock); + return snapshot; + } + + latest_completed = ShmemVariableCache->latestCompletedXid; + mypgxactoff = MyProc->pgxactoff; + myxid = other_xids[mypgxactoff]; + Assert(myxid == MyProc->xid); + + oldestxid = ShmemVariableCache->oldestXid; + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + + /* xmax is always latestCompletedXid + 1 */ + xmax = XidFromFullTransactionId(latest_completed); + TransactionIdAdvance(xmax); + Assert(TransactionIdIsNormal(xmax)); + + /* initialize xmin calculation with xmax */ + xmin = xmax; + + /* take own xid into account, saves a check inside the loop */ + if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin)) + xmin = myxid; + + snapshot->takenDuringRecovery = RecoveryInProgress(); + + if (!snapshot->takenDuringRecovery) + { + int numProcs = arrayP->numProcs; + TransactionId *xip = snapshot->xip; + int *pgprocnos = arrayP->pgprocnos; + XidCacheStatus *subxidStates = ProcGlobal->subxidStates; + uint8 *allStatusFlags = ProcGlobal->statusFlags; + + /* + * First collect set of pgxactoff/xids that need to be included in the + * snapshot. + */ + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + uint8 statusFlags; + + Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + + /* + * If the transaction has no XID assigned, we can skip it; it + * won't have sub-XIDs either. + */ + if (likely(xid == InvalidTransactionId)) + continue; + + /* + * We don't include our own XIDs (if any) in the snapshot. It + * needs to be included in the xmin computation, but we did so + * outside the loop. + */ + if (pgxactoff == mypgxactoff) + continue; + + /* + * The only way we are able to get here with a non-normal xid is + * during bootstrap - with this backend using + * BootstrapTransactionId. But the above test should filter that + * out. + */ + Assert(TransactionIdIsNormal(xid)); + + /* + * If the XID is >= xmax, we can skip it; such transactions will + * be treated as running anyway (and any sub-XIDs will also be >= + * xmax). + */ + if (!NormalTransactionIdPrecedes(xid, xmax)) + continue; + + /* + * Skip over backends doing logical decoding which manages xmin + * separately (check below) and ones running LAZY VACUUM. + */ + statusFlags = allStatusFlags[pgxactoff]; + if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + continue; + + if (NormalTransactionIdPrecedes(xid, xmin)) + xmin = xid; + + /* Add XID to snapshot. */ + xip[count++] = xid; + + /* + * Save subtransaction XIDs if possible (if we've already + * overflowed, there's no point). Note that the subxact XIDs must + * be later than their parent, so no need to check them against + * xmin. We could filter against xmax, but it seems better not to + * do that much work while holding the ProcArrayLock. + * + * The other backend can add more subxids concurrently, but cannot + * remove any. Hence it's important to fetch nxids just once. + * Should be safe to use memcpy, though. (We needn't worry about + * missing any xids added concurrently, because they must postdate + * xmax.) + * + * Again, our own XIDs are not included in the snapshot. 
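+ *
+ * For scale: a backend's cache holds at most PGPROC_MAX_CACHED_SUBXIDS
+ * subtransaction XIDs (64 at the time of writing); a backend that has
+ * assigned more than that is marked overflowed, and the snapshot then
+ * records suboverflowed = true so that readers fall back to pg_subtrans
+ * in XidInMVCCSnapshot().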
+ */ + if (!suboverflowed) + { + + if (subxidStates[pgxactoff].overflowed) + suboverflowed = true; + else + { + int nsubxids = subxidStates[pgxactoff].count; + + if (nsubxids > 0) + { + int pgprocno = pgprocnos[pgxactoff]; + PGPROC *proc = &allProcs[pgprocno]; + + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(snapshot->subxip + subcount, + proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + subcount += nsubxids; + } + } + } + } + } + else + { + /* + * We're in hot standby, so get XIDs from KnownAssignedXids. + * + * We store all xids directly into subxip[]. Here's why: + * + * In recovery we don't know which xids are top-level and which are + * subxacts, a design choice that greatly simplifies xid processing. + * + * It seems like we would want to try to put xids into xip[] only, but + * that is fairly small. We would either need to make that bigger or + * to increase the rate at which we WAL-log xid assignment; neither is + * an appealing choice. + * + * We could try to store xids into xip[] first and then into subxip[] + * if there are too many xids. That only works if the snapshot doesn't + * overflow because we do not search subxip[] in that case. A simpler + * way is to just store all xids in the subxip array because this is + * by far the bigger array. We just leave the xip array empty. + * + * Either way we need to change the way XidInMVCCSnapshot() works + * depending upon when the snapshot was taken, or change normal + * snapshot processing so it matches. + * + * Note: It is possible for recovery to end before we finish taking + * the snapshot, and for newly assigned transaction ids to be added to + * the ProcArray. xmax cannot change while we hold ProcArrayLock, so + * those newly added transaction ids would be filtered away, so we + * need not be concerned about them. + */ + subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin, + xmax); + + if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid)) + suboverflowed = true; + } + + + /* + * Fetch into local variable while ProcArrayLock is held - the + * LWLockRelease below is a barrier, ensuring this happens inside the + * lock. + */ + replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = xmin; + + LWLockRelease(ProcArrayLock); + + /* maintain state for GlobalVis* */ + { + TransactionId def_vis_xid; + TransactionId def_vis_xid_data; + FullTransactionId def_vis_fxid; + FullTransactionId def_vis_fxid_data; + FullTransactionId oldestfxid; + + /* + * Converting oldestXid is only safe when xid horizon cannot advance, + * i.e. holding locks. While we don't hold the lock anymore, all the + * necessary data has been gathered with lock held. + */ + oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); + + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(xmin, replication_slot_xmin); + + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. 
+ */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * GlobalVisUpdate() might have computed more aggressive values, don't + * overwrite them if so. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid_data, + GlobalVisDataRels.definitely_needed); + /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */ + if (TransactionIdIsNormal(myxid)) + GlobalVisTempRels.definitely_needed = + FullXidRelativeTo(latest_completed, myxid); + else + { + GlobalVisTempRels.definitely_needed = latest_completed; + FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed); + } + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. + */ + GlobalVisSharedRels.maybe_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + oldestfxid); + GlobalVisCatalogRels.maybe_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + oldestfxid); + GlobalVisDataRels.maybe_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + oldestfxid); + /* accurate value known */ + GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed; + } + + RecentXmin = xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->xmin = xmin; + snapshot->xmax = xmax; + snapshot->xcnt = count; + snapshot->subxcnt = subcount; + snapshot->suboverflowed = suboverflowed; + snapshot->snapXactCompletionCount = curXactCompletionCount; + + snapshot->curcid = GetCurrentCommandId(false); + + /* + * This is a new snapshot, so set both refcounts are zero, and mark it as + * not copied in persistent memory. + */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return snapshot; +} + +/* + * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin + * + * This is called when installing a snapshot imported from another + * transaction. To ensure that OldestXmin doesn't go backwards, we must + * check that the source transaction is still running, and we'd better do + * that atomically with installing the new xmin. + * + * Returns true if successful, false if source xact is no longer running. 
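+ *
+ * Usage sketch (roughly how SetTransactionSnapshot() in snapmgr.c uses
+ * this):
+ *
+ *     if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+ *         ereport(ERROR, ...);
+ *
+ * i.e. an imported snapshot is accepted only while its source
+ * transaction still protects the corresponding xmin.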
+ */ +bool +ProcArrayInstallImportedXmin(TransactionId xmin, + VirtualTransactionId *sourcevxid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(TransactionIdIsNormal(xmin)); + if (!sourcevxid) + return false; + + /* Get lock so source xact can't end while we're doing this */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + + /* Ignore procs running LAZY VACUUM */ + if (statusFlags & PROC_IN_VACUUM) + continue; + + /* We are only interested in the specific virtual transaction. */ + if (proc->backendId != sourcevxid->backendId) + continue; + if (proc->lxid != sourcevxid->localTransactionId) + continue; + + /* + * We check the transaction's database ID for paranoia's sake: if it's + * in another DB then its xmin does not cover us. Caller should have + * detected this already, so we just treat any funny cases as + * "transaction not found". + */ + if (proc->databaseId != MyDatabaseId) + continue; + + /* + * Likewise, let's just make real sure its xmin does cover us. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (!TransactionIdIsNormal(xid) || + !TransactionIdPrecedesOrEquals(xid, xmin)) + continue; + + /* + * We're good. Install the new xmin. As in GetSnapshotData, set + * TransactionXmin too. (Note that because snapmgr.c called + * GetSnapshotData first, we'll be overwriting a valid xmin here, so + * we don't check that.) + */ + MyProc->xmin = TransactionXmin = xmin; + + result = true; + break; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin + * + * This is like ProcArrayInstallImportedXmin, but we have a pointer to the + * PGPROC of the transaction from which we imported the snapshot, rather than + * an XID. + * + * Note that this function also copies statusFlags from the source `proc` in + * order to avoid the case where MyProc's xmin needs to be skipped for + * computing xid horizon. + * + * Returns true if successful, false if source xact is no longer running. + */ +bool +ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) +{ + bool result = false; + TransactionId xid; + + Assert(TransactionIdIsNormal(xmin)); + Assert(proc != NULL); + + /* + * Get an exclusive lock so that we can copy statusFlags from source proc. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Be certain that the referenced PGPROC has an advertised xmin which is + * no later than the one we're installing, so that the system-wide xmin + * can't go backwards. Also, make sure it's running in the same database, + * so that the per-database xmin cannot go backwards. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (proc->databaseId == MyDatabaseId && + TransactionIdIsNormal(xid) && + TransactionIdPrecedesOrEquals(xid, xmin)) + { + /* + * Install xmin and propagate the statusFlags that affect how the + * value is interpreted by vacuum. + */ + MyProc->xmin = TransactionXmin = xmin; + MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) | + (proc->statusFlags & PROC_XMIN_FLAGS); + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + + result = true; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * GetRunningTransactionData -- returns information about running transactions. 
+ * + * Similar to GetSnapshotData but returns more information. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes and + * prepared transactions. + * + * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for + * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc + * array until the caller has WAL-logged this snapshot, and releases the + * lock. Acquiring ProcArrayLock ensures that no transactions commit until the + * lock is released. + * + * The returned data structure is statically allocated; caller should not + * modify it, and must not assume it is valid past the next call. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * Dummy PGPROCs from prepared transaction are included, meaning that this + * may return entries with duplicated TransactionId values coming from + * transaction finishing to prepare. Nothing is done about duplicated + * entries here to not hold on ProcArrayLock more than necessary. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. + * + * Note that if any transaction has overflowed its cached subtransactions + * then there is no real need include any subtransactions. + */ +RunningTransactions +GetRunningTransactionData(void) +{ + /* result workspace */ + static RunningTransactionsData CurrentRunningXactsData; + + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; + TransactionId latestCompletedXid; + TransactionId oldestRunningXid; + TransactionId *xids; + int index; + int count; + int subcount; + bool suboverflowed; + + Assert(!RecoveryInProgress()); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * Should only be allocated in bgwriter, since only ever executed during + * checkpoints. + */ + if (CurrentRunningXacts->xids == NULL) + { + /* + * First call + */ + CurrentRunningXacts->xids = (TransactionId *) + malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + if (CurrentRunningXacts->xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + xids = CurrentRunningXacts->xids; + + count = subcount = 0; + suboverflowed = false; + + /* + * Ensure that no xids enter or leave the procarray while we obtain + * snapshot. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + LWLockAcquire(XidGenLock, LW_SHARED); + + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + oldestRunningXid = + XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * Spin over procArray collecting all xids + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + /* + * We don't need to store transactions that don't have a TransactionId + * yet because they will not show as running on a standby server. 
+ */ + if (!TransactionIdIsValid(xid)) + continue; + + /* + * Be careful not to exclude any xids before calculating the values of + * oldestRunningXid and suboverflowed, since these are used to clean + * up transaction information held on standbys. + */ + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + if (ProcGlobal->subxidStates[index].overflowed) + suboverflowed = true; + + /* + * If we wished to exclude xids this would be the right place for it. + * Procs with the PROC_IN_VACUUM flag set don't usually assign xids, + * but they do during truncation at the end when they get the lock and + * truncate, so it is not much of a problem to include them if they + * are seen and it is cleaner to include them. + */ + + xids[count++] = xid; + } + + /* + * Spin over procArray collecting all subxids, but only if there hasn't + * been a suboverflow. + */ + if (!suboverflowed) + { + XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates; + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int nsubxids; + + /* + * Save subtransaction XIDs. Other backends can't add or remove + * entries while we're holding XidGenLock. + */ + nsubxids = other_subxidstates[index].count; + if (nsubxids > 0) + { + /* barrier not really required, as XidGenLock is held, but ... */ + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(&xids[count], proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + count += nsubxids; + subcount += nsubxids; + + /* + * Top-level XID of a transaction is always less than any of + * its subxids, so we don't need to check if any of the + * subxids are smaller than oldestRunningXid + */ + } + } + } + + /* + * It's important *not* to include the limits set by slots here because + * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those + * were to be included here the initial value could never increase because + * of a circular dependency where slots only increase their limits when + * running xacts increases oldestRunningXid and running xacts only + * increases if slots do. + */ + + CurrentRunningXacts->xcnt = count - subcount; + CurrentRunningXacts->subxcnt = subcount; + CurrentRunningXacts->subxid_overflow = suboverflowed; + CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + CurrentRunningXacts->oldestRunningXid = oldestRunningXid; + CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + + Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); + Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); + Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); + + /* We don't release the locks here, the caller is responsible for that */ + + return CurrentRunningXacts; +} + +/* + * GetOldestActiveTransactionId() + * + * Similar to GetSnapshotData but returns just oldestActiveXid. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes. + * We look at all databases, though there is no need to include WALSender + * since this has no effect on hot standby conflicts. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. 
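+ *
+ * Usage note: the main caller is checkpoint creation, which stores the
+ * result in the checkpoint record, roughly
+ *
+ *     checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
+ *
+ * so that a standby starting from that checkpoint has a lower bound on
+ * the transactions that may still have been running.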
+ */ +TransactionId +GetOldestActiveTransactionId(void) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId oldestRunningXid; + int index; + + Assert(!RecoveryInProgress()); + + /* + * Read nextXid, as the upper bound of what's still active. + * + * Reading a TransactionId is atomic, but we must grab the lock to make + * sure that all XIDs < nextXid are already present in the proc array (or + * have already completed), when we spin over it. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* + * Spin over procArray collecting all xids and subxids. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + /* + * Top-level XID of a transaction is always less than any of its + * subxids, so we don't need to check if any of the subxids are + * smaller than oldestRunningXid + */ + } + LWLockRelease(ProcArrayLock); + + return oldestRunningXid; +} + +/* + * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum + * + * Returns the oldest xid that we can guarantee not to have been affected by + * vacuum, i.e. no rows >= that xid have been vacuumed away unless the + * transaction aborted. Note that the value can (and most of the time will) be + * much more conservative than what really has been affected by vacuum, but we + * currently don't have better data available. + * + * This is useful to initialize the cutoff xid after which a new changeset + * extraction replication slot can start decoding changes. + * + * Must be called with ProcArrayLock held either shared or exclusively, + * although most callers will want to use exclusive mode since it is expected + * that the caller will immediately use the xid to peg the xmin horizon. + */ +TransactionId +GetOldestSafeDecodingTransactionId(bool catalogOnly) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId oldestSafeXid; + int index; + bool recovery_in_progress = RecoveryInProgress(); + + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Acquire XidGenLock, so no transactions can acquire an xid while we're + * running. If no transaction with xid were running concurrently a new xid + * could influence the RecentXmin et al. + * + * We initialize the computation to nextXid since that's guaranteed to be + * a safe, albeit pessimal, value. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If there's already a slot pegging the xmin horizon, we can start with + * that value, it's guaranteed to be safe since it's computed by this + * routine initially and has been enforced since. We can always use the + * slot's general xmin horizon, but the catalog horizon is only usable + * when only catalog data is going to be looked at. 
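+ *
+ * Usage note: this is how logical decoding initializes a new slot's
+ * catalog_xmin; roughly speaking, CreateInitDecodingContext() calls this
+ * while holding ProcArrayLock exclusively and installs the result as the
+ * slot's effective catalog xmin before releasing the lock.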
+ */ + if (TransactionIdIsValid(procArray->replication_slot_xmin) && + TransactionIdPrecedes(procArray->replication_slot_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_xmin; + + if (catalogOnly && + TransactionIdIsValid(procArray->replication_slot_catalog_xmin) && + TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_catalog_xmin; + + /* + * If we're not in recovery, we walk over the procarray and collect the + * lowest xid. Since we're called with ProcArrayLock held and have + * acquired XidGenLock, no entries can vanish concurrently, since + * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared + * with ProcArrayLock held. + * + * In recovery we can't lower the safe value besides what we've computed + * above, so we'll have to wait a bit longer there. We unfortunately can + * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids + * machinery can miss values and return an older value than is safe. + */ + if (!recovery_in_progress) + { + TransactionId *other_xids = ProcGlobal->xids; + + /* + * Spin over procArray collecting min(ProcGlobal->xids[i]) + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestSafeXid)) + oldestSafeXid = xid; + } + } + + LWLockRelease(XidGenLock); + + return oldestSafeXid; +} + +/* + * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are + * delaying checkpoint because they have critical actions in progress. + * + * Constructs an array of VXIDs of transactions that are currently in commit + * critical sections, as shown by having specified delayChkptFlags bits set + * in their PGPROC. + * + * Returns a palloc'd array that should be freed by the caller. + * *nvxids is the number of valid entries. + * + * Note that because backends set or clear delayChkptFlags without holding any + * lock, the result is somewhat indeterminate, but we don't really care. Even + * in a multiprocessor with delayed writes to shared memory, it should be + * certain that setting of delayChkptFlags will propagate to shared memory + * when the backend takes a lock, so we cannot fail to see a virtual xact as + * delayChkptFlags if it's already inserted its commit record. Whether it + * takes a little while for clearing of delayChkptFlags to propagate is + * unimportant for correctness. + */ +VirtualTransactionId * +GetVirtualXIDsDelayingChkpt(int *nvxids, int type) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + Assert(type != 0); + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if ((proc->delayChkptFlags & type) != 0) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying? 
+ * + * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any + * of the specified VXIDs are still in critical sections of code. + * + * Note: this is O(N^2) in the number of vxacts that are/were delaying, but + * those numbers should be small enough for it not to be a problem. + */ +bool +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(type != 0); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + + if ((proc->delayChkptFlags & type) != 0 && + VirtualTransactionIdIsValid(vxid)) + { + int i; + + for (i = 0; i < nvxids; i++) + { + if (VirtualTransactionIdEquals(vxid, vxids[i])) + { + result = true; + break; + } + } + if (result) + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * BackendPidGetProc -- get a backend's PGPROC given its PID + * + * Returns NULL if not found. Note that it is up to the caller to be + * sure that the question remains meaningful for long enough for the + * answer to be used ... + */ +PGPROC * +BackendPidGetProc(int pid) +{ + PGPROC *result; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + result = BackendPidGetProcWithLock(pid); + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID + * + * Same as above, except caller must be holding ProcArrayLock. The found + * entry, if any, can be assumed to be valid as long as the lock remains held. + */ +PGPROC * +BackendPidGetProcWithLock(int pid) +{ + PGPROC *result = NULL; + ProcArrayStruct *arrayP = procArray; + int index; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + + if (proc->pid == pid) + { + result = proc; + break; + } + } + + return result; +} + +/* + * BackendXidGetPid -- get a backend's pid given its XID + * + * Returns 0 if not found or it's a prepared transaction. Note that + * it is up to the caller to be sure that the question remains + * meaningful for long enough for the answer to be used ... + * + * Only main transaction Ids are considered. This function is mainly + * useful for determining what backend owns a lock. + * + * Beware that not every xact has an XID assigned. However, as long as you + * only call this using an XID found on disk, you're safe. + */ +int +BackendXidGetPid(TransactionId xid) +{ + int result = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int index; + + if (xid == InvalidTransactionId) /* never match invalid xid */ + return 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (other_xids[index] == xid) + { + result = proc->pid; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * IsBackendPid -- is a given pid a running backend + * + * This is not called by the backend, but is called by external modules. 
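+ *
+ * Usage sketch with a hypothetical external caller:
+ *
+ *     if (IsBackendPid(pid))
+ *         elog(LOG, "pid %d is a live backend", pid);
+ *
+ * bearing in mind, as noted for BackendPidGetProc() above, that the
+ * answer can become stale as soon as it is returned.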
+ */ +bool +IsBackendPid(int pid) +{ + return (BackendPidGetProc(pid) != NULL); +} + + +/* + * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs. + * + * The array is palloc'd. The number of valid entries is returned into *nvxids. + * + * The arguments allow filtering the set of VXIDs returned. Our own process + * is always skipped. In addition: + * If limitXmin is not InvalidTransactionId, skip processes with + * xmin > limitXmin. + * If excludeXmin0 is true, skip processes with xmin = 0. + * If allDbs is false, skip processes attached to other databases. + * If excludeVacuum isn't zero, skip processes for which + * (statusFlags & excludeVacuum) is not zero. + * + * Note: the purpose of the limitXmin and excludeXmin0 parameters is to + * allow skipping backends whose oldest live snapshot is no older than + * some snapshot we have. Since we examine the procarray with only shared + * lock, there are race conditions: a backend could set its xmin just after + * we look. Indeed, on multiprocessors with weak memory ordering, the + * other backend could have set its xmin *before* we look. We know however + * that such a backend must have held shared ProcArrayLock overlapping our + * own hold of ProcArrayLock, else we would see its xmin update. Therefore, + * any snapshot the other backend is taking concurrently with our scan cannot + * consider any transactions as still running that we think are committed + * (since backends must hold ProcArrayLock exclusive to commit). + */ +VirtualTransactionId * +GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, + bool allDbs, int excludeVacuum, + int *nvxids) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc == MyProc) + continue; + + if (excludeVacuum & statusFlags) + continue; + + if (allDbs || proc->databaseId == MyDatabaseId) + { + /* Fetch xmin just once - might change on us */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + if (excludeXmin0 && !TransactionIdIsValid(pxmin)) + continue; + + /* + * InvalidTransactionId precedes all other XIDs, so a proc that + * hasn't set xmin yet will not be rejected by this test. + */ + if (!TransactionIdIsValid(limitXmin) || + TransactionIdPrecedesOrEquals(pxmin, limitXmin)) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs. + * + * Usage is limited to conflict resolution during recovery on standby servers. + * limitXmin is supplied as either a cutoff with snapshotConflictHorizon + * semantics, or InvalidTransactionId in cases where caller cannot accurately + * determine a safe snapshotConflictHorizon value. + * + * If limitXmin is InvalidTransactionId then we want to kill everybody, + * so we're not worried if they have a snapshot or not, nor does it really + * matter what type of lock we hold. 
Caller must avoid calling here with + * snapshotConflictHorizon style cutoffs that were set to InvalidTransactionId + * during original execution, since that actually indicates that there is + * definitely no need for a recovery conflict (the snapshotConflictHorizon + * convention for InvalidTransactionId values is the opposite of our own!). + * + * All callers that are checking xmins always now supply a valid and useful + * value for limitXmin. The limitXmin is always lower than the lowest + * numbered KnownAssignedXid that is not already a FATAL error. This is + * because we only care about cleanup records that are cleaning up tuple + * versions from committed transactions. In that case they will only occur + * at the point where the record is less than the lowest running xid. That + * allows us to say that if any backend takes a snapshot concurrently with + * us then the conflict assessment made here would never include the snapshot + * that is being derived. So we take LW_SHARED on the ProcArray and allow + * concurrent snapshots when limitXmin is valid. We might think about adding + * Assert(limitXmin < lowest(KnownAssignedXids)) + * but that would not be true in the case of FATAL errors lagging in array, + * but we already know those are bogus anyway, so we skip that test. + * + * If dbOid is valid we skip backends attached to other databases. + * + * Be careful to *not* pfree the result from this function. We reuse + * this array sufficiently often that we use malloc for the result. + */ +VirtualTransactionId * +GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) +{ + static VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. Allow + * result space, remembering room for a terminator. + */ + if (vxids == NULL) + { + vxids = (VirtualTransactionId *) + malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1)); + if (vxids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* Exclude prepared transactions */ + if (proc->pid == 0) + continue; + + if (!OidIsValid(dbOid) || + proc->databaseId == dbOid) + { + /* Fetch xmin just once - can't change on us, but good coding */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * We ignore an invalid pxmin because this means that backend has + * no snapshot currently. We hold a Share lock to avoid contention + * with users taking snapshots. That is not a problem because the + * current xmin is always at least one higher than the latest + * removed xid, so any new snapshot would never conflict with the + * test here. 
+ */ + if (!TransactionIdIsValid(limitXmin) || + (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin))) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + /* add the terminator */ + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + + return vxids; +} + +/* + * CancelVirtualTransaction - used in recovery conflict processing + * + * Returns pid of the process signaled, or 0 if not found. + */ +pid_t +CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) +{ + return SignalVirtualTransaction(vxid, sigmode, true); +} + +pid_t +SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, + bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + pid_t pid = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId procvxid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + if (procvxid.backendId == vxid.backendId && + procvxid.localTransactionId == vxid.localTransactionId) + { + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, vxid.backendId); + } + break; + } + } + + LWLockRelease(ProcArrayLock); + + return pid; +} + +/* + * MinimumActiveBackends --- count backends (other than myself) that are + * in active transactions. Return true if the count exceeds the + * minimum threshold passed. This is used as a heuristic to decide if + * a pre-XLOG-flush delay is worthwhile during commit. + * + * Do not count backends that are blocked waiting for locks, since they are + * not going to get to run until someone else commits. + */ +bool +MinimumActiveBackends(int min) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* Quick short-circuit if no minimum is specified */ + if (min == 0) + return true; + + /* + * Note: for speed, we don't acquire ProcArrayLock. This is a little bit + * bogus, but since we are only testing fields for zero or nonzero, it + * should be OK. The result is only used for heuristic purposes anyway... + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* + * Since we're not holding a lock, need to be prepared to deal with + * garbage, as someone could have incremented numProcs but not yet + * filled the structure. + * + * If someone just decremented numProcs, 'proc' could also point to a + * PGPROC entry that's no longer in the array. It still points to a + * PGPROC struct, though, because freed PGPROC entries just go to the + * free list and are recycled. Its contents are nonsense in that case, + * but that's acceptable for this function. 
+ */ + if (pgprocno == -1) + continue; /* do not count deleted entries */ + if (proc == MyProc) + continue; /* do not count myself */ + if (proc->xid == InvalidTransactionId) + continue; /* do not count if no XID assigned */ + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->waitLock != NULL) + continue; /* do not count if blocked on a lock */ + count++; + if (count >= min) + break; + } + + return count >= min; +} + +/* + * CountDBBackends --- count backends that are using specified database + */ +int +CountDBBackends(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountDBConnections --- counts database backends ignoring any background + * worker processes + */ +int +CountDBConnections(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CancelDBBackends --- cancel backends that are using specified database + */ +void +CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + + /* tell all backends to die */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (databaseid == InvalidOid || proc->databaseId == databaseid) + { + VirtualTransactionId procvxid; + pid_t pid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, procvxid.backendId); + } + } + } + + LWLockRelease(ProcArrayLock); +} + +/* + * CountUserBackends --- count backends that are used by specified user + */ +int +CountUserBackends(Oid roleid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (proc->roleId == roleid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountOtherDBBackends -- check for other backends running in the given DB + * + * If there are other backends in the DB, we will wait a maximum of 5 seconds + * for them to exit. 
Autovacuum backends are encouraged to exit early by + * sending them SIGTERM, but normal user backends are just waited for. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * Returns true if there are (still) other backends in the DB, false if not. + * Also, *nbackends and *nprepared are set to the number of other backends + * and prepared transactions in the DB, respectively. + * + * This function is used to interlock DROP DATABASE and related commands + * against there being any active backends in the target DB --- dropping the + * DB while active backends remain would be a Bad Thing. Note that we cannot + * detect here the possibility of a newly-started backend that is trying to + * connect to the doomed database, so additional interlocking is needed during + * backend startup. The caller should normally hold an exclusive lock on the + * target DB before calling this, which is one reason we mustn't wait + * indefinitely. + */ +bool +CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) +{ + ProcArrayStruct *arrayP = procArray; + +#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */ + int autovac_pids[MAXAUTOVACPIDS]; + int tries; + + /* 50 tries with 100ms sleep between tries makes 5 sec total wait */ + for (tries = 0; tries < 50; tries++) + { + int nautovacs = 0; + bool found = false; + int index; + + CHECK_FOR_INTERRUPTS(); + + *nbackends = *nprepared = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + found = true; + + if (proc->pid == 0) + (*nprepared)++; + else + { + (*nbackends)++; + if ((statusFlags & PROC_IS_AUTOVACUUM) && + nautovacs < MAXAUTOVACPIDS) + autovac_pids[nautovacs++] = proc->pid; + } + } + + LWLockRelease(ProcArrayLock); + + if (!found) + return false; /* no conflicting backends, so done */ + + /* + * Send SIGTERM to any conflicting autovacuums before sleeping. We + * postpone this step until after the loop because we don't want to + * hold ProcArrayLock while issuing kill(). We have no idea what might + * block kill() inside the kernel... + */ + for (index = 0; index < nautovacs; index++) + (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */ + + /* sleep, then try again */ + pg_usleep(100 * 1000L); /* 100ms */ + } + + return true; /* timed out, still conflicts */ +} + +/* + * Terminate existing connections to the specified database. This routine + * is used by the DROP DATABASE command when user has asked to forcefully + * drop the database. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * It doesn't allow to terminate the connections even if there is a one + * backend with the prepared transaction in the target database. 
+ */ +void +TerminateOtherDBBackends(Oid databaseId) +{ + ProcArrayStruct *arrayP = procArray; + List *pids = NIL; + int nprepared = 0; + int i; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < procArray->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + if (proc->pid != 0) + pids = lappend_int(pids, proc->pid); + else + nprepared++; + } + + LWLockRelease(ProcArrayLock); + + if (nprepared > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by prepared transactions", + get_database_name(databaseId)), + errdetail_plural("There is %d prepared transaction using the database.", + "There are %d prepared transactions using the database.", + nprepared, + nprepared))); + + if (pids) + { + ListCell *lc; + + /* + * Check whether we have the necessary rights to terminate other + * sessions. We don't terminate any session until we ensure that we + * have rights on all the sessions to be terminated. These checks are + * the same as we do in pg_terminate_backend. + * + * In this case we don't raise some warnings - like "PID %d is not a + * PostgreSQL server process", because for us already finished session + * is not a problem. + */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* Only allow superusers to signal superuser-owned backends. */ + if (superuser_arg(proc->roleId) && !superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to terminate process"), + errdetail("Only roles with the %s attribute may terminate processes of roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to terminate process"), + errdetail("Only roles with privileges of the role whose process is being terminated or with privileges of the \"%s\" role may terminate this process.", + "pg_signal_backend"))); + } + } + + /* + * There's a race condition here: once we release the ProcArrayLock, + * it's possible for the session to exit before we issue kill. That + * race condition possibility seems too unlikely to worry about. See + * pg_signal_backend. + */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* + * If we have setsid(), signal the backend's whole process + * group + */ +#ifdef HAVE_SETSID + (void) kill(-pid, SIGTERM); +#else + (void) kill(pid, SIGTERM); +#endif + } + } + } +} + +/* + * ProcArraySetReplicationSlotXmin + * + * Install limits to future computations of the xmin horizon to prevent vacuum + * and HOT pruning from removing affected rows still needed by clients with + * replication slots. 
+ */ +void +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, + bool already_locked) +{ + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); + + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + procArray->replication_slot_xmin = xmin; + procArray->replication_slot_catalog_xmin = catalog_xmin; + + if (!already_locked) + LWLockRelease(ProcArrayLock); + + elog(DEBUG1, "xmin required by slots: data %u, catalog %u", + xmin, catalog_xmin); +} + +/* + * ProcArrayGetReplicationSlotXmin + * + * Return the current slot xmin limits. That's useful to be able to remove + * data that's older than those limits. + */ +void +ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin) +{ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (xmin != NULL) + *xmin = procArray->replication_slot_xmin; + + if (catalog_xmin != NULL) + *catalog_xmin = procArray->replication_slot_catalog_xmin; + + LWLockRelease(ProcArrayLock); +} + +/* + * XidCacheRemoveRunningXids + * + * Remove a bunch of TransactionIds from the list of known-running + * subtransactions for my backend. Both the specified xid and those in + * the xids[] array (of length nxids) are removed from the subxids cache. + * latestXid must be the latest XID among the group. + */ +void +XidCacheRemoveRunningXids(TransactionId xid, + int nxids, const TransactionId *xids, + TransactionId latestXid) +{ + int i, + j; + XidCacheStatus *mysubxidstat; + + Assert(TransactionIdIsValid(xid)); + + /* + * We must hold ProcArrayLock exclusively in order to remove transactions + * from the PGPROC array. (See src/backend/access/transam/README.) It's + * possible this could be relaxed since we know this routine is only used + * to abort subtransactions, but pending closer analysis we'd best be + * conservative. + * + * Note that we do not have to be careful about memory ordering of our own + * reads wrt. GetNewTransactionId() here - only this process can modify + * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be + * careful about our own writes being well ordered. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + + /* + * Under normal circumstances xid and xids[] will be in increasing order, + * as will be the entries in subxids. Scan backwards to avoid O(N^2) + * behavior when removing a lot of xids. + */ + for (i = nxids - 1; i >= 0; i--) + { + TransactionId anxid = xids[i]; + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + + /* + * Ordinarily we should have found it, unless the cache has + * overflowed. However it's also possible for this routine to be + * invoked multiple times for the same subtransaction, in case of an + * error during AbortSubTransaction. So instead of Assert, emit a + * debug warning. 
+ */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", anxid); + } + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + /* Ordinarily we should have found it, unless the cache has overflowed */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", xid); + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +#ifdef XIDCACHE_DEBUG + +/* + * Print stats about effectiveness of XID cache + */ +static void +DisplayXidCache(void) +{ + fprintf(stderr, + "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n", + xc_by_recent_xmin, + xc_by_known_xact, + xc_by_my_xact, + xc_by_latest_xid, + xc_by_main_xid, + xc_by_child_xid, + xc_by_known_assigned, + xc_no_overflow, + xc_slow_answer); +} +#endif /* XIDCACHE_DEBUG */ + +/* + * If rel != NULL, return test state appropriate for relation, otherwise + * return state usable for all relations. The latter may consider XIDs as + * not-yet-visible-to-everyone that a state for a specific relation would + * already consider visible-to-everyone. + * + * This needs to be called while a snapshot is active or registered, otherwise + * there are wraparound and other dangers. + * + * See comment for GlobalVisState for details. + */ +GlobalVisState * +GlobalVisTestFor(Relation rel) +{ + GlobalVisState *state = NULL; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(RecentXmin); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + state = &GlobalVisSharedRels; + break; + case VISHORIZON_CATALOG: + state = &GlobalVisCatalogRels; + break; + case VISHORIZON_DATA: + state = &GlobalVisDataRels; + break; + case VISHORIZON_TEMP: + state = &GlobalVisTempRels; + break; + } + + Assert(FullTransactionIdIsValid(state->definitely_needed) && + FullTransactionIdIsValid(state->maybe_needed)); + + return state; +} + +/* + * Return true if it's worth updating the accurate maybe_needed boundary. + * + * As it is somewhat expensive to determine xmin horizons, we don't want to + * repeatedly do so when there is a low likelihood of it being beneficial. + * + * The current heuristic is that we update only if RecentXmin has changed + * since the last update. If the oldest currently running transaction has not + * finished, it is unlikely that recomputing the horizon would be useful. + */ +static bool +GlobalVisTestShouldUpdate(GlobalVisState *state) +{ + /* hasn't been updated yet */ + if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin)) + return true; + + /* + * If the maybe_needed/definitely_needed boundaries are the same, it's + * unlikely to be beneficial to refresh boundaries. + */ + if (FullTransactionIdFollowsOrEquals(state->maybe_needed, + state->definitely_needed)) + return false; + + /* does the last snapshot built have a different xmin? 
*/ + return RecentXmin != ComputeXidHorizonsResultLastXmin; +} + +static void +GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) +{ + GlobalVisSharedRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->shared_oldest_nonremovable); + GlobalVisCatalogRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->catalog_oldest_nonremovable); + GlobalVisDataRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->data_oldest_nonremovable); + GlobalVisTempRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->temp_oldest_nonremovable); + + /* + * In longer running transactions it's possible that transactions we + * previously needed to treat as running aren't around anymore. So update + * definitely_needed to not be earlier than maybe_needed. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + GlobalVisDataRels.definitely_needed); + GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed; + + ComputeXidHorizonsResultLastXmin = RecentXmin; +} + +/* + * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels + * using ComputeXidHorizons(). + */ +static void +GlobalVisUpdate(void) +{ + ComputeXidHorizonsResult horizons; + + /* updates the horizons as a side-effect */ + ComputeXidHorizons(&horizons); +} + +/* + * Return true if no snapshot still considers fxid to be running. + * + * The state passed needs to have been initialized for the relation fxid is + * from (NULL is also OK), otherwise the result may not be correct. + * + * See comment for GlobalVisState for details. + */ +bool +GlobalVisTestIsRemovableFullXid(GlobalVisState *state, + FullTransactionId fxid) +{ + /* + * If fxid is older than maybe_needed bound, it definitely is visible to + * everyone. + */ + if (FullTransactionIdPrecedes(fxid, state->maybe_needed)) + return true; + + /* + * If fxid is >= definitely_needed bound, it is very likely to still be + * considered running. + */ + if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed)) + return false; + + /* + * fxid is between maybe_needed and definitely_needed, i.e. there might or + * might not exist a snapshot considering fxid running. If it makes sense, + * update boundaries and recheck. + */ + if (GlobalVisTestShouldUpdate(state)) + { + GlobalVisUpdate(); + + Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed)); + + return FullTransactionIdPrecedes(fxid, state->maybe_needed); + } + else + return false; +} + +/* + * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * + * It is crucial that this only gets called for xids from a source that + * protects against xid wraparounds (e.g. from a table and thus protected by + * relfrozenxid). + */ +bool +GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +{ + FullTransactionId fxid; + + /* + * Convert 32 bit argument to FullTransactionId. We can do so safely + * because we know the xid has to, at the very least, be between + * [oldestXid, nextXid), i.e. within 2 billion of xid. 
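The maybe_needed/definitely_needed pair above supports a cheap three-way test: XIDs older than maybe_needed are certainly removable, XIDs at or beyond definitely_needed certainly are not, and only the band in between may justify recomputing horizons. The standalone sketch below models that decision with plain 64-bit counters standing in for FullTransactionId and a stub refresh() in place of the real horizon computation; all names in it are invented for illustration.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct
{
    uint64_t maybe_needed;      /* approximate horizon; may lag behind */
    uint64_t definitely_needed; /* conservative horizon */
} VisState;

/* Stub for an expensive horizon recomputation. */
static void
refresh(VisState *state)
{
    state->maybe_needed = state->definitely_needed;
}

static bool
is_removable(VisState *state, uint64_t fxid)
{
    if (fxid < state->maybe_needed)
        return true;            /* older than the cheap bound: removable */
    if (fxid >= state->definitely_needed)
        return false;           /* at/after the conservative bound: keep */

    /* In the gray zone: pay for an accurate horizon, then retest. */
    refresh(state);
    return fxid < state->maybe_needed;
}

int
main(void)
{
    VisState state = {.maybe_needed = 100, .definitely_needed = 150};

    printf("%d %d %d\n",
           is_removable(&state, 50),    /* 1: clearly old */
           is_removable(&state, 200),   /* 0: clearly recent */
           is_removable(&state, 120));  /* decided only after refresh */
    return 0;
}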
To avoid taking a + * lock to determine either, we can just compare with + * state->definitely_needed, which was based on those value at the time + * the current snapshot was built. + */ + fxid = FullXidRelativeTo(state->definitely_needed, xid); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Return FullTransactionId below which all transactions are not considered + * running anymore. + * + * Note: This is less efficient than testing with + * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate + * cutoff, even in the case all the XIDs compared with the cutoff are outside + * [maybe_needed, definitely_needed). + */ +FullTransactionId +GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state) +{ + /* acquire accurate horizon if not already done */ + if (GlobalVisTestShouldUpdate(state)) + GlobalVisUpdate(); + + return state->maybe_needed; +} + +/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */ +TransactionId +GlobalVisTestNonRemovableHorizon(GlobalVisState *state) +{ + FullTransactionId cutoff; + + cutoff = GlobalVisTestNonRemovableFullHorizon(state); + + return XidFromFullTransactionId(cutoff); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableFullXid(), see their comments. + */ +bool +GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableXid(), see their comments. + */ +bool +GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableXid(state, xid); +} + +/* + * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it + * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). + * + * Be very careful about when to use this function. It can only safely be used + * when there is a guarantee that xid is within MaxTransactionId / 2 xids of + * rel. That e.g. can be guaranteed if the caller assures a snapshot is + * held by the backend and xid is from a table (where vacuum/freezing ensures + * the xid has to be within that range), or if xid is from the procarray and + * prevents xid wraparound that way. + */ +static inline FullTransactionId +FullXidRelativeTo(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdInAllowableRange(xid); + + return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + + (int32) (xid - rel_xid)); +} + + +/* ---------------------------------------------- + * KnownAssignedTransactionIds sub-module + * ---------------------------------------------- + */ + +/* + * In Hot Standby mode, we maintain a list of transactions that are (or were) + * running on the primary at the current point in WAL. These XIDs must be + * treated as running by standby transactions, even though they are not in + * the standby server's PGPROC array. + * + * We record all XIDs that we know have been assigned. That includes all the + * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have + * been assigned. 
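The arithmetic in FullXidRelativeTo above depends on unsigned subtraction: casting (xid - rel_xid) to a signed 32-bit value yields the correct offset in either direction, including across a 32-bit wraparound, as long as xid lies within about two billion of the reference. A small self-contained check of that arithmetic, using an invented helper name and plain stdint types instead of the backend's FullTransactionId, looks like this:

#include <stdint.h>
#include <stdio.h>

/* Widen a 32-bit counter using a nearby 64-bit reference value.  Works
 * both just above and just below the reference, including across a
 * 32-bit wraparound, provided the distance stays under 2^31. */
static uint64_t
widen_relative_to(uint64_t ref_full, uint32_t xid)
{
    uint32_t ref_xid = (uint32_t) ref_full;     /* low 32 bits of reference */
    int32_t  delta = (int32_t) (xid - ref_xid); /* signed distance */

    return ref_full + delta;
}

int
main(void)
{
    /* Reference sits just past a 32-bit wraparound boundary. */
    uint64_t ref = ((uint64_t) 3 << 32) + 5;

    /* An xid slightly behind the reference, from before the wrap. */
    printf("%llu\n", (unsigned long long) widen_relative_to(ref, 0xFFFFFFF0u));
    /* prints 3*2^32 - 16, correctly placed below the reference */

    /* An xid slightly ahead of the reference. */
    printf("%llu\n", (unsigned long long) widen_relative_to(ref, 25));
    /* prints 3*2^32 + 25 */
    return 0;
}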
We can deduce the existence of unobserved XIDs because we + * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids + * list expands as new XIDs are observed or inferred, and contracts when + * transaction completion records arrive. + * + * During hot standby we do not fret too much about the distinction between + * top-level XIDs and subtransaction XIDs. We store both together in the + * KnownAssignedXids list. In backends, this is copied into snapshots in + * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot() + * doesn't care about the distinction either. Subtransaction XIDs are + * effectively treated as top-level XIDs and in the typical case pg_subtrans + * links are *not* maintained (which does not affect visibility). + * + * We have room in KnownAssignedXids and in snapshots to hold maxProcs * + * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must + * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at + * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these + * records, we mark the subXIDs as children of the top XID in pg_subtrans, + * and then remove them from KnownAssignedXids. This prevents overflow of + * KnownAssignedXids and snapshots, at the cost that status checks for these + * subXIDs will take a slower path through TransactionIdIsInProgress(). + * This means that KnownAssignedXids is not necessarily complete for subXIDs, + * though it should be complete for top-level XIDs; this is the same situation + * that holds with respect to the PGPROC entries in normal running. + * + * When we throw away subXIDs from KnownAssignedXids, we need to keep track of + * that, similarly to tracking overflow of a PGPROC's subxids array. We do + * that by remembering the lastOverflowedXid, ie the last thrown-away subXID. + * As long as that is within the range of interesting XIDs, we have to assume + * that subXIDs are missing from snapshots. (Note that subXID overflow occurs + * on primary when 65th subXID arrives, whereas on standby it occurs when 64th + * subXID arrives - that is not an error.) + * + * Should a backend on primary somehow disappear before it can write an abort + * record, then we just leave those XIDs in KnownAssignedXids. They actually + * aborted but we think they were running; the distinction is irrelevant + * because either way any changes done by the transaction are not visible to + * backends in the standby. We prune KnownAssignedXids when + * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the + * array due to such dead XIDs. + */ + +/* + * RecordKnownAssignedTransactionIds + * Record the given XID in KnownAssignedXids, as well as any preceding + * unobserved XIDs. + * + * RecordKnownAssignedTransactionIds() should be run for *every* WAL record + * associated with a transaction. Must be called for each record after we + * have executed StartupCLOG() et al, since we must ExtendCLOG() etc.. + * + * Called during recovery in analogy with and in place of GetNewTransactionId() + */ +void +RecordKnownAssignedTransactionIds(TransactionId xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(latestObservedXid)); + + elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u", + xid, latestObservedXid); + + /* + * When a newly observed xid arrives, it is frequently the case that it is + * *not* the next xid in sequence. 
When this occurs, we must treat the + * intervening xids as running also. + */ + if (TransactionIdFollows(xid, latestObservedXid)) + { + TransactionId next_expected_xid; + + /* + * Extend subtrans like we do in GetNewTransactionId() during normal + * operation using individual extend steps. Note that we do not need + * to extend clog since its extensions are WAL logged. + * + * This part has to be done regardless of standbyState since we + * immediately start assigning subtransactions to their toplevel + * transactions. + */ + next_expected_xid = latestObservedXid; + while (TransactionIdPrecedes(next_expected_xid, xid)) + { + TransactionIdAdvance(next_expected_xid); + ExtendSUBTRANS(next_expected_xid); + } + Assert(next_expected_xid == xid); + + /* + * If the KnownAssignedXids machinery isn't up yet, there's nothing + * more to do since we don't track assigned xids yet. + */ + if (standbyState <= STANDBY_INITIALIZED) + { + latestObservedXid = xid; + return; + } + + /* + * Add (latestObservedXid, xid] onto the KnownAssignedXids array. + */ + next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + KnownAssignedXidsAdd(next_expected_xid, xid, false); + + /* + * Now we can advance latestObservedXid + */ + latestObservedXid = xid; + + /* ShmemVariableCache->nextXid must be beyond any observed xid */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + } +} + +/* + * ExpireTreeKnownAssignedTransactionIds + * Remove the given XIDs from KnownAssignedXids. + * + * Called during recovery in analogy with and in place of ProcArrayEndTransaction() + */ +void +ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, + TransactionId *subxids, TransactionId max_xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); + + /* As in ProcArrayEndTransaction, advance latestCompletedXid */ + MaintainLatestCompletedXidRecovery(max_xid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireAllKnownAssignedTransactionIds + * Remove all entries in KnownAssignedXids and reset lastOverflowedXid. + */ +void +ExpireAllKnownAssignedTransactionIds(void) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + KnownAssignedXidsRemovePreceding(InvalidTransactionId); + + /* + * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after + * the call of this function. But do this for unification with what + * ExpireOldKnownAssignedTransactionIds() do. + */ + procArray->lastOverflowedXid = InvalidTransactionId; + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireOldKnownAssignedTransactionIds + * Remove KnownAssignedXids entries preceding the given XID and + * potentially reset lastOverflowedXid. + */ +void +ExpireOldKnownAssignedTransactionIds(TransactionId xid) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Reset lastOverflowedXid if we know all transactions that have been + * possibly running are being gone. Not doing so could cause an incorrect + * lastOverflowedXid value, which makes extra snapshots be marked as + * suboverflowed. 
+ */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid)) + procArray->lastOverflowedXid = InvalidTransactionId; + KnownAssignedXidsRemovePreceding(xid); + LWLockRelease(ProcArrayLock); +} + +/* + * KnownAssignedTransactionIdsIdleMaintenance + * Opportunistically do maintenance work when the startup process + * is about to go idle. + */ +void +KnownAssignedTransactionIdsIdleMaintenance(void) +{ + KnownAssignedXidsCompress(KAX_STARTUP_PROCESS_IDLE, false); +} + + +/* + * Private module functions to manipulate KnownAssignedXids + * + * There are 5 main uses of the KnownAssignedXids data structure: + * + * * backends taking snapshots - all valid XIDs need to be copied out + * * backends seeking to determine presence of a specific XID + * * startup process adding new known-assigned XIDs + * * startup process removing specific XIDs as transactions end + * * startup process pruning array when special WAL records arrive + * + * This data structure is known to be a hot spot during Hot Standby, so we + * go to some lengths to make these operations as efficient and as concurrent + * as possible. + * + * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes + * order, to be exact --- to allow binary search for specific XIDs. Note: + * in general TransactionIdPrecedes would not provide a total order, but + * we know that the entries present at any instant should not extend across + * a large enough fraction of XID space to wrap around (the primary would + * shut down for fear of XID wrap long before that happens). So it's OK to + * use TransactionIdPrecedes as a binary-search comparator. + * + * It's cheap to maintain the sortedness during insertions, since new known + * XIDs are always reported in XID order; we just append them at the right. + * + * To keep individual deletions cheap, we need to allow gaps in the array. + * This is implemented by marking array elements as valid or invalid using + * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done + * by setting KnownAssignedXidsValid[i] to false, *without* clearing the + * XID entry itself. This preserves the property that the XID entries are + * sorted, so we can do binary searches easily. Periodically we compress + * out the unused entries; that's much cheaper than having to compress the + * array immediately on every deletion. + * + * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[] + * are those with indexes tail <= i < head; items outside this subscript range + * have unspecified contents. When head reaches the end of the array, we + * force compression of unused entries rather than wrapping around, since + * allowing wraparound would greatly complicate the search logic. We maintain + * an explicit tail pointer so that pruning of old XIDs can be done without + * immediately moving the array contents. In most cases only a small fraction + * of the array contains valid entries at any instant. + * + * Although only the startup process can ever change the KnownAssignedXids + * data structure, we still need interlocking so that standby backends will + * not observe invalid intermediate states. The convention is that backends + * must hold shared ProcArrayLock to examine the array. To remove XIDs from + * the array, the startup process must hold ProcArrayLock exclusively, for + * the usual transactional reasons (compare commit/abort of a transaction + * during normal running). Compressing unused entries out of the array + * likewise requires exclusive lock. 
To add XIDs to the array, we just insert + * them into slots to the right of the head pointer and then advance the head + * pointer. This wouldn't require any lock at all, except that on machines + * with weak memory ordering we need to be careful that other processors + * see the array element changes before they see the head pointer change. + * We handle this by using a spinlock to protect reads and writes of the + * head/tail pointers. (We could dispense with the spinlock if we were to + * create suitable memory access barrier primitives and use those instead.) + * The spinlock must be taken to read or write the head/tail pointers unless + * the caller holds ProcArrayLock exclusively. + * + * Algorithmic analysis: + * + * If we have a maximum of M slots, with N XIDs currently spread across + * S elements then we have N <= S <= M always. + * + * * Adding a new XID is O(1) and needs little locking (unless compression + * must happen) + * * Compressing the array is O(S) and requires exclusive lock + * * Removing an XID is O(logS) and requires exclusive lock + * * Taking a snapshot is O(S) and requires shared lock + * * Checking for an XID is O(logS) and requires shared lock + * + * In comparison, using a hash table for KnownAssignedXids would mean that + * taking snapshots would be O(M). If we can maintain S << M then the + * sorted array technique will deliver significantly faster snapshots. + * If we try to keep S too small then we will spend too much time compressing, + * so there is an optimal point for any workload mix. We use a heuristic to + * decide when to compress the array, though trimming also helps reduce + * frequency of compressing. The heuristic requires us to track the number of + * currently valid XIDs in the array (N). Except in special cases, we'll + * compress when S >= 2N. Bounding S at 2N in turn bounds the time for + * taking a snapshot to be O(N), which it would have to be anyway. + */ + + +/* + * Compress KnownAssignedXids by shifting valid data down to the start of the + * array, removing any gaps. + * + * A compression step is forced if "reason" is KAX_NO_SPACE, otherwise + * we do it only if a heuristic indicates it's a good time to do it. + * + * Compression requires holding ProcArrayLock in exclusive mode. + * Caller must pass haveLock = true if it already holds the lock. + */ +static void +KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock) +{ + ProcArrayStruct *pArray = procArray; + int head, + tail, + nelements; + int compress_index; + int i; + + /* Counters for compression heuristics */ + static unsigned int transactionEndsCounter; + static TimestampTz lastCompressTs; + + /* Tuning constants */ +#define KAX_COMPRESS_FREQUENCY 128 /* in transactions */ +#define KAX_COMPRESS_IDLE_INTERVAL 1000 /* in ms */ + + /* + * Since only the startup process modifies the head/tail pointers, we + * don't need a lock to read them here. + */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + nelements = head - tail; + + /* + * If we can choose whether to compress, use a heuristic to avoid + * compressing too often or not often enough. "Compress" here simply + * means moving the values to the beginning of the array, so it is not as + * complex or costly as typical data compression algorithms. + */ + if (nelements == pArray->numKnownAssignedXids) + { + /* + * When there are no gaps between head and tail, don't bother to + * compress, except in the KAX_NO_SPACE case where we must compress to + * create some space after the head. 
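The head-pointer protocol described above (fill slots beyond the head, then publish the new head so readers never see unwritten entries) is essentially a release/acquire publication pattern; the real code achieves the ordering with a spinlock around the head/tail pointers rather than explicit atomics. A simplified single-producer model using C11 atomics, with names invented for the sketch, might look like this:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_XIDS 1024

static uint32_t xids[MAX_XIDS];
static _Atomic int head;        /* published element count */

/* Writer (single producer): fill slots beyond head, then publish. */
static void
append_xids(const uint32_t *newxids, int n)
{
    int h = atomic_load_explicit(&head, memory_order_relaxed);

    for (int i = 0; i < n; i++)
        xids[h + i] = newxids[i];

    /* Release store: array writes become visible before the new head. */
    atomic_store_explicit(&head, h + n, memory_order_release);
}

/* Reader: acquire the head, then everything below it is safe to read. */
static void
scan_xids(void)
{
    int h = atomic_load_explicit(&head, memory_order_acquire);

    for (int i = 0; i < h; i++)
        printf("%u ", xids[i]);
    printf("\n");
}

int
main(void)
{
    uint32_t batch[] = {100, 101, 102};

    append_xids(batch, 3);
    scan_xids();
    return 0;
}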
+ */ + if (reason != KAX_NO_SPACE) + return; + } + else if (reason == KAX_TRANSACTION_END) + { + /* + * Consider compressing only once every so many commits. Frequency + * determined by benchmarks. + */ + if ((transactionEndsCounter++) % KAX_COMPRESS_FREQUENCY != 0) + return; + + /* + * Furthermore, compress only if the used part of the array is less + * than 50% full (see comments above). + */ + if (nelements < 2 * pArray->numKnownAssignedXids) + return; + } + else if (reason == KAX_STARTUP_PROCESS_IDLE) + { + /* + * We're about to go idle for lack of new WAL, so we might as well + * compress. But not too often, to avoid ProcArray lock contention + * with readers. + */ + if (lastCompressTs != 0) + { + TimestampTz compress_after; + + compress_after = TimestampTzPlusMilliseconds(lastCompressTs, + KAX_COMPRESS_IDLE_INTERVAL); + if (GetCurrentTimestamp() < compress_after) + return; + } + } + + /* Need to compress, so get the lock if we don't have it. */ + if (!haveLock) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * We compress the array by reading the valid values from tail to head, + * re-aligning data to 0th element. + */ + compress_index = 0; + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + KnownAssignedXids[compress_index] = KnownAssignedXids[i]; + KnownAssignedXidsValid[compress_index] = true; + compress_index++; + } + } + Assert(compress_index == pArray->numKnownAssignedXids); + + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = compress_index; + + if (!haveLock) + LWLockRelease(ProcArrayLock); + + /* Update timestamp for maintenance. No need to hold lock for this. */ + lastCompressTs = GetCurrentTimestamp(); +} + +/* + * Add xids into KnownAssignedXids at the head of the array. + * + * xids from from_xid to to_xid, inclusive, are added to the array. + * + * If exclusive_lock is true then caller already holds ProcArrayLock in + * exclusive mode, so we need no extra locking here. Else caller holds no + * lock, so we need to be sure we maintain sufficient interlocks against + * concurrent readers. (Only the startup process ever calls this, so no need + * to worry about concurrent writers.) + */ +static void +KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock) +{ + ProcArrayStruct *pArray = procArray; + TransactionId next_xid; + int head, + tail; + int nxids; + int i; + + Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid)); + + /* + * Calculate how many array slots we'll need. Normally this is cheap; in + * the unusual case where the XIDs cross the wrap point, we do it the hard + * way. + */ + if (to_xid >= from_xid) + nxids = to_xid - from_xid + 1; + else + { + nxids = 1; + next_xid = from_xid; + while (TransactionIdPrecedes(next_xid, to_xid)) + { + nxids++; + TransactionIdAdvance(next_xid); + } + } + + /* + * Since only the startup process modifies the head/tail pointers, we + * don't need a lock to read them here. + */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + + Assert(head >= 0 && head <= pArray->maxKnownAssignedXids); + Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids); + + /* + * Verify that insertions occur in TransactionId sequence. Note that even + * if the last existing element is marked invalid, it must still have a + * correctly sequenced XID value. 
+ */ + if (head > tail && + TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid)) + { + KnownAssignedXidsDisplay(LOG); + elog(ERROR, "out-of-order XID insertion in KnownAssignedXids"); + } + + /* + * If our xids won't fit in the remaining space, compress out free space + */ + if (head + nxids > pArray->maxKnownAssignedXids) + { + KnownAssignedXidsCompress(KAX_NO_SPACE, exclusive_lock); + + head = pArray->headKnownAssignedXids; + /* note: we no longer care about the tail pointer */ + + /* + * If it still won't fit then we're out of memory + */ + if (head + nxids > pArray->maxKnownAssignedXids) + elog(ERROR, "too many KnownAssignedXids"); + } + + /* Now we can insert the xids into the space starting at head */ + next_xid = from_xid; + for (i = 0; i < nxids; i++) + { + KnownAssignedXids[head] = next_xid; + KnownAssignedXidsValid[head] = true; + TransactionIdAdvance(next_xid); + head++; + } + + /* Adjust count of number of valid entries */ + pArray->numKnownAssignedXids += nxids; + + /* + * Now update the head pointer. We use a spinlock to protect this + * pointer, not because the update is likely to be non-atomic, but to + * ensure that other processors see the above array updates before they + * see the head pointer change. + * + * If we're holding ProcArrayLock exclusively, there's no need to take the + * spinlock. + */ + if (exclusive_lock) + pArray->headKnownAssignedXids = head; + else + { + SpinLockAcquire(&pArray->known_assigned_xids_lck); + pArray->headKnownAssignedXids = head; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } +} + +/* + * KnownAssignedXidsSearch + * + * Searches KnownAssignedXids for a specific xid and optionally removes it. + * Returns true if it was found, false if not. + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + * Exclusive lock must be held for remove = true. + */ +static bool +KnownAssignedXidsSearch(TransactionId xid, bool remove) +{ + ProcArrayStruct *pArray = procArray; + int first, + last; + int head; + int tail; + int result_index = -1; + + if (remove) + { + /* we hold ProcArrayLock exclusively, so no need for spinlock */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + } + else + { + /* take spinlock to ensure we see up-to-date array contents */ + SpinLockAcquire(&pArray->known_assigned_xids_lck); + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } + + /* + * Standard binary search. Note we can ignore the KnownAssignedXidsValid + * array here, since even invalid entries will contain sorted XIDs. + */ + first = tail; + last = head - 1; + while (first <= last) + { + int mid_index; + TransactionId mid_xid; + + mid_index = (first + last) / 2; + mid_xid = KnownAssignedXids[mid_index]; + + if (xid == mid_xid) + { + result_index = mid_index; + break; + } + else if (TransactionIdPrecedes(xid, mid_xid)) + last = mid_index - 1; + else + first = mid_index + 1; + } + + if (result_index < 0) + return false; /* not in array */ + + if (!KnownAssignedXidsValid[result_index]) + return false; /* in array, but invalid */ + + if (remove) + { + KnownAssignedXidsValid[result_index] = false; + + pArray->numKnownAssignedXids--; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * If we're removing the tail element then advance tail pointer over + * any invalid elements. This will speed future searches. 
+ */ + if (result_index == tail) + { + tail++; + while (tail < head && !KnownAssignedXidsValid[tail]) + tail++; + if (tail >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = tail; + } + } + } + + return true; +} + +/* + * Is the specified XID present in KnownAssignedXids[]? + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + */ +static bool +KnownAssignedXidExists(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + return KnownAssignedXidsSearch(xid, false); +} + +/* + * Remove the specified XID from KnownAssignedXids[]. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemove(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid); + + /* + * Note: we cannot consider it an error to remove an XID that's not + * present. We intentionally remove subxact IDs while processing + * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be + * removed again when the top-level xact commits or aborts. + * + * It might be possible to track such XIDs to distinguish this case from + * actual errors, but it would be complicated and probably not worth it. + * So, just ignore the search result. + */ + (void) KnownAssignedXidsSearch(xid, true); +} + +/* + * KnownAssignedXidsRemoveTree + * Remove xid (if it's not InvalidTransactionId) and all the subxids. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids) +{ + int i; + + if (TransactionIdIsValid(xid)) + KnownAssignedXidsRemove(xid); + + for (i = 0; i < nsubxids; i++) + KnownAssignedXidsRemove(subxids[i]); + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(KAX_TRANSACTION_END, true); +} + +/* + * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid + * then clear the whole table. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemovePreceding(TransactionId removeXid) +{ + ProcArrayStruct *pArray = procArray; + int count = 0; + int head, + tail, + i; + + if (!TransactionIdIsValid(removeXid)) + { + elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids"); + pArray->numKnownAssignedXids = 0; + pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0; + return; + } + + elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid); + + /* + * Mark entries invalid starting at the tail. Since array is sorted, we + * can stop as soon as we reach an entry >= removeXid. + */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + if (TransactionIdFollowsOrEquals(knownXid, removeXid)) + break; + + if (!StandbyTransactionIdIsPrepared(knownXid)) + { + KnownAssignedXidsValid[i] = false; + count++; + } + } + } + + pArray->numKnownAssignedXids -= count; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * Advance the tail pointer if we've marked the tail item invalid. 
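The search strategy above works because logically deleted entries keep their sorted XID values, so the binary search can ignore the validity flags entirely and only consult them on the final hit. The standalone model below captures that idea with plain integer comparison instead of TransactionIdPrecedes (so it ignores XID wraparound) and invented function names:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A sorted xid array with a parallel validity array.  Deleted entries
 * keep their (still sorted) xid value, so binary search ignores the
 * flags and only checks them on the final hit. */
static bool
sorted_lookup(const uint32_t *xids, const bool *valid,
              int tail, int head, uint32_t target)
{
    int lo = tail;
    int hi = head - 1;

    while (lo <= hi)
    {
        int mid = lo + (hi - lo) / 2;

        if (xids[mid] == target)
            return valid[mid];   /* present only if not logically deleted */
        else if (target < xids[mid])
            hi = mid - 1;
        else
            lo = mid + 1;
    }
    return false;
}

int
main(void)
{
    uint32_t xids[] = {10, 11, 12, 13, 14};
    bool     valid[] = {true, false, true, true, false};

    printf("%d %d %d\n",
           sorted_lookup(xids, valid, 0, 5, 12),   /* 1: present */
           sorted_lookup(xids, valid, 0, 5, 11),   /* 0: logically deleted */
           sorted_lookup(xids, valid, 0, 5, 99));  /* 0: absent */
    return 0;
}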
+ */ + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + break; + } + if (i >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = i; + } + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(KAX_PRUNE, true); +} + +/* + * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids. + * We filter out anything >= xmax. + * + * Returns the number of XIDs stored into xarray[]. Caller is responsible + * that array is large enough. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax) +{ + TransactionId xtmp = InvalidTransactionId; + + return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax); +} + +/* + * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus + * we reduce *xmin to the lowest xid value seen if not already lower. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, + TransactionId xmax) +{ + int count = 0; + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. We can stop + * once we reach the initially seen head, since we are certain that an xid + * cannot enter and then leave the array while we hold ProcArrayLock. We + * might miss newly-added xids, but they should be >= xmax so irrelevant + * anyway. + * + * Must take spinlock to ensure we see up-to-date array contents. + */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + /* + * Update xmin if required. Only the first XID need be checked, + * since the array is sorted. + */ + if (count == 0 && + TransactionIdPrecedes(knownXid, *xmin)) + *xmin = knownXid; + + /* + * Filter out anything >= xmax, again relying on sorted property + * of array. + */ + if (TransactionIdIsValid(xmax) && + TransactionIdFollowsOrEquals(knownXid, xmax)) + break; + + /* Add knownXid into output array */ + xarray[count++] = knownXid; + } + } + + return count; +} + +/* + * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId + * if nothing there. + */ +static TransactionId +KnownAssignedXidsGetOldestXmin(void) +{ + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. + */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + return KnownAssignedXids[i]; + } + + return InvalidTransactionId; +} + +/* + * Display KnownAssignedXids to provide debug trail + * + * Currently this is only called within startup process, so we need no + * special locking. + * + * Note this is pretty expensive, and much of the expense will be incurred + * even if the elog message will get discarded. It's not currently called + * in any performance-critical places, however, so no need to be tenser. 
+ */ +static void +KnownAssignedXidsDisplay(int trace_level) +{ + ProcArrayStruct *pArray = procArray; + StringInfoData buf; + int head, + tail, + i; + int nxids = 0; + + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + initStringInfo(&buf); + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + nxids++; + appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); + } + } + + elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s", + nxids, + pArray->numKnownAssignedXids, + pArray->tailKnownAssignedXids, + pArray->headKnownAssignedXids, + buf.data); + + pfree(buf.data); +} + +/* + * KnownAssignedXidsReset + * Resets KnownAssignedXids to be empty + */ +static void +KnownAssignedXidsReset(void) +{ + ProcArrayStruct *pArray = procArray; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pArray->numKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = 0; + + LWLockRelease(ProcArrayLock); +} diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c new file mode 100644 index 0000000..c85cb5c --- /dev/null +++ b/src/backend/storage/ipc/procsignal.c @@ -0,0 +1,688 @@ +/*------------------------------------------------------------------------- + * + * procsignal.c + * Routines for interprocess signaling + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/procsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/parallel.h" +#include "port/pg_bitutils.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/logicalworker.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "storage/smgr.h" +#include "storage/sinval.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" + +/* + * The SIGUSR1 signal is multiplexed to support signaling multiple event + * types. The specific reason is communicated via flags in shared memory. + * We keep a boolean flag for each possible "reason", so that different + * reasons can be signaled to a process concurrently. (However, if the same + * reason is signaled more than once nearly simultaneously, the process may + * observe it only once.) + * + * Each process that wants to receive signals registers its process ID + * in the ProcSignalSlots array. The array is indexed by backend ID to make + * slot allocation simple, and to avoid having to search the array when you + * know the backend ID of the process you're signaling. (We do support + * signaling without backend ID, but it's a bit less efficient.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. + * + * pss_signalFlags are intended to be set in cases where we don't need to + * keep track of whether or not the target process has handled the signal, + * but sometimes we need confirmation, as when making a global state change + * that cannot be considered complete until all backends have taken notice + * of it. 
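The multiplexing scheme described above boils down to: set a per-reason flag, send one shared signal, and have the handler inspect every flag because several reasons can be pending at once. A self-contained model of that pattern follows; the reason names and helpers are invented for the sketch, and raise() is used so the demonstration signals itself.

#include <signal.h>
#include <stdio.h>

/* One signal, several "reasons": the sender sets a per-reason flag
 * before raising SIGUSR1, and the handler checks every flag. */
enum { REASON_CATCHUP, REASON_NOTIFY, NUM_REASONS };

static volatile sig_atomic_t pending[NUM_REASONS];
static volatile sig_atomic_t seen[NUM_REASONS];

static void
handle_sigusr1(int signo)
{
    (void) signo;
    for (int r = 0; r < NUM_REASONS; r++)
    {
        if (pending[r])
        {
            pending[r] = 0;     /* consume the flag */
            seen[r] = 1;
        }
    }
}

static void
send_reason(int reason)
{
    pending[reason] = 1;        /* set the flag first... */
    raise(SIGUSR1);             /* ...then deliver the shared signal */
}

int
main(void)
{
    signal(SIGUSR1, handle_sigusr1);

    send_reason(REASON_NOTIFY);

    printf("catchup=%d notify=%d\n",
           (int) seen[REASON_CATCHUP], (int) seen[REASON_NOTIFY]);
    return 0;
}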
For such use cases, we set a bit in pss_barrierCheckMask and then + * increment the current "barrier generation"; when the new barrier generation + * (or greater) appears in the pss_barrierGeneration flag of every process, + * we know that the message has been received everywhere. + */ +typedef struct +{ + volatile pid_t pss_pid; + volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS]; + pg_atomic_uint64 pss_barrierGeneration; + pg_atomic_uint32 pss_barrierCheckMask; + ConditionVariable pss_barrierCV; +} ProcSignalSlot; + +/* + * Information that is global to the entire ProcSignal system can be stored + * here. + * + * psh_barrierGeneration is the highest barrier generation in existence. + */ +typedef struct +{ + pg_atomic_uint64 psh_barrierGeneration; + ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER]; +} ProcSignalHeader; + +/* + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) + */ +#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES) + +/* Check whether the relevant type bit is set in the flags. */ +#define BARRIER_SHOULD_CHECK(flags, type) \ + (((flags) & (((uint32) 1) << (uint32) (type))) != 0) + +/* Clear the relevant type bit from the flags. */ +#define BARRIER_CLEAR_BIT(flags, type) \ + ((flags) &= ~(((uint32) 1) << (uint32) (type))) + +static ProcSignalHeader *ProcSignal = NULL; +static ProcSignalSlot *MyProcSignalSlot = NULL; + +static bool CheckProcSignal(ProcSignalReason reason); +static void CleanupProcSignalState(int status, Datum arg); +static void ResetProcSignalBarrierBits(uint32 flags); + +/* + * ProcSignalShmemSize + * Compute space needed for ProcSignal's shared memory + */ +Size +ProcSignalShmemSize(void) +{ + Size size; + + size = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot)); + size = add_size(size, offsetof(ProcSignalHeader, psh_slot)); + return size; +} + +/* + * ProcSignalShmemInit + * Allocate and initialize ProcSignal's shared memory + */ +void +ProcSignalShmemInit(void) +{ + Size size = ProcSignalShmemSize(); + bool found; + + ProcSignal = (ProcSignalHeader *) + ShmemInitStruct("ProcSignal", size, &found); + + /* If we're first, initialize. */ + if (!found) + { + int i; + + pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0); + + for (i = 0; i < NumProcSignalSlots; ++i) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + slot->pss_pid = 0; + MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags)); + pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0); + ConditionVariableInit(&slot->pss_barrierCV); + } + } +} + +/* + * ProcSignalInit + * Register the current process in the ProcSignal array + * + * The passed index should be my BackendId if the process has one, + * or MaxBackends + aux process type if not. + */ +void +ProcSignalInit(int pss_idx) +{ + ProcSignalSlot *slot; + uint64 barrier_generation; + + Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots); + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + + /* sanity check */ + if (slot->pss_pid != 0) + elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty", + MyProcPid, pss_idx); + + /* Clear out any leftover signal reasons */ + MemSet(slot->pss_signalFlags, 0, NUM_PROCSIGNALS * sizeof(sig_atomic_t)); + + /* + * Initialize barrier state. 
Since we're a brand-new process, there + * shouldn't be any leftover backend-private state that needs to be + * updated. Therefore, we can broadcast the latest barrier generation and + * disregard any previously-set check bits. + * + * NB: This only works if this initialization happens early enough in the + * startup sequence that we haven't yet cached any state that might need + * to be invalidated. That's also why we have a memory barrier here, to be + * sure that any later reads of memory happen strictly after this. + */ + pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0); + barrier_generation = + pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation); + pg_memory_barrier(); + + /* Mark slot with my PID */ + slot->pss_pid = MyProcPid; + + /* Remember slot location for CheckProcSignal */ + MyProcSignalSlot = slot; + + /* Set up to release the slot on process exit */ + on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx)); +} + +/* + * CleanupProcSignalState + * Remove current process from ProcSignal mechanism + * + * This function is called via on_shmem_exit() during backend shutdown. + */ +static void +CleanupProcSignalState(int status, Datum arg) +{ + int pss_idx = DatumGetInt32(arg); + ProcSignalSlot *slot; + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + Assert(slot == MyProcSignalSlot); + + /* + * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point + * won't try to access it after it's no longer ours (and perhaps even + * after we've unmapped the shared memory segment). + */ + MyProcSignalSlot = NULL; + + /* sanity check */ + if (slot->pss_pid != MyProcPid) + { + /* + * don't ERROR here. We're exiting anyway, and don't want to get into + * infinite loop trying to exit + */ + elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d", + MyProcPid, pss_idx, (int) slot->pss_pid); + return; /* XXX better to zero the slot anyway? */ + } + + /* + * Make this slot look like it's absorbed all possible barriers, so that + * no barrier waits block on it. + */ + pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + ConditionVariableBroadcast(&slot->pss_barrierCV); + + slot->pss_pid = 0; +} + +/* + * SendProcSignal + * Send a signal to a Postgres process + * + * Providing backendId is optional, but it will speed up the operation. + * + * On success (a signal was sent), zero is returned. + * On error, -1 is returned, and errno is set (typically to ESRCH or EPERM). + * + * Not to be confused with ProcSendSignal + */ +int +SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId) +{ + volatile ProcSignalSlot *slot; + + if (backendId != InvalidBackendId) + { + slot = &ProcSignal->psh_slot[backendId - 1]; + + /* + * Note: Since there's no locking, it's possible that the target + * process detaches from shared memory and exits right after this + * test, before we set the flag and send signal. And the signal slot + * might even be recycled by a new process, so it's remotely possible + * that we set a flag for a wrong process. That's OK, all the signals + * are such that no harm is done if they're mistakenly fired. + */ + if (slot->pss_pid == pid) + { + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + else + { + /* + * BackendId not provided, so search the array using pid. We search + * the array back to front so as to reduce search overhead. 
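
As a minimal usage sketch of SendProcSignal() (the wrapper is hypothetical; the caller is assumed to have looked up the target's pid, and possibly its backend ID, elsewhere, and passing InvalidBackendId simply falls back to the pid search described here):

static void
example_request_catchup(pid_t target_pid, BackendId target_backend_id)
{
    if (SendProcSignal(target_pid, PROCSIG_CATCHUP_INTERRUPT,
                       target_backend_id) < 0)
        elog(DEBUG3, "could not signal process %d: %m", (int) target_pid);
}
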
Passing + * InvalidBackendId means that the target is most likely an auxiliary + * process, which will have a slot near the end of the array. + */ + int i; + + for (i = NumProcSignalSlots - 1; i >= 0; i--) + { + slot = &ProcSignal->psh_slot[i]; + + if (slot->pss_pid == pid) + { + /* the above note about race conditions applies here too */ + + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + } + + errno = ESRCH; + return -1; +} + +/* + * EmitProcSignalBarrier + * Send a signal to every Postgres process + * + * The return value of this function is the barrier "generation" created + * by this operation. This value can be passed to WaitForProcSignalBarrier + * to wait until it is known that every participant in the ProcSignal + * mechanism has absorbed the signal (or started afterwards). + * + * Note that it would be a bad idea to use this for anything that happens + * frequently, as interrupting every backend could cause a noticeable + * performance hit. + * + * Callers are entitled to assume that this function will not throw ERROR + * or FATAL. + */ +uint64 +EmitProcSignalBarrier(ProcSignalBarrierType type) +{ + uint32 flagbit = 1 << (uint32) type; + uint64 generation; + + /* + * Set all the flags. + * + * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is + * totally ordered with respect to anything the caller did before, and + * anything that we do afterwards. (This is also true of the later call to + * pg_atomic_add_fetch_u64.) + */ + for (int i = 0; i < NumProcSignalSlots; i++) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit); + } + + /* + * Increment the generation counter. + */ + generation = + pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1); + + /* + * Signal all the processes, so that they update their advertised barrier + * generation. + * + * Concurrency is not a problem here. Backends that have exited don't + * matter, and new backends that have joined since we entered this + * function must already have current state, since the caller is + * responsible for making sure that the relevant state is entirely visible + * before calling this function in the first place. We still have to wake + * them up - because we can't distinguish between such backends and older + * backends that need to update state - but they won't actually need to + * change any state. + */ + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + pid_t pid = slot->pss_pid; + + if (pid != 0) + { + /* see SendProcSignal for details */ + slot->pss_signalFlags[PROCSIG_BARRIER] = true; + kill(pid, SIGUSR1); + } + } + + return generation; +} + +/* + * WaitForProcSignalBarrier - wait until it is guaranteed that all changes + * requested by a specific call to EmitProcSignalBarrier() have taken effect. + */ +void +WaitForProcSignalBarrier(uint64 generation) +{ + Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration)); + + elog(DEBUG1, + "waiting for all backends to process ProcSignalBarrier generation " + UINT64_FORMAT, + generation); + + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + uint64 oldval; + + /* + * It's important that we check only pss_barrierGeneration here and + * not pss_barrierCheckMask. 
Bits in pss_barrierCheckMask get cleared + * before the barrier is actually absorbed, but pss_barrierGeneration + * is updated only afterward. + */ + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + while (oldval < generation) + { + if (ConditionVariableTimedSleep(&slot->pss_barrierCV, + 5000, + WAIT_EVENT_PROC_SIGNAL_BARRIER)) + ereport(LOG, + (errmsg("still waiting for backend with PID %d to accept ProcSignalBarrier", + (int) slot->pss_pid))); + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + } + ConditionVariableCancelSleep(); + } + + elog(DEBUG1, + "finished waiting for all backends to process ProcSignalBarrier generation " + UINT64_FORMAT, + generation); + + /* + * The caller is probably calling this function because it wants to read + * the shared state or perform further writes to shared state once all + * backends are known to have absorbed the barrier. However, the read of + * pss_barrierGeneration was performed unlocked; insert a memory barrier + * to separate it from whatever follows. + */ + pg_memory_barrier(); +} + +/* + * Handle receipt of an interrupt indicating a global barrier event. + * + * All the actual work is deferred to ProcessProcSignalBarrier(), because we + * cannot safely access the barrier generation inside the signal handler as + * 64bit atomics might use spinlock based emulation, even for reads. As this + * routine only gets called when PROCSIG_BARRIER is sent that won't cause a + * lot of unnecessary work. + */ +static void +HandleProcSignalBarrierInterrupt(void) +{ + InterruptPending = true; + ProcSignalBarrierPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + +/* + * Perform global barrier related interrupt checking. + * + * Any backend that participates in ProcSignal signaling must arrange to + * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(), + * which is enough for normal backends, but not necessarily for all types of + * background processes. + */ +void +ProcessProcSignalBarrier(void) +{ + uint64 local_gen; + uint64 shared_gen; + volatile uint32 flags; + + Assert(MyProcSignalSlot); + + /* Exit quickly if there's no work to do. */ + if (!ProcSignalBarrierPending) + return; + ProcSignalBarrierPending = false; + + /* + * It's not unlikely to process multiple barriers at once, before the + * signals for all the barriers have arrived. To avoid unnecessary work in + * response to subsequent signals, exit early if we already have processed + * all of them. + */ + local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration); + shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + + Assert(local_gen <= shared_gen); + + if (local_gen == shared_gen) + return; + + /* + * Get and clear the flags that are set for this backend. Note that + * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the + * read of the barrier generation above happens before we atomically + * extract the flags, and that any subsequent state changes happen + * afterward. + * + * NB: In order to avoid race conditions, we must zero + * pss_barrierCheckMask first and only afterwards try to do barrier + * processing. If we did it in the other order, someone could send us + * another barrier of some type right after we called the + * barrier-processing function but before we cleared the bit. We would + * have no way of knowing that the bit needs to stay set in that case, so + * the need to call the barrier-processing function again would just get + * forgotten. 
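
Taken together, the emit-then-wait pattern a caller is expected to follow looks roughly like this (the wrapper function is hypothetical; PROCSIGNAL_BARRIER_SMGRRELEASE is the barrier type handled further down):

static void
example_global_smgr_release(void)
{
    uint64      generation;

    /* Ask every attached process to absorb the barrier... */
    generation = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE);

    /* ...and block until all of them have advertised that generation. */
    WaitForProcSignalBarrier(generation);
}
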
So instead, we tentatively clear all the bits and then put + * back any for which we don't manage to successfully absorb the barrier. + */ + flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0); + + /* + * If there are no flags set, then we can skip doing any real work. + * Otherwise, establish a PG_TRY block, so that we don't lose track of + * which types of barrier processing are needed if an ERROR occurs. + */ + if (flags != 0) + { + bool success = true; + + PG_TRY(); + { + /* + * Process each type of barrier. The barrier-processing functions + * should normally return true, but may return false if the + * barrier can't be absorbed at the current time. This should be + * rare, because it's pretty expensive. Every single + * CHECK_FOR_INTERRUPTS() will return here until we manage to + * absorb the barrier, and that cost will add up in a hurry. + * + * NB: It ought to be OK to call the barrier-processing functions + * unconditionally, but it's more efficient to call only the ones + * that might need us to do something based on the flags. + */ + while (flags != 0) + { + ProcSignalBarrierType type; + bool processed = true; + + type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags); + switch (type) + { + case PROCSIGNAL_BARRIER_SMGRRELEASE: + processed = ProcessBarrierSmgrRelease(); + break; + } + + /* + * To avoid an infinite loop, we must always unset the bit in + * flags. + */ + BARRIER_CLEAR_BIT(flags, type); + + /* + * If we failed to process the barrier, reset the shared bit + * so we try again later, and set a flag so that we don't bump + * our generation. + */ + if (!processed) + { + ResetProcSignalBarrierBits(((uint32) 1) << type); + success = false; + } + } + } + PG_CATCH(); + { + /* + * If an ERROR occurred, we'll need to try again later to handle + * that barrier type and any others that haven't been handled yet + * or weren't successfully absorbed. + */ + ResetProcSignalBarrierBits(flags); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* + * If some barrier types were not successfully absorbed, we will have + * to try again later. + */ + if (!success) + return; + } + + /* + * State changes related to all types of barriers that might have been + * emitted have now been handled, so we can update our notion of the + * generation to the one we observed before beginning the updates. If + * things have changed further, it'll get fixed up when this function is + * next called. + */ + pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); + ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV); +} + +/* + * If it turns out that we couldn't absorb one or more barrier types, either + * because the barrier-processing functions returned false or due to an error, + * arrange for processing to be retried later. + */ +static void +ResetProcSignalBarrierBits(uint32 flags) +{ + pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags); + ProcSignalBarrierPending = true; + InterruptPending = true; +} + +/* + * CheckProcSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. Should be called after receiving + * SIGUSR1. 
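
The barrier-processing functions dispatched from the switch above all follow the same contract; as a hedged sketch with a purely hypothetical barrier type and state flag (ProcessBarrierSmgrRelease is the real in-tree instance):

static bool example_state_pinned = false;   /* hypothetical backend-local state */

static bool
ExampleProcessBarrierReload(void)
{
    /* Returning false means "retry me from a later CHECK_FOR_INTERRUPTS()". */
    if (example_state_pinned)
        return false;

    /* Absorb the change: refresh or invalidate backend-local state here. */
    return true;
}
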
+ */ +static bool +CheckProcSignal(ProcSignalReason reason) +{ + volatile ProcSignalSlot *slot = MyProcSignalSlot; + + if (slot != NULL) + { + /* Careful here --- don't clear flag if we haven't seen it set */ + if (slot->pss_signalFlags[reason]) + { + slot->pss_signalFlags[reason] = false; + return true; + } + } + + return false; +} + +/* + * procsignal_sigusr1_handler - handle SIGUSR1 signal. + */ +void +procsignal_sigusr1_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT)) + HandleCatchupInterrupt(); + + if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT)) + HandleNotifyInterrupt(); + + if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE)) + HandleParallelMessageInterrupt(); + + if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING)) + HandleWalSndInitStopping(); + + if (CheckProcSignal(PROCSIG_BARRIER)) + HandleProcSignalBarrierInterrupt(); + + if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) + HandleLogMemoryContextInterrupt(); + + if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE)) + HandleParallelApplyMessageInterrupt(); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c new file mode 100644 index 0000000..d134a09 --- /dev/null +++ b/src/backend/storage/ipc/shm_mq.c @@ -0,0 +1,1329 @@ +/*------------------------------------------------------------------------- + * + * shm_mq.c + * single-reader, single-writer shared memory message queue + * + * Both the sender and the receiver must have a PGPROC; their respective + * process latches are used for synchronization. Only the sender may send, + * and only the receiver may receive. This is intended to allow a user + * backend to communicate with worker backends that it has registered. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_mq.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "port/pg_bitutils.h" +#include "postmaster/bgworker.h" +#include "storage/procsignal.h" +#include "storage/shm_mq.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +/* + * This structure represents the actual queue, stored in shared memory. + * + * Some notes on synchronization: + * + * mq_receiver and mq_bytes_read can only be changed by the receiver; and + * mq_sender and mq_bytes_written can only be changed by the sender. 
+ * mq_receiver and mq_sender are protected by mq_mutex, although, importantly, + * they cannot change once set, and thus may be read without a lock once this + * is known to be the case. + * + * mq_bytes_read and mq_bytes_written are not protected by the mutex. Instead, + * they are written atomically using 8 byte loads and stores. Memory barriers + * must be carefully used to synchronize reads and writes of these values with + * reads and writes of the actual data in mq_ring. + * + * mq_detached needs no locking. It can be set by either the sender or the + * receiver, but only ever from false to true, so redundant writes don't + * matter. It is important that if we set mq_detached and then set the + * counterparty's latch, the counterparty must be certain to see the change + * after waking up. Since SetLatch begins with a memory barrier and ResetLatch + * ends with one, this should be OK. + * + * mq_ring_size and mq_ring_offset never change after initialization, and + * can therefore be read without the lock. + * + * Importantly, mq_ring can be safely read and written without a lock. + * At any given time, the difference between mq_bytes_read and + * mq_bytes_written defines the number of bytes within mq_ring that contain + * unread data, and mq_bytes_read defines the position where those bytes + * begin. The sender can increase the number of unread bytes at any time, + * but only the receiver can give license to overwrite those bytes, by + * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read + * the unread bytes it knows to be present without the lock. Conversely, + * the sender can write to the unused portion of the ring buffer without + * the lock, because nobody else can be reading or writing those bytes. The + * receiver could be making more bytes unused by incrementing mq_bytes_read, + * but that's OK. Note that it would be unsafe for the receiver to read any + * data it's already marked as read, or to write any data; and it would be + * unsafe for the sender to reread any data after incrementing + * mq_bytes_written, but fortunately there's no need for any of that. + */ +struct shm_mq +{ + slock_t mq_mutex; + PGPROC *mq_receiver; + PGPROC *mq_sender; + pg_atomic_uint64 mq_bytes_read; + pg_atomic_uint64 mq_bytes_written; + Size mq_ring_size; + bool mq_detached; + uint8 mq_ring_offset; + char mq_ring[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * This structure is a backend-private handle for access to a queue. + * + * mqh_queue is a pointer to the queue we've attached, and mqh_segment is + * an optional pointer to the dynamic shared memory segment that contains it. + * (If mqh_segment is provided, we register an on_dsm_detach callback to + * make sure we detach from the queue before detaching from DSM.) + * + * If this queue is intended to connect the current process with a background + * worker that started it, the user can pass a pointer to the worker handle + * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this + * is to allow us to begin sending to or receiving from that queue before the + * process we'll be communicating with has even been started. If it fails + * to start, the handle will allow us to notice that and fail cleanly, rather + * than waiting forever; see shm_mq_wait_internal. This is mostly useful in + * simple cases - e.g. 
where there are just 2 processes communicating; in + * more complex scenarios, every process may not have a BackgroundWorkerHandle + * available, or may need to watch for the failure of more than one other + * process at a time. + * + * When a message exists as a contiguous chunk of bytes in the queue - that is, + * it is smaller than the size of the ring buffer and does not wrap around + * the end - we return the message to the caller as a pointer into the buffer. + * For messages that are larger or happen to wrap, we reassemble the message + * locally by copying the chunks into a backend-local buffer. mqh_buffer is + * the buffer, and mqh_buflen is the number of bytes allocated for it. + * + * mqh_send_pending, is number of bytes that is written to the queue but not + * yet updated in the shared memory. We will not update it until the written + * data is 1/4th of the ring size or the tuple queue is full. This will + * prevent frequent CPU cache misses, and it will also avoid frequent + * SetLatch() calls, which are quite expensive. + * + * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete + * are used to track the state of non-blocking operations. When the caller + * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they + * are expected to retry the call at a later time with the same argument; + * we need to retain enough state to pick up where we left off. + * mqh_length_word_complete tracks whether we are done sending or receiving + * (whichever we're doing) the entire length word. mqh_partial_bytes tracks + * the number of bytes read or written for either the length word or the + * message itself, and mqh_expected_bytes - which is used only for reads - + * tracks the expected total size of the payload. + * + * mqh_counterparty_attached tracks whether we know the counterparty to have + * attached to the queue at some previous point. This lets us avoid some + * mutex acquisitions. + * + * mqh_context is the memory context in effect at the time we attached to + * the shm_mq. The shm_mq_handle itself is allocated in this context, and + * we make sure any other allocations we do happen in this context as well, + * to avoid nasty surprises. + */ +struct shm_mq_handle +{ + shm_mq *mqh_queue; + dsm_segment *mqh_segment; + BackgroundWorkerHandle *mqh_handle; + char *mqh_buffer; + Size mqh_buflen; + Size mqh_consume_pending; + Size mqh_send_pending; + Size mqh_partial_bytes; + Size mqh_expected_bytes; + bool mqh_length_word_complete; + bool mqh_counterparty_attached; + MemoryContext mqh_context; +}; + +static void shm_mq_detach_internal(shm_mq *mq); +static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, + const void *data, bool nowait, Size *bytes_written); +static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh, + Size bytes_needed, bool nowait, Size *nbytesp, + void **datap); +static bool shm_mq_counterparty_gone(shm_mq *mq, + BackgroundWorkerHandle *handle); +static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, + BackgroundWorkerHandle *handle); +static void shm_mq_inc_bytes_read(shm_mq *mq, Size n); +static void shm_mq_inc_bytes_written(shm_mq *mq, Size n); +static void shm_mq_detach_callback(dsm_segment *seg, Datum arg); + +/* Minimum queue size is enough for header and at least one chunk of data. */ +const Size shm_mq_minimum_size = +MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF; + +#define MQH_INITIAL_BUFSIZE 8192 + +/* + * Initialize a new shared message queue. 
+ */ +shm_mq * +shm_mq_create(void *address, Size size) +{ + shm_mq *mq = address; + Size data_offset = MAXALIGN(offsetof(shm_mq, mq_ring)); + + /* If the size isn't MAXALIGN'd, just discard the odd bytes. */ + size = MAXALIGN_DOWN(size); + + /* Queue size must be large enough to hold some data. */ + Assert(size > data_offset); + + /* Initialize queue header. */ + SpinLockInit(&mq->mq_mutex); + mq->mq_receiver = NULL; + mq->mq_sender = NULL; + pg_atomic_init_u64(&mq->mq_bytes_read, 0); + pg_atomic_init_u64(&mq->mq_bytes_written, 0); + mq->mq_ring_size = size - data_offset; + mq->mq_detached = false; + mq->mq_ring_offset = data_offset - offsetof(shm_mq, mq_ring); + + return mq; +} + +/* + * Set the identity of the process that will receive from a shared message + * queue. + */ +void +shm_mq_set_receiver(shm_mq *mq, PGPROC *proc) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_receiver == NULL); + mq->mq_receiver = proc; + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + if (sender != NULL) + SetLatch(&sender->procLatch); +} + +/* + * Set the identity of the process that will send to a shared message queue. + */ +void +shm_mq_set_sender(shm_mq *mq, PGPROC *proc) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_sender == NULL); + mq->mq_sender = proc; + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + if (receiver != NULL) + SetLatch(&receiver->procLatch); +} + +/* + * Get the configured receiver. + */ +PGPROC * +shm_mq_get_receiver(shm_mq *mq) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + return receiver; +} + +/* + * Get the configured sender. + */ +PGPROC * +shm_mq_get_sender(shm_mq *mq) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + return sender; +} + +/* + * Attach to a shared message queue so we can send or receive messages. + * + * The memory context in effect at the time this function is called should + * be one which will last for at least as long as the message queue itself. + * We'll allocate the handle in that context, and future allocations that + * are needed to buffer incoming data will happen in that context as well. + * + * If seg != NULL, the queue will be automatically detached when that dynamic + * shared memory segment is detached. + * + * If handle != NULL, the queue can be read or written even before the + * other process has attached. We'll wait for it to do so if needed. The + * handle must be for a background worker initialized with bgw_notify_pid + * equal to our PID. + * + * shm_mq_detach() should be called when done. This will free the + * shm_mq_handle and mark the queue itself as detached, so that our + * counterpart won't get stuck waiting for us to fill or drain the queue + * after we've already lost interest. 
+ */ +shm_mq_handle * +shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) +{ + shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle)); + + Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc); + mqh->mqh_queue = mq; + mqh->mqh_segment = seg; + mqh->mqh_handle = handle; + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + mqh->mqh_consume_pending = 0; + mqh->mqh_send_pending = 0; + mqh->mqh_partial_bytes = 0; + mqh->mqh_expected_bytes = 0; + mqh->mqh_length_word_complete = false; + mqh->mqh_counterparty_attached = false; + mqh->mqh_context = CurrentMemoryContext; + + if (seg != NULL) + on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq)); + + return mqh; +} + +/* + * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had + * been passed to shm_mq_attach. + */ +void +shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle) +{ + Assert(mqh->mqh_handle == NULL); + mqh->mqh_handle = handle; +} + +/* + * Write a message into a shared message queue. + */ +shm_mq_result +shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait, + bool force_flush) +{ + shm_mq_iovec iov; + + iov.data = data; + iov.len = nbytes; + + return shm_mq_sendv(mqh, &iov, 1, nowait, force_flush); +} + +/* + * Write a message into a shared message queue, gathered from multiple + * addresses. + * + * When nowait = false, we'll wait on our process latch when the ring buffer + * fills up, and then continue writing once the receiver has drained some data. + * The process latch is reset after each wait. + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In + * this case, the caller should call this function again, with the same + * arguments, each time the process latch is set. (Once begun, the sending + * of a message cannot be aborted except by detaching from the queue; changing + * the length or payload will corrupt the queue.) + * + * When force_flush = true, we immediately update the shm_mq's mq_bytes_written + * and notify the receiver (if it is already attached). Otherwise, we don't + * update it until we have written an amount of data greater than 1/4th of the + * ring size. + */ +shm_mq_result +shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait, + bool force_flush) +{ + shm_mq_result res; + shm_mq *mq = mqh->mqh_queue; + PGPROC *receiver; + Size nbytes = 0; + Size bytes_written; + int i; + int which_iov = 0; + Size offset; + + Assert(mq->mq_sender == MyProc); + + /* Compute total size of write. */ + for (i = 0; i < iovcnt; ++i) + nbytes += iov[i].len; + + /* Prevent writing messages overwhelming the receiver. */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot send a message of size %zu via shared memory queue", + nbytes))); + + /* Try to write, or finish writing, the length word into the buffer. */ + while (!mqh->mqh_length_word_complete) + { + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + ((char *) &nbytes) + mqh->mqh_partial_bytes, + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. 
*/ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + mqh->mqh_partial_bytes += bytes_written; + + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = true; + } + + if (res != SHM_MQ_SUCCESS) + return res; + + /* Length word can't be split unless bigger than required alignment. */ + Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF); + } + + /* Write the actual data bytes into the buffer. */ + Assert(mqh->mqh_partial_bytes <= nbytes); + offset = mqh->mqh_partial_bytes; + do + { + Size chunksize; + + /* Figure out which bytes need to be sent next. */ + if (offset >= iov[which_iov].len) + { + offset -= iov[which_iov].len; + ++which_iov; + if (which_iov >= iovcnt) + break; + continue; + } + + /* + * We want to avoid copying the data if at all possible, but every + * chunk of bytes we write into the queue has to be MAXALIGN'd, except + * the last. Thus, if a chunk other than the last one ends on a + * non-MAXALIGN'd boundary, we have to combine the tail end of its + * data with data from one or more following chunks until we either + * reach the last chunk or accumulate a number of bytes which is + * MAXALIGN'd. + */ + if (which_iov + 1 < iovcnt && + offset + MAXIMUM_ALIGNOF > iov[which_iov].len) + { + char tmpbuf[MAXIMUM_ALIGNOF]; + int j = 0; + + for (;;) + { + if (offset < iov[which_iov].len) + { + tmpbuf[j] = iov[which_iov].data[offset]; + j++; + offset++; + if (j == MAXIMUM_ALIGNOF) + break; + } + else + { + offset -= iov[which_iov].len; + which_iov++; + if (which_iov >= iovcnt) + break; + } + } + + res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + continue; + } + + /* + * If this is the last chunk, we can write all the data, even if it + * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to + * MAXALIGN_DOWN the write size. + */ + chunksize = iov[which_iov].len - offset; + if (which_iov + 1 < iovcnt) + chunksize = MAXALIGN_DOWN(chunksize); + res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset], + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + offset += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + } while (mqh->mqh_partial_bytes < nbytes); + + /* Reset for next message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + + /* If queue has been detached, let caller know. */ + if (mq->mq_detached) + return SHM_MQ_DETACHED; + + /* + * If the counterparty is known to have attached, we can read mq_receiver + * without acquiring the spinlock. Otherwise, more caution is needed. 
+ */ + if (mqh->mqh_counterparty_attached) + receiver = mq->mq_receiver; + else + { + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + if (receiver != NULL) + mqh->mqh_counterparty_attached = true; + } + + /* + * If the caller has requested force flush or we have written more than + * 1/4 of the ring size, mark it as written in shared memory and notify + * the receiver. + */ + if (force_flush || mqh->mqh_send_pending > (mq->mq_ring_size >> 2)) + { + shm_mq_inc_bytes_written(mq, mqh->mqh_send_pending); + if (receiver != NULL) + SetLatch(&receiver->procLatch); + mqh->mqh_send_pending = 0; + } + + return SHM_MQ_SUCCESS; +} + +/* + * Receive a message from a shared message queue. + * + * We set *nbytes to the message length and *data to point to the message + * payload. If the entire message exists in the queue as a single, + * contiguous chunk, *data will point directly into shared memory; otherwise, + * it will point to a temporary buffer. This mostly avoids data copying in + * the hoped-for case where messages are short compared to the buffer size, + * while still allowing longer messages. In either case, the return value + * remains valid until the next receive operation is performed on the queue. + * + * When nowait = false, we'll wait on our process latch when the ring buffer + * is empty and we have not yet received a full message. The sender will + * set our process latch after more data has been written, and we'll resume + * processing. Each call will therefore return a complete message + * (unless the sender detaches the queue). + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, whenever the buffer is empty and we need to read from it, we + * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this + * function again after the process latch has been set. + */ +shm_mq_result +shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait) +{ + shm_mq *mq = mqh->mqh_queue; + shm_mq_result res; + Size rb = 0; + Size nbytes; + void *rawdata; + + Assert(mq->mq_receiver == MyProc); + + /* We can't receive data until the sender has attached. */ + if (!mqh->mqh_counterparty_attached) + { + if (nowait) + { + int counterparty_gone; + + /* + * We shouldn't return at this point at all unless the sender + * hasn't attached yet. However, the correct return value depends + * on whether the sender is still attached. If we first test + * whether the sender has ever attached and then test whether the + * sender has detached, there's a race condition: a sender that + * attaches and detaches very quickly might fool us into thinking + * the sender never attached at all. So, test whether our + * counterparty is definitively gone first, and only afterwards + * check whether the sender ever attached in the first place. + */ + counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle); + if (shm_mq_get_sender(mq) == NULL) + { + if (counterparty_gone) + return SHM_MQ_DETACHED; + else + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle) + && shm_mq_get_sender(mq) == NULL) + { + mq->mq_detached = true; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + } + + /* + * If we've consumed an amount of data greater than 1/4th of the ring + * size, mark it consumed in shared memory. 
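
For nowait callers, the retry protocol described above amounts to a loop of this shape (a sketch only; real callers typically interleave the retry with other work rather than sleeping immediately):

static shm_mq_result
example_receive_with_retry(shm_mq_handle *mqh, Size *nbytes, void **data)
{
    for (;;)
    {
        shm_mq_result res = shm_mq_receive(mqh, nbytes, data, true);

        if (res != SHM_MQ_WOULD_BLOCK)
            return res;         /* SHM_MQ_SUCCESS or SHM_MQ_DETACHED */

        /* Sleep until the sender sets our latch, then try again. */
        (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
                         WAIT_EVENT_MQ_RECEIVE);
        ResetLatch(MyLatch);
        CHECK_FOR_INTERRUPTS();
    }
}
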
We try to avoid doing this + * unnecessarily when only a small amount of data has been consumed, + * because SetLatch() is fairly expensive and we don't want to do it too + * often. + */ + if (mqh->mqh_consume_pending > mq->mq_ring_size / 4) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Try to read, or finish reading, the length word from the buffer. */ + while (!mqh->mqh_length_word_complete) + { + /* Try to receive the message length word. */ + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + + /* + * Hopefully, we'll receive the entire message length word at once. + * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over + * multiple reads. + */ + if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size)) + { + Size needed; + + nbytes = *(Size *) rawdata; + + /* If we've already got the whole message, we're done. */ + needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes); + if (rb >= needed) + { + mqh->mqh_consume_pending += needed; + *nbytesp = nbytes; + *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size)); + return SHM_MQ_SUCCESS; + } + + /* + * We don't have the whole message, but we at least have the whole + * length word. + */ + mqh->mqh_expected_bytes = nbytes; + mqh->mqh_length_word_complete = true; + mqh->mqh_consume_pending += MAXALIGN(sizeof(Size)); + rb -= MAXALIGN(sizeof(Size)); + } + else + { + Size lengthbytes; + + /* Can't be split unless bigger than required alignment. */ + Assert(sizeof(Size) > MAXIMUM_ALIGNOF); + + /* Message word is split; need buffer to reassemble. */ + if (mqh->mqh_buffer == NULL) + { + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, + MQH_INITIAL_BUFSIZE); + mqh->mqh_buflen = MQH_INITIAL_BUFSIZE; + } + Assert(mqh->mqh_buflen >= sizeof(Size)); + + /* Copy partial length word; remember to consume it. */ + if (mqh->mqh_partial_bytes + rb > sizeof(Size)) + lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes; + else + lengthbytes = rb; + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, + lengthbytes); + mqh->mqh_partial_bytes += lengthbytes; + mqh->mqh_consume_pending += MAXALIGN(lengthbytes); + rb -= lengthbytes; + + /* If we now have the whole word, we're ready to read payload. */ + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer; + mqh->mqh_length_word_complete = true; + mqh->mqh_partial_bytes = 0; + } + } + } + nbytes = mqh->mqh_expected_bytes; + + /* + * Should be disallowed on the sending side already, but better check and + * error out on the receiver side as well rather than trying to read a + * prohibitively large message. + */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("invalid message size %zu in shared memory queue", + nbytes))); + + if (mqh->mqh_partial_bytes == 0) + { + /* + * Try to obtain the whole message in a single chunk. If this works, + * we need not copy the data and can return a pointer directly into + * shared memory. + */ + res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb >= nbytes) + { + mqh->mqh_length_word_complete = false; + mqh->mqh_consume_pending += MAXALIGN(nbytes); + *nbytesp = nbytes; + *datap = rawdata; + return SHM_MQ_SUCCESS; + } + + /* + * The message has wrapped the buffer. 
We'll need to copy it in order + * to return it to the client in one chunk. First, make sure we have + * a large enough buffer available. + */ + if (mqh->mqh_buflen < nbytes) + { + Size newbuflen; + + /* + * Increase size to the next power of 2 that's >= nbytes, but + * limit to MaxAllocSize. + */ + newbuflen = pg_nextpower2_size_t(nbytes); + newbuflen = Min(newbuflen, MaxAllocSize); + + if (mqh->mqh_buffer != NULL) + { + pfree(mqh->mqh_buffer); + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + } + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen); + mqh->mqh_buflen = newbuflen; + } + } + + /* Loop until we've copied the entire message. */ + for (;;) + { + Size still_needed; + + /* Copy as much as we can. */ + Assert(mqh->mqh_partial_bytes + rb <= nbytes); + if (rb > 0) + { + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb); + mqh->mqh_partial_bytes += rb; + } + + /* + * Update count of bytes that can be consumed, accounting for + * alignment padding. Note that this will never actually insert any + * padding except at the end of a message, because the buffer size is + * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well. + */ + Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb)); + mqh->mqh_consume_pending += MAXALIGN(rb); + + /* If we got all the data, exit the loop. */ + if (mqh->mqh_partial_bytes >= nbytes) + break; + + /* Wait for some more data. */ + still_needed = nbytes - mqh->mqh_partial_bytes; + res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb > still_needed) + rb = still_needed; + } + + /* Return the complete message, and reset for next message. */ + *nbytesp = nbytes; + *datap = mqh->mqh_buffer; + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return SHM_MQ_SUCCESS; +} + +/* + * Wait for the other process that's supposed to use this queue to attach + * to it. + * + * The return value is SHM_MQ_DETACHED if the worker has already detached or + * if it dies; it is SHM_MQ_SUCCESS if we detect that the worker has attached. + * Note that we will only be able to detect that the worker has died before + * attaching if a background worker handle was passed to shm_mq_attach(). + */ +shm_mq_result +shm_mq_wait_for_attach(shm_mq_handle *mqh) +{ + shm_mq *mq = mqh->mqh_queue; + PGPROC **victim; + + if (shm_mq_get_receiver(mq) == MyProc) + victim = &mq->mq_sender; + else + { + Assert(shm_mq_get_sender(mq) == MyProc); + victim = &mq->mq_receiver; + } + + if (shm_mq_wait_internal(mq, victim, mqh->mqh_handle)) + return SHM_MQ_SUCCESS; + else + return SHM_MQ_DETACHED; +} + +/* + * Detach from a shared message queue, and destroy the shm_mq_handle. + */ +void +shm_mq_detach(shm_mq_handle *mqh) +{ + /* Before detaching, notify the receiver about any already-written data. */ + if (mqh->mqh_send_pending > 0) + { + shm_mq_inc_bytes_written(mqh->mqh_queue, mqh->mqh_send_pending); + mqh->mqh_send_pending = 0; + } + + /* Notify counterparty that we're outta here. */ + shm_mq_detach_internal(mqh->mqh_queue); + + /* Cancel on_dsm_detach callback, if any. */ + if (mqh->mqh_segment) + cancel_on_dsm_detach(mqh->mqh_segment, + shm_mq_detach_callback, + PointerGetDatum(mqh->mqh_queue)); + + /* Release local memory associated with handle. */ + if (mqh->mqh_buffer != NULL) + pfree(mqh->mqh_buffer); + pfree(mqh); +} + +/* + * Notify counterparty that we're detaching from shared message queue. 
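
A blocking round trip over an already-attached pair of handles can then be as simple as the following sketch (the handle names are hypothetical; force_flush is passed as true so the lone message is pushed out immediately):

static void
example_send_one(shm_mq_handle *outq, const char *msg)
{
    if (shm_mq_send(outq, strlen(msg) + 1, msg, false, true) != SHM_MQ_SUCCESS)
        elog(ERROR, "shared message queue peer has detached");
}

static void
example_receive_one(shm_mq_handle *inq)
{
    Size        nbytes;
    void       *data;

    if (shm_mq_receive(inq, &nbytes, &data, false) == SHM_MQ_SUCCESS)
        elog(LOG, "received %zu bytes: %s", nbytes, (char *) data);
}
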
+ * + * The purpose of this function is to make sure that the process + * with which we're communicating doesn't block forever waiting for us to + * fill or drain the queue once we've lost interest. When the sender + * detaches, the receiver can read any messages remaining in the queue; + * further reads will return SHM_MQ_DETACHED. If the receiver detaches, + * further attempts to send messages will likewise return SHM_MQ_DETACHED. + * + * This is separated out from shm_mq_detach() because if the on_dsm_detach + * callback fires, we only want to do this much. We do not try to touch + * the local shm_mq_handle, as it may have been pfree'd already. + */ +static void +shm_mq_detach_internal(shm_mq *mq) +{ + PGPROC *victim; + + SpinLockAcquire(&mq->mq_mutex); + if (mq->mq_sender == MyProc) + victim = mq->mq_receiver; + else + { + Assert(mq->mq_receiver == MyProc); + victim = mq->mq_sender; + } + mq->mq_detached = true; + SpinLockRelease(&mq->mq_mutex); + + if (victim != NULL) + SetLatch(&victim->procLatch); +} + +/* + * Get the shm_mq from handle. + */ +shm_mq * +shm_mq_get_queue(shm_mq_handle *mqh) +{ + return mqh->mqh_queue; +} + +/* + * Write bytes into a shared message queue. + */ +static shm_mq_result +shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, + bool nowait, Size *bytes_written) +{ + shm_mq *mq = mqh->mqh_queue; + Size sent = 0; + uint64 used; + Size ringsize = mq->mq_ring_size; + Size available; + + while (sent < nbytes) + { + uint64 rb; + uint64 wb; + + /* Compute number of ring buffer bytes used and available. */ + rb = pg_atomic_read_u64(&mq->mq_bytes_read); + wb = pg_atomic_read_u64(&mq->mq_bytes_written) + mqh->mqh_send_pending; + Assert(wb >= rb); + used = wb - rb; + Assert(used <= ringsize); + available = Min(ringsize - used, nbytes - sent); + + /* + * Bail out if the queue has been detached. Note that we would be in + * trouble if the compiler decided to cache the value of + * mq->mq_detached in a register or on the stack across loop + * iterations. It probably shouldn't do that anyway since we'll + * always return, call an external function that performs a system + * call, or reach a memory barrier at some point later in the loop, + * but just to be sure, insert a compiler barrier here. + */ + pg_compiler_barrier(); + if (mq->mq_detached) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + + if (available == 0 && !mqh->mqh_counterparty_attached) + { + /* + * The queue is full, so if the receiver isn't yet known to be + * attached, we must wait for that to happen. + */ + if (nowait) + { + if (shm_mq_counterparty_gone(mq, mqh->mqh_handle)) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + if (shm_mq_get_receiver(mq) == NULL) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_receiver, + mqh->mqh_handle)) + { + mq->mq_detached = true; + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + + /* + * The receiver may have read some data after attaching, so we + * must not wait without rechecking the queue state. + */ + } + else if (available == 0) + { + /* Update the pending send bytes in the shared memory. */ + shm_mq_inc_bytes_written(mq, mqh->mqh_send_pending); + + /* + * Since mq->mqh_counterparty_attached is known to be true at this + * point, mq_receiver has been set, and it can't change once set. + * Therefore, we can read it without acquiring the spinlock. 
+ */ + Assert(mqh->mqh_counterparty_attached); + SetLatch(&mq->mq_receiver->procLatch); + + /* + * We have just updated the mqh_send_pending bytes in the shared + * memory so reset it. + */ + mqh->mqh_send_pending = 0; + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip + * through the loop. It's worth it to avoid resetting the latch + * at top of loop, because setting an already-set latch is much + * cheaper than setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_SEND); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + else + { + Size offset; + Size sendnow; + + offset = wb % (uint64) ringsize; + sendnow = Min(available, ringsize - offset); + + /* + * Write as much data as we can via a single memcpy(). Make sure + * these writes happen after the read of mq_bytes_read, above. + * This barrier pairs with the one in shm_mq_inc_bytes_read. + * (Since we're separating the read of mq_bytes_read from a + * subsequent write to mq_ring, we need a full barrier here.) + */ + pg_memory_barrier(); + memcpy(&mq->mq_ring[mq->mq_ring_offset + offset], + (char *) data + sent, sendnow); + sent += sendnow; + + /* + * Update count of bytes written, with alignment padding. Note + * that this will never actually insert any padding except at the + * end of a run of bytes, because the buffer size is a multiple of + * MAXIMUM_ALIGNOF, and each read is as well. + */ + Assert(sent == nbytes || sendnow == MAXALIGN(sendnow)); + + /* + * For efficiency, we don't update the bytes written in the shared + * memory and also don't set the reader's latch here. Refer to + * the comments atop the shm_mq_handle structure for more + * information. + */ + mqh->mqh_send_pending += MAXALIGN(sendnow); + } + } + + *bytes_written = sent; + return SHM_MQ_SUCCESS; +} + +/* + * Wait until at least *nbytesp bytes are available to be read from the + * shared message queue, or until the buffer wraps around. If the queue is + * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait + * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set + * to the location at which data bytes can be read, *nbytesp is set to the + * number of bytes which can be read at that address, and the return value + * is SHM_MQ_SUCCESS. + */ +static shm_mq_result +shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait, + Size *nbytesp, void **datap) +{ + shm_mq *mq = mqh->mqh_queue; + Size ringsize = mq->mq_ring_size; + uint64 used; + uint64 written; + + for (;;) + { + Size offset; + uint64 read; + + /* Get bytes written, so we can compute what's available to read. */ + written = pg_atomic_read_u64(&mq->mq_bytes_written); + + /* + * Get bytes read. Include bytes we could consume but have not yet + * consumed. + */ + read = pg_atomic_read_u64(&mq->mq_bytes_read) + + mqh->mqh_consume_pending; + used = written - read; + Assert(used <= ringsize); + offset = read % (uint64) ringsize; + + /* If we have enough data or buffer has wrapped, we're done. 
*/ + if (used >= bytes_needed || offset + used >= ringsize) + { + *nbytesp = Min(used, ringsize - offset); + *datap = &mq->mq_ring[mq->mq_ring_offset + offset]; + + /* + * Separate the read of mq_bytes_written, above, from caller's + * attempt to read the data itself. Pairs with the barrier in + * shm_mq_inc_bytes_written. + */ + pg_read_barrier(); + return SHM_MQ_SUCCESS; + } + + /* + * Fall out before waiting if the queue has been detached. + * + * Note that we don't check for this until *after* considering whether + * the data already available is enough, since the receiver can finish + * receiving a message stored in the buffer even after the sender has + * detached. + */ + if (mq->mq_detached) + { + /* + * If the writer advanced mq_bytes_written and then set + * mq_detached, we might not have read the final value of + * mq_bytes_written above. Insert a read barrier and then check + * again if mq_bytes_written has advanced. + */ + pg_read_barrier(); + if (written != pg_atomic_read_u64(&mq->mq_bytes_written)) + continue; + + return SHM_MQ_DETACHED; + } + + /* + * We didn't get enough data to satisfy the request, so mark any data + * previously-consumed as read to make more buffer space. + */ + if (mqh->mqh_consume_pending > 0) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + return SHM_MQ_WOULD_BLOCK; + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip through + * the loop. It's worth it to avoid resetting the latch at top of + * loop, because setting an already-set latch is much cheaper than + * setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_RECEIVE); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Test whether a counterparty who may not even be alive yet is definitely gone. + */ +static bool +shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle) +{ + pid_t pid; + + /* If the queue has been detached, counterparty is definitely gone. */ + if (mq->mq_detached) + return true; + + /* If there's a handle, check worker status. */ + if (handle != NULL) + { + BgwHandleStatus status; + + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + /* Mark it detached, just to make it official. */ + mq->mq_detached = true; + return true; + } + } + + /* Counterparty is not definitively gone. */ + return false; +} + +/* + * This is used when a process is waiting for its counterpart to attach to the + * queue. We exit when the other process attaches as expected, or, if + * handle != NULL, when the referenced background process or the postmaster + * dies. Note that if handle == NULL, and the process fails to attach, we'll + * potentially get stuck here forever waiting for a process that may never + * start. We do check for interrupts, though. + * + * ptr is a pointer to the memory address that we're expecting to become + * non-NULL when our counterpart attaches to the queue. 
+ */ +static bool +shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle) +{ + bool result = false; + + for (;;) + { + BgwHandleStatus status; + pid_t pid; + + /* Acquire the lock just long enough to check the pointer. */ + SpinLockAcquire(&mq->mq_mutex); + result = (*ptr != NULL); + SpinLockRelease(&mq->mq_mutex); + + /* Fail if detached; else succeed if initialized. */ + if (mq->mq_detached) + { + result = false; + break; + } + if (result) + break; + + if (handle != NULL) + { + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + result = false; + break; + } + } + + /* Wait to be signaled. */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_INTERNAL); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Increment the number of bytes read. + */ +static void +shm_mq_inc_bytes_read(shm_mq *mq, Size n) +{ + PGPROC *sender; + + /* + * Separate prior reads of mq_ring from the increment of mq_bytes_read + * which follows. This pairs with the full barrier in + * shm_mq_send_bytes(). We only need a read barrier here because the + * increment of mq_bytes_read is actually a read followed by a dependent + * write. + */ + pg_read_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method should be cheaper. + */ + pg_atomic_write_u64(&mq->mq_bytes_read, + pg_atomic_read_u64(&mq->mq_bytes_read) + n); + + /* + * We shouldn't have any bytes to read without a sender, so we can read + * mq_sender here without a lock. Once it's initialized, it can't change. + */ + sender = mq->mq_sender; + Assert(sender != NULL); + SetLatch(&sender->procLatch); +} + +/* + * Increment the number of bytes written. + */ +static void +shm_mq_inc_bytes_written(shm_mq *mq, Size n) +{ + /* + * Separate prior reads of mq_ring from the write of mq_bytes_written + * which we're about to do. Pairs with the read barrier found in + * shm_mq_receive_bytes. + */ + pg_write_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method avoids taking the bus + * lock unnecessarily. + */ + pg_atomic_write_u64(&mq->mq_bytes_written, + pg_atomic_read_u64(&mq->mq_bytes_written) + n); +} + +/* Shim for on_dsm_detach callback. 
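
For completeness, the worker side of the simple one-queue-per-segment layout sketched earlier might look like this (assuming, as is common but not required, that the DSM handle arrives through the worker's main argument; error handling omitted):

static shm_mq_handle *
example_worker_attach_queue(Datum main_arg)
{
    dsm_segment *seg = dsm_attach(DatumGetUInt32(main_arg));
    shm_mq     *mq = (shm_mq *) dsm_segment_address(seg);

    shm_mq_set_sender(mq, MyProc);
    return shm_mq_attach(mq, seg, NULL);
}
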
*/ +static void +shm_mq_detach_callback(dsm_segment *seg, Datum arg) +{ + shm_mq *mq = (shm_mq *) DatumGetPointer(arg); + + shm_mq_detach_internal(mq); +} diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c new file mode 100644 index 0000000..0cd8244 --- /dev/null +++ b/src/backend/storage/ipc/shm_toc.c @@ -0,0 +1,272 @@ +/*------------------------------------------------------------------------- + * + * shm_toc.c + * shared memory segment table of contents + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_toc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/shm_toc.h" +#include "storage/spin.h" + +typedef struct shm_toc_entry +{ + uint64 key; /* Arbitrary identifier */ + Size offset; /* Offset, in bytes, from TOC start */ +} shm_toc_entry; + +struct shm_toc +{ + uint64 toc_magic; /* Magic number identifying this TOC */ + slock_t toc_mutex; /* Spinlock for mutual exclusion */ + Size toc_total_bytes; /* Bytes managed by this TOC */ + Size toc_allocated_bytes; /* Bytes allocated of those managed */ + uint32 toc_nentry; /* Number of entries in TOC */ + shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Initialize a region of shared memory with a table of contents. + */ +shm_toc * +shm_toc_create(uint64 magic, void *address, Size nbytes) +{ + shm_toc *toc = (shm_toc *) address; + + Assert(nbytes > offsetof(shm_toc, toc_entry)); + toc->toc_magic = magic; + SpinLockInit(&toc->toc_mutex); + + /* + * The alignment code in shm_toc_allocate() assumes that the starting + * value is buffer-aligned. + */ + toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); + toc->toc_allocated_bytes = 0; + toc->toc_nentry = 0; + + return toc; +} + +/* + * Attach to an existing table of contents. If the magic number found at + * the target address doesn't match our expectations, return NULL. + */ +shm_toc * +shm_toc_attach(uint64 magic, void *address) +{ + shm_toc *toc = (shm_toc *) address; + + if (toc->toc_magic != magic) + return NULL; + + Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); + Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); + + return toc; +} + +/* + * Allocate shared memory from a segment managed by a table of contents. + * + * This is not a full-blown allocator; there's no way to free memory. It's + * just a way of dividing a single physical shared memory segment into logical + * chunks that may be used for different purposes. + * + * We allocate backwards from the end of the segment, so that the TOC entries + * can grow forward from the start of the segment. + */ +void * +shm_toc_allocate(shm_toc *toc, Size nbytes) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + /* + * Make sure request is well-aligned. XXX: MAXALIGN is not enough, + * because atomic ops might need a wider alignment. We don't have a + * proper definition for the minimum to make atomic ops safe, but + * BUFFERALIGN ought to be enough. 
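
Callers normally size the segment with the estimator interface declared in shm_toc.h before calling shm_toc_create(); a minimal sketch for one chunk and one key (shm_toc_estimate() itself appears further down in this file):

static Size
example_toc_size(int nworkers)
{
    shm_toc_estimator e;

    shm_toc_initialize_estimator(&e);
    shm_toc_estimate_chunk(&e, mul_size(sizeof(int), nworkers));
    shm_toc_estimate_keys(&e, 1);
    return shm_toc_estimate(&e);
}
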
+ */ + nbytes = BUFFERALIGN(nbytes); + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + vtoc->toc_allocated_bytes += nbytes; + + SpinLockRelease(&toc->toc_mutex); + + return ((char *) toc) + (total_bytes - allocated_bytes - nbytes); +} + +/* + * Return the number of bytes that can still be allocated. + */ +Size +shm_toc_freespace(shm_toc *toc) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + SpinLockAcquire(&toc->toc_mutex); + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + SpinLockRelease(&toc->toc_mutex); + + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); + Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); + return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); +} + +/* + * Insert a TOC entry. + * + * The idea here is that the process setting up the shared memory segment will + * register the addresses of data structures within the segment using this + * function. Each data structure will be identified using a 64-bit key, which + * is assumed to be a well-known or discoverable integer. Other processes + * accessing the shared memory segment can pass the same key to + * shm_toc_lookup() to discover the addresses of those data structures. + * + * Since the shared memory segment may be mapped at different addresses within + * different backends, we store relative rather than absolute pointers. + * + * This won't scale well to a large number of keys. Hopefully, that isn't + * necessary; if it proves to be, we might need to provide a more sophisticated + * data structure here. But the real idea here is just to give someone mapping + * a dynamic shared memory the ability to find the bare minimum number of + * pointers that they need to bootstrap. If you're storing a lot of stuff in + * the TOC, you're doing it wrong. + */ +void +shm_toc_insert(shm_toc *toc, uint64 key, void *address) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + Size offset; + + /* Relativize pointer. */ + Assert(address > (void *) toc); + offset = ((char *) address) - (char *) toc; + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + sizeof(shm_toc_entry) > total_bytes || + toc_bytes + sizeof(shm_toc_entry) < toc_bytes || + nentry >= PG_UINT32_MAX) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + + Assert(offset < total_bytes); + vtoc->toc_entry[nentry].key = key; + vtoc->toc_entry[nentry].offset = offset; + + /* + * By placing a write barrier after filling in the entry and before + * updating the number of entries, we make it safe to read the TOC + * unlocked. 
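+ * (A reader that loads toc_nentry and then issues a read barrier, as
+ * shm_toc_lookup() does, is guaranteed to see fully-initialized key/offset
+ * pairs for every index below the count it read.)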
+ */ + pg_write_barrier(); + + vtoc->toc_nentry++; + + SpinLockRelease(&toc->toc_mutex); +} + +/* + * Look up a TOC entry. + * + * If the key is not found, returns NULL if noError is true, otherwise + * throws elog(ERROR). + * + * Unlike the other functions in this file, this operation acquires no lock; + * it uses only barriers. It probably wouldn't hurt concurrency very much even + * if it did get a lock, but since it's reasonably likely that a group of + * worker processes could each read a series of entries from the same TOC + * right around the same time, there seems to be some value in avoiding it. + */ +void * +shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) +{ + uint32 nentry; + uint32 i; + + /* + * Read the number of entries before we examine any entry. We assume that + * reading a uint32 is atomic. + */ + nentry = toc->toc_nentry; + pg_read_barrier(); + + /* Now search for a matching entry. */ + for (i = 0; i < nentry; ++i) + { + if (toc->toc_entry[i].key == key) + return ((char *) toc) + toc->toc_entry[i].offset; + } + + /* No matching entry was found. */ + if (!noError) + elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p", + key, toc); + return NULL; +} + +/* + * Estimate how much shared memory will be required to store a TOC and its + * dependent data structures. + */ +Size +shm_toc_estimate(shm_toc_estimator *e) +{ + Size sz; + + sz = offsetof(shm_toc, toc_entry); + sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); + sz = add_size(sz, e->space_for_chunks); + + return BUFFERALIGN(sz); +} diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c new file mode 100644 index 0000000..5465fa1 --- /dev/null +++ b/src/backend/storage/ipc/shmem.c @@ -0,0 +1,584 @@ +/*------------------------------------------------------------------------- + * + * shmem.c + * create shared memory and initialize shared memory data structures. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/shmem.c + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES processes share one or more regions of shared memory. + * The shared memory is created by a postmaster and is inherited + * by each backend via fork() (or, in some ports, via other OS-specific + * methods). The routines in this file are used for allocating and + * binding to shared memory data structures. + * + * NOTES: + * (a) There are three kinds of shared memory data structures + * available to POSTGRES: fixed-size structures, queues and hash + * tables. Fixed-size structures contain things like global variables + * for a module and should never be allocated after the shared memory + * initialization phase. Hash tables have a fixed maximum size, but + * their actual size can vary dynamically. When entries are added + * to the table, more space is allocated. Queues link data structures + * that have been allocated either within fixed-size structures or as hash + * buckets. Each shared data structure has a string name to identify + * it (assigned in the module that declares it). + * + * (b) During initialization, each module looks for its + * shared data structures in a hash table called the "Shmem Index". + * If the data structure is not present, the caller can allocate + * a new one and initialize it. 
If the data structure is present, + * the caller "attaches" to the structure by initializing a pointer + * in the local address space. + * The shmem index has two purposes: first, it gives us + * a simple model of how the world looks when a backend process + * initializes. If something is present in the shmem index, + * it is initialized. If it is not, it is uninitialized. Second, + * the shmem index allows us to allocate shared memory on demand + * instead of trying to preallocate structures and hard-wire the + * sizes and locations in header files. If you are using a lot + * of shared memory in a lot of different places (and changing + * things during development), this is important. + * + * (c) In standard Unix-ish environments, individual backends do not + * need to re-establish their local pointers into shared memory, because + * they inherit correct values of those variables via fork() from the + * postmaster. However, this does not work in the EXEC_BACKEND case. + * In ports using EXEC_BACKEND, new backends have to set up their local + * pointers using the method described in (b) above. + * + * (d) memory allocation model: shared memory can never be + * freed, once allocated. Each hash table has its own free list, + * so hash buckets can be reused when an item is deleted. However, + * if one hash table grows very large and then shrinks, its space + * cannot be redistributed to other tables. We could build a simple + * hash bucket garbage collector if need be. Right now, it seems + * unnecessary. + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/builtins.h" + +static void *ShmemAllocRaw(Size size, Size *allocated_size); + +/* shared memory global variables */ + +static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ + +static void *ShmemBase; /* start address of shared memory */ + +static void *ShmemEnd; /* end+1 address of shared memory */ + +slock_t *ShmemLock; /* spinlock for shared memory and LWLock + * allocation */ + +static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ + + +/* + * InitShmemAccess() --- set up basic pointers to shared memory. + * + * Note: the argument should be declared "PGShmemHeader *seghdr", + * but we use void to avoid having to include ipc.h in shmem.h. + */ +void +InitShmemAccess(void *seghdr) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + + ShmemSegHdr = shmhdr; + ShmemBase = (void *) shmhdr; + ShmemEnd = (char *) ShmemBase + shmhdr->totalsize; +} + +/* + * InitShmemAllocation() --- set up shared-memory space allocation. + * + * This should be called only in the postmaster or a standalone backend. + */ +void +InitShmemAllocation(void) +{ + PGShmemHeader *shmhdr = ShmemSegHdr; + char *aligned; + + Assert(shmhdr != NULL); + + /* + * Initialize the spinlock used by ShmemAlloc. We must use + * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. + */ + ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + + SpinLockInit(ShmemLock); + + /* + * Allocations after this point should go through ShmemAlloc, which + * expects to allocate everything on cache line boundaries. Make sure the + * first allocation begins on a cache line boundary. 
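+ * (For example, assuming the default 128-byte PG_CACHE_LINE_SIZE, a freeoffset
+ * of 200 would be advanced to 256 before the first request is served.)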
+ */ + aligned = (char *) + (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset))); + shmhdr->freeoffset = aligned - (char *) shmhdr; + + /* ShmemIndex can't be set up yet (need LWLocks first) */ + shmhdr->index = NULL; + ShmemIndex = (HTAB *) NULL; + + /* + * Initialize ShmemVariableCache for transaction manager. (This doesn't + * really belong here, but not worth moving.) + */ + ShmemVariableCache = (VariableCache) + ShmemAlloc(sizeof(*ShmemVariableCache)); + memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache)); +} + +/* + * ShmemAlloc -- allocate max-aligned chunk from shared memory + * + * Throws error if request cannot be satisfied. + * + * Assumes ShmemLock and ShmemSegHdr are initialized. + */ +void * +ShmemAlloc(Size size) +{ + void *newSpace; + Size allocated_size; + + newSpace = ShmemAllocRaw(size, &allocated_size); + if (!newSpace) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + return newSpace; +} + +/* + * ShmemAllocNoError -- allocate max-aligned chunk from shared memory + * + * As ShmemAlloc, but returns NULL if out of space, rather than erroring. + */ +void * +ShmemAllocNoError(Size size) +{ + Size allocated_size; + + return ShmemAllocRaw(size, &allocated_size); +} + +/* + * ShmemAllocRaw -- allocate align chunk and return allocated size + * + * Also sets *allocated_size to the number of bytes allocated, which will + * be equal to the number requested plus any padding we choose to add. + */ +static void * +ShmemAllocRaw(Size size, Size *allocated_size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure all space is adequately aligned. We used to only MAXALIGN this + * space but experience has proved that on modern systems that is not good + * enough. Many parts of the system are very sensitive to critical data + * structures getting split across cache line boundaries. To avoid that, + * attempt to align the beginning of the allocation to a cache line + * boundary. The calling code will still need to be careful about how it + * uses the allocated space - e.g. by padding each element in an array of + * structures out to a power-of-two size - but without this, even that + * won't be sufficient. + */ + size = CACHELINEALIGN(size); + *allocated_size = size; + + Assert(ShmemSegHdr != NULL); + + SpinLockAcquire(ShmemLock); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree <= ShmemSegHdr->totalsize) + { + newSpace = (void *) ((char *) ShmemBase + newStart); + ShmemSegHdr->freeoffset = newFree; + } + else + newSpace = NULL; + + SpinLockRelease(ShmemLock); + + /* note this assert is okay with newSpace == NULL */ + Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory + * + * Allocate space without locking ShmemLock. This should be used for, + * and only for, allocations that must happen before ShmemLock is ready. + * + * We consider maxalign, rather than cachealign, sufficient here. + */ +void * +ShmemAllocUnlocked(Size size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure allocated space is adequately aligned. 
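+ * (MAXALIGN, typically 8 bytes, suffices here; cache-line alignment is only
+ * applied on the regular ShmemAllocRaw() path used once ShmemLock exists.)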
+ */ + size = MAXALIGN(size); + + Assert(ShmemSegHdr != NULL); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree > ShmemSegHdr->totalsize) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + ShmemSegHdr->freeoffset = newFree; + + newSpace = (void *) ((char *) ShmemBase + newStart); + + Assert(newSpace == (void *) MAXALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAddrIsValid -- test if an address refers to shared memory + * + * Returns true if the pointer points within the shared memory segment. + */ +bool +ShmemAddrIsValid(const void *addr) +{ + return (addr >= ShmemBase) && (addr < ShmemEnd); +} + +/* + * InitShmemIndex() --- set up or attach to shmem index table. + */ +void +InitShmemIndex(void) +{ + HASHCTL info; + + /* + * Create the shared memory shmem index. + * + * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex + * hashtable to exist already, we have a bit of a circularity problem in + * initializing the ShmemIndex itself. The special "ShmemIndex" hash + * table name will tell ShmemInitStruct to fake it. + */ + info.keysize = SHMEM_INDEX_KEYSIZE; + info.entrysize = sizeof(ShmemIndexEnt); + + ShmemIndex = ShmemInitHash("ShmemIndex", + SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE, + &info, + HASH_ELEM | HASH_STRINGS); +} + +/* + * ShmemInitHash -- Create and initialize, or attach to, a + * shared memory hash table. + * + * We assume caller is doing some kind of synchronization + * so that two processes don't try to create/initialize the same + * table at once. (In practice, all creations are done in the postmaster + * process; child processes should always be attaching to existing tables.) + * + * max_size is the estimated maximum number of hashtable entries. This is + * not a hard limit, but the access efficiency will degrade if it is + * exceeded substantially (since it's used to compute directory size and + * the hash table buckets will get overfull). + * + * init_size is the number of hashtable entries to preallocate. For a table + * whose maximum size is certain, this should be equal to max_size; that + * ensures that no run-time out-of-shared-memory failures can occur. + * + * *infoP and hash_flags must specify at least the entry sizes and key + * comparison semantics (see hash_create()). Flag bits and values specific + * to shared-memory hash tables are added here, except that callers may + * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE. + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +HTAB * +ShmemInitHash(const char *name, /* table string name for shmem index */ + long init_size, /* initial table size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags) /* info about infoP */ +{ + bool found; + void *location; + + /* + * Hash tables allocated in shared memory have a fixed directory; it can't + * grow or other backends wouldn't be able to find it. So, make sure we + * make it big enough to start with. + * + * The shared memory allocator must be specified too. 
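+ *
+ * For example, a caller typically just fills in the key and entry sizes and
+ * lets this routine supply the shared-memory specifics, roughly:
+ *
+ *		info.keysize = sizeof(MyKey);
+ *		info.entrysize = sizeof(MyEntry);
+ *		htab = ShmemInitHash("My hash", init_size, max_size, &info,
+ *							 HASH_ELEM | HASH_BLOBS);
+ *
+ * (MyKey, MyEntry, and "My hash" are placeholders, not real structures.)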
+ */ + infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size); + infoP->alloc = ShmemAllocNoError; + hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE; + + /* look it up in the shmem index */ + location = ShmemInitStruct(name, + hash_get_shared_size(infoP, hash_flags), + &found); + + /* + * if it already exists, attach to it rather than allocate and initialize + * new space + */ + if (found) + hash_flags |= HASH_ATTACH; + + /* Pass location of hashtable header to hash_create */ + infoP->hctl = (HASHHDR *) location; + + return hash_create(name, init_size, infoP, hash_flags); +} + +/* + * ShmemInitStruct -- Create/attach to a structure in shared memory. + * + * This is called during initialization to find or allocate + * a data structure in shared memory. If no other process + * has created the structure, this routine allocates space + * for it. If it exists already, a pointer to the existing + * structure is returned. + * + * Returns: pointer to the object. *foundPtr is set true if the object was + * already in the shmem index (hence, already initialized). + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +void * +ShmemInitStruct(const char *name, Size size, bool *foundPtr) +{ + ShmemIndexEnt *result; + void *structPtr; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + if (!ShmemIndex) + { + PGShmemHeader *shmemseghdr = ShmemSegHdr; + + /* Must be trying to create/attach to ShmemIndex itself */ + Assert(strcmp(name, "ShmemIndex") == 0); + + if (IsUnderPostmaster) + { + /* Must be initializing a (non-standalone) backend */ + Assert(shmemseghdr->index != NULL); + structPtr = shmemseghdr->index; + *foundPtr = true; + } + else + { + /* + * If the shmem index doesn't exist, we are bootstrapping: we must + * be trying to init the shmem index itself. + * + * Notice that the ShmemIndexLock is released before the shmem + * index has been initialized. This should be OK because no other + * process can be accessing shared memory yet. + */ + Assert(shmemseghdr->index == NULL); + structPtr = ShmemAlloc(size); + shmemseghdr->index = structPtr; + *foundPtr = false; + } + LWLockRelease(ShmemIndexLock); + return structPtr; + } + + /* look it up in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr); + + if (!result) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("could not create ShmemIndex entry for data structure \"%s\"", + name))); + } + + if (*foundPtr) + { + /* + * Structure is in the shmem index so someone else has allocated it + * already. The size better be the same as the size we are trying to + * initialize to, or there is a name conflict (or worse). + */ + if (result->size != size) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errmsg("ShmemIndex entry size is wrong for data structure" + " \"%s\": expected %zu, actual %zu", + name, size, result->size))); + } + structPtr = result->location; + } + else + { + Size allocated_size; + + /* It isn't in the table yet. 
allocate and initialize it */ + structPtr = ShmemAllocRaw(size, &allocated_size); + if (structPtr == NULL) + { + /* out of memory; remove the failed ShmemIndex entry */ + hash_search(ShmemIndex, name, HASH_REMOVE, NULL); + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough shared memory for data structure" + " \"%s\" (%zu bytes requested)", + name, size))); + } + result->size = size; + result->allocated_size = allocated_size; + result->location = structPtr; + } + + LWLockRelease(ShmemIndexLock); + + Assert(ShmemAddrIsValid(structPtr)); + + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + + +/* + * Add two Size values, checking for overflow + */ +Size +add_size(Size s1, Size s2) +{ + Size result; + + result = s1 + s2; + /* We are assuming Size is an unsigned type here... */ + if (result < s1 || result < s2) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* + * Multiply two Size values, checking for overflow + */ +Size +mul_size(Size s1, Size s2) +{ + Size result; + + if (s1 == 0 || s2 == 0) + return 0; + result = s1 * s2; + /* We are assuming Size is an unsigned type here... */ + if (result / s2 != s1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* SQL SRF showing allocated shared memory */ +Datum +pg_get_shmem_allocations(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SIZES_COLS 4 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + HASH_SEQ_STATUS hstat; + ShmemIndexEnt *ent; + Size named_allocated = 0; + Datum values[PG_GET_SHMEM_SIZES_COLS]; + bool nulls[PG_GET_SHMEM_SIZES_COLS]; + + InitMaterializedSRF(fcinfo, 0); + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + hash_seq_init(&hstat, ShmemIndex); + + /* output all allocated entries */ + memset(nulls, 0, sizeof(nulls)); + while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) + { + values[0] = CStringGetTextDatum(ent->key); + values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); + values[2] = Int64GetDatum(ent->size); + values[3] = Int64GetDatum(ent->allocated_size); + named_allocated += ent->allocated_size; + + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, + values, nulls); + } + + /* output shared memory allocated but not counted via the shmem index */ + values[0] = CStringGetTextDatum(""); + nulls[1] = true; + values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[3] = values[2]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + + /* output as-of-yet unused shared memory */ + nulls[0] = true; + values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + nulls[1] = false; + values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[3] = values[2]; + tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls); + + LWLockRelease(ShmemIndexLock); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c new file mode 100644 index 0000000..b595c2d --- /dev/null +++ b/src/backend/storage/ipc/signalfuncs.c @@ -0,0 +1,318 @@ +/*------------------------------------------------------------------------- + * + * signalfuncs.c + * Functions for signaling backends + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, 
Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/signalfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "catalog/pg_authid.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/syslogger.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/acl.h" +#include "utils/builtins.h" + + +/* + * Send a signal to another backend. + * + * The signal is delivered if the user is either a superuser or the same + * role as the backend being signaled. For "dangerous" signals, an explicit + * check for superuser needs to be done prior to calling this function. + * + * Returns 0 on success, 1 on general failure, 2 on normal permission error + * and 3 if the caller needs to be a superuser. + * + * In the event of a general failure (return code 1), a warning message will + * be emitted. For permission errors, doing that is the responsibility of + * the caller. + */ +#define SIGNAL_BACKEND_SUCCESS 0 +#define SIGNAL_BACKEND_ERROR 1 +#define SIGNAL_BACKEND_NOPERMISSION 2 +#define SIGNAL_BACKEND_NOSUPERUSER 3 +static int +pg_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + /* + * BackendPidGetProc returns NULL if the pid isn't valid; but by the time + * we reach kill(), a process for which we get a valid proc here might + * have terminated on its own. There's no way to acquire a lock on an + * arbitrary process to prevent that. But since so far all the callers of + * this mechanism involve some request for ending the process anyway, that + * it might end on its own first is not a problem. + * + * Note that proc will also be NULL if the pid refers to an auxiliary + * process or the postmaster (neither of which can be signaled via + * pg_signal_backend()). + */ + if (proc == NULL) + { + /* + * This is just a warning so a loop-through-resultset will not abort + * if one backend terminated on its own during the run. + */ + ereport(WARNING, + (errmsg("PID %d is not a PostgreSQL backend process", pid))); + + return SIGNAL_BACKEND_ERROR; + } + + /* + * Only allow superusers to signal superuser-owned backends. Any process + * not advertising a role might have the importance of a superuser-owned + * backend, so treat it that way. + */ + if ((!OidIsValid(proc->roleId) || superuser_arg(proc->roleId)) && + !superuser()) + return SIGNAL_BACKEND_NOSUPERUSER; + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + return SIGNAL_BACKEND_NOPERMISSION; + + /* + * Can the process we just validated above end, followed by the pid being + * recycled for a new process, before reaching here? Then we'd be trying + * to kill the wrong thing. Seems near impossible when sequential pid + * assignment and wraparound is used. Perhaps it could happen on a system + * where pid re-use is randomized. That race condition possibility seems + * too unlikely to worry about. + */ + + /* If we have setsid(), signal the backend's whole process group */ +#ifdef HAVE_SETSID + if (kill(-pid, sig)) +#else + if (kill(pid, sig)) +#endif + { + /* Again, just a warning to allow loops */ + ereport(WARNING, + (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Signal to cancel a backend process. 
This is allowed if you are a member of + * the role whose process is being canceled. + * + * Note that only superusers can signal superuser-owned processes. + */ +Datum +pg_cancel_backend(PG_FUNCTION_ARGS) +{ + int r = pg_signal_backend(PG_GETARG_INT32(0), SIGINT); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to cancel query"), + errdetail("Only roles with the %s attribute may cancel queries of roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to cancel query"), + errdetail("Only roles with privileges of the role whose query is being canceled or with privileges of the \"%s\" role may cancel this query.", + "pg_signal_backend"))); + + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Wait until there is no backend process with the given PID and return true. + * On timeout, a warning is emitted and false is returned. + */ +static bool +pg_wait_until_termination(int pid, int64 timeout) +{ + /* + * Wait in steps of waittime milliseconds until this function exits or + * timeout. + */ + int64 waittime = 100; + + /* + * Initially remaining time is the entire timeout specified by the user. + */ + int64 remainingtime = timeout; + + /* + * Check existence of the backend. If the backend still exists, then wait + * for waittime milliseconds, again check for the existence. Repeat this + * until timeout or an error occurs or a pending interrupt such as query + * cancel gets processed. + */ + do + { + if (remainingtime < waittime) + waittime = remainingtime; + + if (kill(pid, 0) == -1) + { + if (errno == ESRCH) + return true; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not check the existence of the backend with PID %d: %m", + pid))); + } + + /* Process interrupts, if any, before waiting */ + CHECK_FOR_INTERRUPTS(); + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + waittime, + WAIT_EVENT_BACKEND_TERMINATION); + + ResetLatch(MyLatch); + + remainingtime -= waittime; + } while (remainingtime > 0); + + ereport(WARNING, + (errmsg_plural("backend with PID %d did not terminate within %lld millisecond", + "backend with PID %d did not terminate within %lld milliseconds", + timeout, + pid, (long long int) timeout))); + + return false; +} + +/* + * Send a signal to terminate a backend process. This is allowed if you are a + * member of the role whose process is being terminated. If the timeout input + * argument is 0, then this function just signals the backend and returns + * true. If timeout is nonzero, then it waits until no process has the given + * PID; if the process ends within the timeout, true is returned, and if the + * timeout is exceeded, a warning is emitted and false is returned. + * + * Note that only superusers can signal superuser-owned processes. 
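+ *
+ * For example, from SQL one might terminate idle sessions and wait up to five
+ * seconds for each of them to go away with something like:
+ *
+ *		SELECT pg_terminate_backend(pid, 5000)
+ *		FROM pg_stat_activity
+ *		WHERE state = 'idle' AND pid <> pg_backend_pid();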
+ */ +Datum +pg_terminate_backend(PG_FUNCTION_ARGS) +{ + int pid; + int r; + int timeout; /* milliseconds */ + + pid = PG_GETARG_INT32(0); + timeout = PG_GETARG_INT64(1); + + if (timeout < 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"timeout\" must not be negative"))); + + r = pg_signal_backend(pid, SIGTERM); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to terminate process"), + errdetail("Only roles with the %s attribute may terminate processes of roles with the %s attribute.", + "SUPERUSER", "SUPERUSER"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied to terminate process"), + errdetail("Only roles with privileges of the role whose process is being terminated or with privileges of the \"%s\" role may terminate this process.", + "pg_signal_backend"))); + + /* Wait only on success and if actually requested */ + if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0) + PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout)); + else + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Signal to reload the database configuration + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_reload_conf(PG_FUNCTION_ARGS) +{ + if (kill(PostmasterPid, SIGHUP)) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + PG_RETURN_BOOL(false); + } + + PG_RETURN_BOOL(true); +} + + +/* + * Rotate log file + * + * This function is kept to support adminpack 1.0. + */ +Datum +pg_rotate_logfile(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to rotate log files with adminpack 1.0"), + /* translator: %s is a SQL function name */ + errhint("Consider using %s, which is part of core, instead.", + "pg_logfile_rotate()"))); + + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} + +/* + * Rotate log file + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_rotate_logfile_v2(PG_FUNCTION_ARGS) +{ + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c new file mode 100644 index 0000000..b405f08 --- /dev/null +++ b/src/backend/storage/ipc/sinval.c @@ -0,0 +1,205 @@ +/*------------------------------------------------------------------------- + * + * sinval.c + * POSTGRES shared cache invalidation communication code. 
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinval.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/sinvaladt.h" +#include "utils/inval.h" + + +uint64 SharedInvalidMessageCounter; + + +/* + * Because backends sitting idle will not be reading sinval events, we + * need a way to give an idle backend a swift kick in the rear and make + * it catch up before the sinval queue overflows and forces it to go + * through a cache reset exercise. This is done by sending + * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind. + * + * The signal handler will set an interrupt pending flag and will set the + * processes latch. Whenever starting to read from the client, or when + * interrupted while doing so, ProcessClientReadInterrupt() will call + * ProcessCatchupEvent(). + */ +volatile sig_atomic_t catchupInterruptPending = false; + + +/* + * SendSharedInvalidMessages + * Add shared-cache-invalidation message(s) to the global SI message queue. + */ +void +SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n) +{ + SIInsertDataEntries(msgs, n); +} + +/* + * ReceiveSharedInvalidMessages + * Process shared-cache-invalidation messages waiting for this backend + * + * We guarantee to process all messages that had been queued before the + * routine was entered. It is of course possible for more messages to get + * queued right after our last SIGetDataEntries call. + * + * NOTE: it is entirely possible for this routine to be invoked recursively + * as a consequence of processing inside the invalFunction or resetFunction. + * Furthermore, such a recursive call must guarantee that all outstanding + * inval messages have been processed before it exits. This is the reason + * for the strange-looking choice to use a statically allocated buffer array + * and counters; it's so that a recursive call can process messages already + * sucked out of sinvaladt.c. + */ +void +ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg), + void (*resetFunction) (void)) +{ +#define MAXINVALMSGS 32 + static SharedInvalidationMessage messages[MAXINVALMSGS]; + + /* + * We use volatile here to prevent bugs if a compiler doesn't realize that + * recursion is a possibility ... 
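+ * (without it, a compiler could keep nextmsg/nummsgs in registers across the
+ * invalFunction call and fail to notice messages consumed by a recursive
+ * invocation).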
+ */ + static volatile int nextmsg = 0; + static volatile int nummsgs = 0; + + /* Deal with any messages still pending from an outer recursion */ + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + do + { + int getResult; + + nextmsg = nummsgs = 0; + + /* Try to get some more messages */ + getResult = SIGetDataEntries(messages, MAXINVALMSGS); + + if (getResult < 0) + { + /* got a reset message */ + elog(DEBUG4, "cache state reset"); + SharedInvalidMessageCounter++; + resetFunction(); + break; /* nothing more to do */ + } + + /* Process them, being wary that a recursive call might eat some */ + nextmsg = 0; + nummsgs = getResult; + + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + /* + * We only need to loop if the last SIGetDataEntries call (which might + * have been within a recursive call) returned a full buffer. + */ + } while (nummsgs == MAXINVALMSGS); + + /* + * We are now caught up. If we received a catchup signal, reset that + * flag, and call SICleanupQueue(). This is not so much because we need + * to flush dead messages right now, as that we want to pass on the + * catchup signal to the next slowest backend. "Daisy chaining" the + * catchup signal this way avoids creating spikes in system load for what + * should be just a background maintenance activity. + */ + if (catchupInterruptPending) + { + catchupInterruptPending = false; + elog(DEBUG4, "sinval catchup complete, cleaning queue"); + SICleanupQueue(false, 0); + } +} + + +/* + * HandleCatchupInterrupt + * + * This is called when PROCSIG_CATCHUP_INTERRUPT is received. + * + * We used to directly call ProcessCatchupEvent directly when idle. These days + * we just set a flag to do it later and notify the process of that fact by + * setting the process's latch. + */ +void +HandleCatchupInterrupt(void) +{ + /* + * Note: this is called by a SIGNAL HANDLER. You must be very wary what + * you do here. + */ + + catchupInterruptPending = true; + + /* make sure the event is processed in due course */ + SetLatch(MyLatch); +} + +/* + * ProcessCatchupInterrupt + * + * The portion of catchup interrupt handling that runs outside of the signal + * handler, which allows it to actually process pending invalidations. + */ +void +ProcessCatchupInterrupt(void) +{ + while (catchupInterruptPending) + { + /* + * What we need to do here is cause ReceiveSharedInvalidMessages() to + * run, which will do the necessary work and also reset the + * catchupInterruptPending flag. If we are inside a transaction we + * can just call AcceptInvalidationMessages() to do this. If we + * aren't, we start and immediately end a transaction; the call to + * AcceptInvalidationMessages() happens down inside transaction start. + * + * It is awfully tempting to just call AcceptInvalidationMessages() + * without the rest of the xact start/stop overhead, and I think that + * would actually work in the normal case; but I am not sure that + * things would clean up nicely if we got an error partway through. 
+ */ + if (IsTransactionOrTransactionBlock()) + { + elog(DEBUG4, "ProcessCatchupEvent inside transaction"); + AcceptInvalidationMessages(); + } + else + { + elog(DEBUG4, "ProcessCatchupEvent outside transaction"); + StartTransactionCommand(); + CommitTransactionCommand(); + } + } +} diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c new file mode 100644 index 0000000..3d97c75 --- /dev/null +++ b/src/backend/storage/ipc/sinvaladt.c @@ -0,0 +1,791 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.c + * POSTGRES shared cache invalidation data manager. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinvaladt.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/transam.h" +#include "miscadmin.h" +#include "storage/backendid.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" + +/* + * Conceptually, the shared cache invalidation messages are stored in an + * infinite array, where maxMsgNum is the next array subscript to store a + * submitted message in, minMsgNum is the smallest array subscript containing + * a message not yet read by all backends, and we always have maxMsgNum >= + * minMsgNum. (They are equal when there are no messages pending.) For each + * active backend, there is a nextMsgNum pointer indicating the next message it + * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every + * backend. + * + * (In the current implementation, minMsgNum is a lower bound for the + * per-process nextMsgNum values, but it isn't rigorously kept equal to the + * smallest nextMsgNum --- it may lag behind. We only update it when + * SICleanupQueue is called, and we try not to do that often.) + * + * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES + * entries. We translate MsgNum values into circular-buffer indexes by + * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as + * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum + * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space + * in the buffer. If the buffer does overflow, we recover by setting the + * "reset" flag for each backend that has fallen too far behind. A backend + * that is in "reset" state is ignored while determining minMsgNum. When + * it does finally attempt to receive inval messages, it must discard all + * its invalidatable state, since it won't know what it missed. + * + * To reduce the probability of needing resets, we send a "catchup" interrupt + * to any backend that seems to be falling unreasonably far behind. The + * normal behavior is that at most one such interrupt is in flight at a time; + * when a backend completes processing a catchup interrupt, it executes + * SICleanupQueue, which will signal the next-furthest-behind backend if + * needed. This avoids undue contention from multiple backends all trying + * to catch up at once. However, the furthest-back backend might be stuck + * in a state where it can't catch up. Eventually it will get reset, so it + * won't cause any more problems for anyone but itself. 
But we don't want + * to find that a bunch of other backends are now too close to the reset + * threshold to be saved. So SICleanupQueue is designed to occasionally + * send extra catchup interrupts as the queue gets fuller, to backends that + * are far behind and haven't gotten one yet. As long as there aren't a lot + * of "stuck" backends, we won't need a lot of extra interrupts, since ones + * that aren't stuck will propagate their interrupts to the next guy. + * + * We would have problems if the MsgNum values overflow an integer, so + * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND + * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be + * large so that we don't need to do this often. It must be a multiple of + * MAXNUMMESSAGES so that the existing circular-buffer entries don't need + * to be moved when we do it. + * + * Access to the shared sinval array is protected by two locks, SInvalReadLock + * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this + * authorizes them to modify their own ProcState but not to modify or even + * look at anyone else's. When we need to perform array-wide updates, + * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to + * lock out all readers. Writers take SInvalWriteLock (always in exclusive + * mode) to serialize adding messages to the queue. Note that a writer + * can operate in parallel with one or more readers, because the writer + * has no need to touch anyone's ProcState, except in the infrequent cases + * when SICleanupQueue is needed. The only point of overlap is that + * the writer wants to change maxMsgNum while readers need to read it. + * We deal with that by having a spinlock that readers must take for just + * long enough to read maxMsgNum, while writers take it for just long enough + * to write maxMsgNum. (The exact rule is that you need the spinlock to + * read maxMsgNum if you are not holding SInvalWriteLock, and you need the + * spinlock to write maxMsgNum unless you are holding both locks.) + * + * Note: since maxMsgNum is an int and hence presumably atomically readable/ + * writable, the spinlock might seem unnecessary. The reason it is needed + * is to provide a memory barrier: we need to be sure that messages written + * to the array are actually there before maxMsgNum is increased, and that + * readers will see that data after fetching maxMsgNum. Multiprocessors + * that have weak memory-ordering guarantees can fail without the memory + * barrier instructions that are included in the spinlock sequences. + */ + + +/* + * Configurable parameters. + * + * MAXNUMMESSAGES: max number of shared-inval messages we can buffer. + * Must be a power of 2 for speed. + * + * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow. + * Must be a multiple of MAXNUMMESSAGES. Should be large. + * + * CLEANUP_MIN: the minimum number of messages that must be in the buffer + * before we bother to call SICleanupQueue. + * + * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once + * we exceed CLEANUP_MIN. Should be a power of 2 for speed. + * + * SIG_THRESHOLD: the minimum number of messages a backend must have fallen + * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT. + * + * WRITE_QUANTUM: the max number of messages to push into the buffer per + * iteration of SIInsertDataEntries. Noncritical but should be less than + * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once + * per iteration. 
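+ *
+ * With the values below, for instance, the buffer holds 4096 messages, cleanup
+ * is first considered once 2048 messages are pending, and an unsignaled backend
+ * more than 2048 messages behind becomes a candidate for a catchup interrupt.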
+ */ + +#define MAXNUMMESSAGES 4096 +#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144) +#define CLEANUP_MIN (MAXNUMMESSAGES / 2) +#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16) +#define SIG_THRESHOLD (MAXNUMMESSAGES / 2) +#define WRITE_QUANTUM 64 + +/* Per-backend state in shared invalidation structure */ +typedef struct ProcState +{ + /* procPid is zero in an inactive ProcState array entry. */ + pid_t procPid; /* PID of backend, for signaling */ + PGPROC *proc; /* PGPROC of backend */ + /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */ + int nextMsgNum; /* next message number to read */ + bool resetState; /* backend needs to reset its state */ + bool signaled; /* backend has been sent catchup signal */ + bool hasMessages; /* backend has unread messages */ + + /* + * Backend only sends invalidations, never receives them. This only makes + * sense for Startup process during recovery because it doesn't maintain a + * relcache, yet it fires inval messages to allow query backends to see + * schema changes. + */ + bool sendOnly; /* backend only sends, never receives */ + + /* + * Next LocalTransactionId to use for each idle backend slot. We keep + * this here because it is indexed by BackendId and it is convenient to + * copy the value to and from local memory when MyBackendId is set. It's + * meaningless in an active ProcState entry. + */ + LocalTransactionId nextLXID; +} ProcState; + +/* Shared cache invalidation memory segment */ +typedef struct SISeg +{ + /* + * General state information + */ + int minMsgNum; /* oldest message still needed */ + int maxMsgNum; /* next message number to be assigned */ + int nextThreshold; /* # of messages to call SICleanupQueue */ + int lastBackend; /* index of last active procState entry, +1 */ + int maxBackends; /* size of procState array */ + + slock_t msgnumLock; /* spinlock protecting maxMsgNum */ + + /* + * Circular buffer holding shared-inval messages + */ + SharedInvalidationMessage buffer[MAXNUMMESSAGES]; + + /* + * Per-backend invalidation state info (has MaxBackends entries). + */ + ProcState procState[FLEXIBLE_ARRAY_MEMBER]; +} SISeg; + +static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */ + + +static LocalTransactionId nextLocalTransactionId; + +static void CleanupInvalidationState(int status, Datum arg); + + +/* + * SInvalShmemSize --- return shared-memory space needed + */ +Size +SInvalShmemSize(void) +{ + Size size; + + size = offsetof(SISeg, procState); + + /* + * In Hot Standby mode, the startup process requests a procState array + * slot using InitRecoveryTransactionEnvironment(). Even though + * MaxBackends doesn't account for the startup process, it is guaranteed + * to get a free slot. This is because the autovacuum launcher and worker + * processes, which are included in MaxBackends, are not started in Hot + * Standby mode. 
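+ * (The result is offsetof(SISeg, procState) plus MaxBackends ProcState slots,
+ * computed with add_size/mul_size so that any overflow raises an error rather
+ * than silently wrapping.)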
+ */ + size = add_size(size, mul_size(sizeof(ProcState), MaxBackends)); + + return size; +} + +/* + * CreateSharedInvalidationState + * Create and initialize the SI message buffer + */ +void +CreateSharedInvalidationState(void) +{ + int i; + bool found; + + /* Allocate space in shared memory */ + shmInvalBuffer = (SISeg *) + ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found); + if (found) + return; + + /* Clear message counters, save size of procState array, init spinlock */ + shmInvalBuffer->minMsgNum = 0; + shmInvalBuffer->maxMsgNum = 0; + shmInvalBuffer->nextThreshold = CLEANUP_MIN; + shmInvalBuffer->lastBackend = 0; + shmInvalBuffer->maxBackends = MaxBackends; + SpinLockInit(&shmInvalBuffer->msgnumLock); + + /* The buffer[] array is initially all unused, so we need not fill it */ + + /* Mark all backends inactive, and initialize nextLXID */ + for (i = 0; i < shmInvalBuffer->maxBackends; i++) + { + shmInvalBuffer->procState[i].procPid = 0; /* inactive */ + shmInvalBuffer->procState[i].proc = NULL; + shmInvalBuffer->procState[i].nextMsgNum = 0; /* meaningless */ + shmInvalBuffer->procState[i].resetState = false; + shmInvalBuffer->procState[i].signaled = false; + shmInvalBuffer->procState[i].hasMessages = false; + shmInvalBuffer->procState[i].nextLXID = InvalidLocalTransactionId; + } +} + +/* + * SharedInvalBackendInit + * Initialize a new backend to operate on the sinval buffer + */ +void +SharedInvalBackendInit(bool sendOnly) +{ + int index; + ProcState *stateP = NULL; + SISeg *segP = shmInvalBuffer; + + /* + * This can run in parallel with read operations, but not with write + * operations, since SIInsertDataEntries relies on lastBackend to set + * hasMessages appropriately. + */ + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* Look for a free entry in the procState array */ + for (index = 0; index < segP->lastBackend; index++) + { + if (segP->procState[index].procPid == 0) /* inactive slot? */ + { + stateP = &segP->procState[index]; + break; + } + } + + if (stateP == NULL) + { + if (segP->lastBackend < segP->maxBackends) + { + stateP = &segP->procState[segP->lastBackend]; + Assert(stateP->procPid == 0); + segP->lastBackend++; + } + else + { + /* + * out of procState slots: MaxBackends exceeded -- report normally + */ + MyBackendId = InvalidBackendId; + LWLockRelease(SInvalWriteLock); + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + } + + MyBackendId = (stateP - &segP->procState[0]) + 1; + + /* Advertise assigned backend ID in MyProc */ + MyProc->backendId = MyBackendId; + + /* Fetch next local transaction ID into local memory */ + nextLocalTransactionId = stateP->nextLXID; + + /* mark myself active, with all extant messages already read */ + stateP->procPid = MyProcPid; + stateP->proc = MyProc; + stateP->nextMsgNum = segP->maxMsgNum; + stateP->resetState = false; + stateP->signaled = false; + stateP->hasMessages = false; + stateP->sendOnly = sendOnly; + + LWLockRelease(SInvalWriteLock); + + /* register exit routine to mark my entry inactive at exit */ + on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP)); + + elog(DEBUG4, "my backend ID is %d", MyBackendId); +} + +/* + * CleanupInvalidationState + * Mark the current backend as no longer active. + * + * This function is called via on_shmem_exit() during backend shutdown. + * + * arg is really of type "SISeg*". 
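+ * (SharedInvalBackendInit() registers it with on_shmem_exit(), passing the
+ * segment pointer wrapped in a Datum.)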
+ */ +static void +CleanupInvalidationState(int status, Datum arg) +{ + SISeg *segP = (SISeg *) DatumGetPointer(arg); + ProcState *stateP; + int i; + + Assert(PointerIsValid(segP)); + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + stateP = &segP->procState[MyBackendId - 1]; + + /* Update next local transaction ID for next holder of this backendID */ + stateP->nextLXID = nextLocalTransactionId; + + /* Mark myself inactive */ + stateP->procPid = 0; + stateP->proc = NULL; + stateP->nextMsgNum = 0; + stateP->resetState = false; + stateP->signaled = false; + + /* Recompute index of last active backend */ + for (i = segP->lastBackend; i > 0; i--) + { + if (segP->procState[i - 1].procPid != 0) + break; + } + segP->lastBackend = i; + + LWLockRelease(SInvalWriteLock); +} + +/* + * BackendIdGetProc + * Get the PGPROC structure for a backend, given the backend ID. + * The result may be out of date arbitrarily quickly, so the caller + * must be careful about how this information is used. NULL is + * returned if the backend is not active. + */ +PGPROC * +BackendIdGetProc(int backendID) +{ + PGPROC *result = NULL; + SISeg *segP = shmInvalBuffer; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + + result = stateP->proc; + } + + LWLockRelease(SInvalWriteLock); + + return result; +} + +/* + * BackendIdGetTransactionIds + * Get the xid, xmin, nsubxid and overflow status of the backend. The + * result may be out of date arbitrarily quickly, so the caller must be + * careful about how this information is used. + */ +void +BackendIdGetTransactionIds(int backendID, TransactionId *xid, + TransactionId *xmin, int *nsubxid, bool *overflowed) +{ + SISeg *segP = shmInvalBuffer; + + *xid = InvalidTransactionId; + *xmin = InvalidTransactionId; + *nsubxid = 0; + *overflowed = false; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + PGPROC *proc = stateP->proc; + + if (proc != NULL) + { + *xid = proc->xid; + *xmin = proc->xmin; + *nsubxid = proc->subxidStatus.count; + *overflowed = proc->subxidStatus.overflowed; + } + } + + LWLockRelease(SInvalWriteLock); +} + +/* + * SIInsertDataEntries + * Add new invalidation message(s) to the buffer. + */ +void +SIInsertDataEntries(const SharedInvalidationMessage *data, int n) +{ + SISeg *segP = shmInvalBuffer; + + /* + * N can be arbitrarily large. We divide the work into groups of no more + * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for + * an unreasonably long time. (This is not so much because we care about + * letting in other writers, as that some just-caught-up backend might be + * trying to do SICleanupQueue to pass on its signal, and we don't want it + * to have to wait a long time.) Also, we need to consider calling + * SICleanupQueue every so often. + */ + while (n > 0) + { + int nthistime = Min(n, WRITE_QUANTUM); + int numMsgs; + int max; + int i; + + n -= nthistime; + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* + * If the buffer is full, we *must* acquire some space. Clean the + * queue and reset anyone who is preventing space from being freed. + * Otherwise, clean the queue only when it's exceeded the next + * fullness threshold. 
We have to loop and recheck the buffer state + * after any call of SICleanupQueue. + */ + for (;;) + { + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs + nthistime > MAXNUMMESSAGES || + numMsgs >= segP->nextThreshold) + SICleanupQueue(true, nthistime); + else + break; + } + + /* + * Insert new message(s) into proper slot of circular buffer + */ + max = segP->maxMsgNum; + while (nthistime-- > 0) + { + segP->buffer[max % MAXNUMMESSAGES] = *data++; + max++; + } + + /* Update current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + segP->maxMsgNum = max; + SpinLockRelease(&segP->msgnumLock); + + /* + * Now that the maxMsgNum change is globally visible, we give everyone + * a swift kick to make sure they read the newly added messages. + * Releasing SInvalWriteLock will enforce a full memory barrier, so + * these (unlocked) changes will be committed to memory before we exit + * the function. + */ + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + + stateP->hasMessages = true; + } + + LWLockRelease(SInvalWriteLock); + } +} + +/* + * SIGetDataEntries + * get next SI message(s) for current backend, if there are any + * + * Possible return values: + * 0: no SI message available + * n>0: next n SI messages have been extracted into data[] + * -1: SI reset message extracted + * + * If the return value is less than the array size "datasize", the caller + * can assume that there are no more SI messages after the one(s) returned. + * Otherwise, another call is needed to collect more messages. + * + * NB: this can run in parallel with other instances of SIGetDataEntries + * executing on behalf of other backends, since each instance will modify only + * fields of its own backend's ProcState, and no instance will look at fields + * of other backends' ProcStates. We express this by grabbing SInvalReadLock + * in shared mode. Note that this is not exactly the normal (read-only) + * interpretation of a shared lock! Look closely at the interactions before + * allowing SInvalReadLock to be grabbed in shared mode for any other reason! + * + * NB: this can also run in parallel with SIInsertDataEntries. It is not + * guaranteed that we will return any messages added after the routine is + * entered. + * + * Note: we assume that "datasize" is not so large that it might be important + * to break our hold on SInvalReadLock into segments. + */ +int +SIGetDataEntries(SharedInvalidationMessage *data, int datasize) +{ + SISeg *segP; + ProcState *stateP; + int max; + int n; + + segP = shmInvalBuffer; + stateP = &segP->procState[MyBackendId - 1]; + + /* + * Before starting to take locks, do a quick, unlocked test to see whether + * there can possibly be anything to read. On a multiprocessor system, + * it's possible that this load could migrate backwards and occur before + * we actually enter this function, so we might miss a sinval message that + * was just added by some other processor. But they can't migrate + * backwards over a preceding lock acquisition, so it should be OK. If we + * haven't acquired a lock preventing against further relevant + * invalidations, any such occurrence is not much different than if the + * invalidation had arrived slightly later in the first place. + */ + if (!stateP->hasMessages) + return 0; + + LWLockAcquire(SInvalReadLock, LW_SHARED); + + /* + * We must reset hasMessages before determining how many messages we're + * going to read. 
That way, if new messages arrive after we have + * determined how many we're reading, the flag will get reset and we'll + * notice those messages part-way through. + * + * Note that, if we don't end up reading all of the messages, we had + * better be certain to reset this flag before exiting! + */ + stateP->hasMessages = false; + + /* Fetch current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + max = segP->maxMsgNum; + SpinLockRelease(&segP->msgnumLock); + + if (stateP->resetState) + { + /* + * Force reset. We can say we have dealt with any messages added + * since the reset, as well; and that means we should clear the + * signaled flag, too. + */ + stateP->nextMsgNum = max; + stateP->resetState = false; + stateP->signaled = false; + LWLockRelease(SInvalReadLock); + return -1; + } + + /* + * Retrieve messages and advance backend's counter, until data array is + * full or there are no more messages. + * + * There may be other backends that haven't read the message(s), so we + * cannot delete them here. SICleanupQueue() will eventually remove them + * from the queue. + */ + n = 0; + while (n < datasize && stateP->nextMsgNum < max) + { + data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES]; + stateP->nextMsgNum++; + } + + /* + * If we have caught up completely, reset our "signaled" flag so that + * we'll get another signal if we fall behind again. + * + * If we haven't caught up completely, reset the hasMessages flag so that + * we see the remaining messages next time. + */ + if (stateP->nextMsgNum >= max) + stateP->signaled = false; + else + stateP->hasMessages = true; + + LWLockRelease(SInvalReadLock); + return n; +} + +/* + * SICleanupQueue + * Remove messages that have been consumed by all active backends + * + * callerHasWriteLock is true if caller is holding SInvalWriteLock. + * minFree is the minimum number of message slots to make free. + * + * Possible side effects of this routine include marking one or more + * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT + * to some backend that seems to be getting too far behind. We signal at + * most one backend at a time, for reasons explained at the top of the file. + * + * Caution: because we transiently release write lock when we have to signal + * some other backend, it is NOT guaranteed that there are still minFree + * free message slots at exit. Caller must recheck and perhaps retry. + */ +void +SICleanupQueue(bool callerHasWriteLock, int minFree) +{ + SISeg *segP = shmInvalBuffer; + int min, + minsig, + lowbound, + numMsgs, + i; + ProcState *needSig = NULL; + + /* Lock out all writers and readers */ + if (!callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE); + + /* + * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the + * furthest-back backend that needs signaling (if any), and reset any + * backends that are too far back. Note that because we ignore sendOnly + * backends here it is possible for them to keep sending messages without + * a problem even when they are the only active backend. 
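The queue protocol above is easier to see in a minimal stand-alone sketch, with invented names (Queue, Reader, queue_append, queue_read) and none of the real locking: message numbers grow monotonically, slots are addressed modulo the array size, each reader keeps its own cursor plus a hasMessages hint, and a reader that falls too far behind is forced into a reset instead of blocking the writer. This is an illustration of the idea only, not the code added by this patch.

#include <stdbool.h>
#include <stdio.h>

#define RINGSIZE 8                      /* stand-in for MAXNUMMESSAGES */
#define NREADERS 2

typedef struct Reader
{
    int  nextMsgNum;                    /* next message number to consume */
    bool hasMessages;                   /* cheap "maybe something to read" hint */
    bool resetState;                    /* fell too far behind; must rebuild */
} Reader;

typedef struct Queue
{
    int    minMsgNum;                   /* oldest message anyone still needs */
    int    maxMsgNum;                   /* next message number to assign */
    int    buffer[RINGSIZE];
    Reader reader[NREADERS];
} Queue;

/* Append one message; if the ring is full, push lagging readers into reset. */
static void
queue_append(Queue *q, int msg)
{
    if (q->maxMsgNum - q->minMsgNum >= RINGSIZE)
    {
        /* crude stand-in for SICleanupQueue(): free exactly one slot */
        int lowbound = q->maxMsgNum - RINGSIZE + 1;

        for (int i = 0; i < NREADERS; i++)
            if (q->reader[i].nextMsgNum < lowbound)
                q->reader[i].resetState = true;
        q->minMsgNum = lowbound;
    }

    q->buffer[q->maxMsgNum % RINGSIZE] = msg;
    q->maxMsgNum++;

    for (int i = 0; i < NREADERS; i++)
        q->reader[i].hasMessages = true;
}

/* Drain what we can for one reader; -1 reports a forced reset. */
static int
queue_read(Queue *q, int id, int *out, int outsize)
{
    Reader *r = &q->reader[id];
    int     n = 0;

    if (!r->hasMessages)
        return 0;
    r->hasMessages = false;

    if (r->resetState)
    {
        r->nextMsgNum = q->maxMsgNum;   /* caught up by definition */
        r->resetState = false;
        return -1;
    }

    while (n < outsize && r->nextMsgNum < q->maxMsgNum)
    {
        out[n++] = q->buffer[r->nextMsgNum % RINGSIZE];
        r->nextMsgNum++;
    }
    if (r->nextMsgNum < q->maxMsgNum)
        r->hasMessages = true;          /* more left for the next call */
    return n;
}

int
main(void)
{
    Queue q = {0};
    int   got[4];

    for (int m = 1; m <= 12; m++)       /* overflow the ring on purpose */
        queue_append(&q, m);
    printf("reader 0: %d\n", queue_read(&q, 0, got, 4));   /* -1: was reset */
    printf("reader 0: %d\n", queue_read(&q, 0, got, 4));   /* 0: nothing new */
    return 0;
}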
+ */ + min = segP->maxMsgNum; + minsig = min - SIG_THRESHOLD; + lowbound = min - MAXNUMMESSAGES + minFree; + + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + int n = stateP->nextMsgNum; + + /* Ignore if inactive or already in reset state */ + if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly) + continue; + + /* + * If we must free some space and this backend is preventing it, force + * him into reset state and then ignore until he catches up. + */ + if (n < lowbound) + { + stateP->resetState = true; + /* no point in signaling him ... */ + continue; + } + + /* Track the global minimum nextMsgNum */ + if (n < min) + min = n; + + /* Also see who's furthest back of the unsignaled backends */ + if (n < minsig && !stateP->signaled) + { + minsig = n; + needSig = stateP; + } + } + segP->minMsgNum = min; + + /* + * When minMsgNum gets really large, decrement all message counters so as + * to forestall overflow of the counters. This happens seldom enough that + * folding it into the previous loop would be a loser. + */ + if (min >= MSGNUMWRAPAROUND) + { + segP->minMsgNum -= MSGNUMWRAPAROUND; + segP->maxMsgNum -= MSGNUMWRAPAROUND; + for (i = 0; i < segP->lastBackend; i++) + { + /* we don't bother skipping inactive entries here */ + segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND; + } + } + + /* + * Determine how many messages are still in the queue, and set the + * threshold at which we should repeat SICleanupQueue(). + */ + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs < CLEANUP_MIN) + segP->nextThreshold = CLEANUP_MIN; + else + segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM; + + /* + * Lastly, signal anyone who needs a catchup interrupt. Since + * SendProcSignal() might not be fast, we don't want to hold locks while + * executing it. + */ + if (needSig) + { + pid_t his_pid = needSig->procPid; + BackendId his_backendId = (needSig - &segP->procState[0]) + 1; + + needSig->signaled = true; + LWLockRelease(SInvalReadLock); + LWLockRelease(SInvalWriteLock); + elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid); + SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId); + if (callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + } + else + { + LWLockRelease(SInvalReadLock); + if (!callerHasWriteLock) + LWLockRelease(SInvalWriteLock); + } +} + + +/* + * GetNextLocalTransactionId --- allocate a new LocalTransactionId + * + * We split VirtualTransactionIds into two parts so that it is possible + * to allocate a new one without any contention for shared memory, except + * for a bit of additional overhead during backend startup/shutdown. + * The high-order part of a VirtualTransactionId is a BackendId, and the + * low-order part is a LocalTransactionId, which we assign from a local + * counter. To avoid the risk of a VirtualTransactionId being reused + * within a short interval, successive procs occupying the same backend ID + * slot should use a consecutive sequence of local IDs, which is implemented + * by copying nextLocalTransactionId as seen above. 
+ */ +LocalTransactionId +GetNextLocalTransactionId(void) +{ + LocalTransactionId result; + + /* loop to avoid returning InvalidLocalTransactionId at wraparound */ + do + { + result = nextLocalTransactionId++; + } while (!LocalTransactionIdIsValid(result)); + + return result; +} diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c new file mode 100644 index 0000000..4c06741 --- /dev/null +++ b/src/backend/storage/ipc/standby.c @@ -0,0 +1,1518 @@ +/*------------------------------------------------------------------------- + * + * standby.c + * Misc functions used in Hot Standby mode. + * + * All functions for handling RM_STANDBY_ID, which relate to + * AccessExclusiveLocks and starting snapshots for Hot Standby mode. + * Plus conflict recovery processing. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/standby.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "access/xlogrecovery.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/slot.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/standby.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* User-settable GUC parameters */ +int max_standby_archive_delay = 30 * 1000; +int max_standby_streaming_delay = 30 * 1000; +bool log_recovery_conflict_waits = false; + +/* + * Keep track of all the exclusive locks owned by original transactions. + * For each known exclusive lock, there is a RecoveryLockEntry in the + * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a + * given XID are chained together so that we can find them easily. + * For each original transaction that is known to have any such locks, + * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table, + * which stores the head of the chain of its locks. 
+ */ +typedef struct RecoveryLockEntry +{ + xl_standby_lock key; /* hash key: xid, dbOid, relOid */ + struct RecoveryLockEntry *next; /* chain link */ +} RecoveryLockEntry; + +typedef struct RecoveryLockXidEntry +{ + TransactionId xid; /* hash key -- must be first */ + struct RecoveryLockEntry *head; /* chain head */ +} RecoveryLockXidEntry; + +static HTAB *RecoveryLockHash = NULL; +static HTAB *RecoveryLockXidHash = NULL; + +/* Flags set by timeout handlers */ +static volatile sig_atomic_t got_standby_deadlock_timeout = false; +static volatile sig_atomic_t got_standby_delay_timeout = false; +static volatile sig_atomic_t got_standby_lock_timeout = false; + +static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, + uint32 wait_event_info, + bool report_waiting); +static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); +static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); +static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); +static const char *get_recovery_conflict_desc(ProcSignalReason reason); + +/* + * InitRecoveryTransactionEnvironment + * Initialize tracking of our primary's in-progress transactions. + * + * We need to issue shared invalidations and hold locks. Holding locks + * means others may want to wait on us, so we need to make a lock table + * vxact entry like a real transaction. We could create and delete + * lock table entries for each transaction but its simpler just to create + * one permanent entry and leave it there all the time. Locks are then + * acquired and released as needed. Yes, this means you can see the + * Startup process in pg_locks once we have run this. + */ +void +InitRecoveryTransactionEnvironment(void) +{ + VirtualTransactionId vxid; + HASHCTL hash_ctl; + + Assert(RecoveryLockHash == NULL); /* don't run this twice */ + + /* + * Initialize the hash tables for tracking the locks held by each + * transaction. + */ + hash_ctl.keysize = sizeof(xl_standby_lock); + hash_ctl.entrysize = sizeof(RecoveryLockEntry); + RecoveryLockHash = hash_create("RecoveryLockHash", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RecoveryLockXidEntry); + RecoveryLockXidHash = hash_create("RecoveryLockXidHash", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize shared invalidation management for Startup process, being + * careful to register ourselves as a sendOnly process so we don't need to + * read messages, nor will we get signaled when the queue starts filling + * up. + */ + SharedInvalBackendInit(true); + + /* + * Lock a virtual transaction id for Startup process. + * + * We need to do GetNextLocalTransactionId() because + * SharedInvalBackendInit() leaves localTransactionId invalid and the lock + * manager doesn't like that at all. + * + * Note that we don't need to run XactLockTableInsert() because nobody + * needs to wait on xids. That sounds a little strange, but table locks + * are held by vxids and row level locks are held by xids. All queries + * hold AccessShareLocks so never block while we write or lock new rows. + */ + vxid.backendId = MyBackendId; + vxid.localTransactionId = GetNextLocalTransactionId(); + VirtualXactLockTableInsert(vxid); + + standbyState = STANDBY_INITIALIZED; +} + +/* + * ShutdownRecoveryTransactionEnvironment + * Shut down transaction tracking + * + * Prepare to switch from hot standby mode to normal operation. 
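The two-table scheme described above can be pictured with ordinary linked structures. Below is a minimal stand-alone sketch under invented names (LockEnt, XidEnt, track_lock, release_xid): per-lock entries are de-duplicated within each xid on (dbOid, relOid) and chained off a per-xid head, so releasing a transaction's locks is one chain walk. Plain lists stand in for the dynahash tables, and no lock-manager calls are made.

#include <stdio.h>
#include <stdlib.h>

typedef unsigned int Oid;
typedef unsigned int TransactionId;

/* One tracked AccessExclusiveLock (compare RecoveryLockEntry). */
typedef struct LockEnt
{
    Oid            dbOid;
    Oid            relOid;
    struct LockEnt *next;               /* next lock held by the same xid */
} LockEnt;

/* Per-xid chain head (compare RecoveryLockXidEntry). */
typedef struct XidEnt
{
    TransactionId  xid;
    LockEnt       *head;
    struct XidEnt *next;
} XidEnt;

static XidEnt *tracked = NULL;

/* Record one lock, ignoring duplicates (checkpoints re-report known locks). */
static void
track_lock(TransactionId xid, Oid dbOid, Oid relOid)
{
    XidEnt *x;
    LockEnt *l;

    for (x = tracked; x && x->xid != xid; x = x->next)
        ;
    if (x == NULL)
    {
        x = calloc(1, sizeof(XidEnt));
        x->xid = xid;
        x->next = tracked;
        tracked = x;
    }
    for (l = x->head; l; l = l->next)
        if (l->dbOid == dbOid && l->relOid == relOid)
            return;                     /* de-duplicated */

    l = calloc(1, sizeof(LockEnt));
    l->dbOid = dbOid;
    l->relOid = relOid;
    l->next = x->head;
    x->head = l;                        /* the real code also calls LockAcquire() */
}

/* Release every lock recorded for one xid: a single chain walk. */
static void
release_xid(TransactionId xid)
{
    for (XidEnt **xp = &tracked; *xp; xp = &(*xp)->next)
    {
        if ((*xp)->xid != xid)
            continue;

        XidEnt *x = *xp;

        for (LockEnt *l = x->head, *nxt; l; l = nxt)
        {
            nxt = l->next;
            printf("release xid %u db %u rel %u\n", xid, l->dbOid, l->relOid);
            free(l);                    /* the real code also calls LockRelease() */
        }
        *xp = x->next;
        free(x);
        return;
    }
}

int
main(void)
{
    track_lock(100, 5, 42);
    track_lock(100, 5, 42);             /* duplicate from a checkpoint re-report */
    track_lock(100, 5, 43);
    release_xid(100);
    return 0;
}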
Shut down + * recovery-time transaction tracking. + * + * This must be called even in shutdown of startup process if transaction + * tracking has been initialized. Otherwise some locks the tracked + * transactions were holding will not be released and may interfere with + * the processes still running (but will exit soon later) at the exit of + * startup process. + */ +void +ShutdownRecoveryTransactionEnvironment(void) +{ + /* + * Do nothing if RecoveryLockHash is NULL because that means that + * transaction tracking has not yet been initialized or has already been + * shut down. This makes it safe to have possibly-redundant calls of this + * function during process exit. + */ + if (RecoveryLockHash == NULL) + return; + + /* Mark all tracked in-progress transactions as finished. */ + ExpireAllKnownAssignedTransactionIds(); + + /* Release all locks the tracked transactions were holding */ + StandbyReleaseAllLocks(); + + /* Destroy the lock hash tables. */ + hash_destroy(RecoveryLockHash); + hash_destroy(RecoveryLockXidHash); + RecoveryLockHash = NULL; + RecoveryLockXidHash = NULL; + + /* Cleanup our VirtualTransaction */ + VirtualXactLockTableCleanup(); +} + + +/* + * ----------------------------------------------------- + * Standby wait timers and backend cancel logic + * ----------------------------------------------------- + */ + +/* + * Determine the cutoff time at which we want to start canceling conflicting + * transactions. Returns zero (a time safely in the past) if we are willing + * to wait forever. + */ +static TimestampTz +GetStandbyLimitTime(void) +{ + TimestampTz rtime; + bool fromStream; + + /* + * The cutoff time is the last WAL data receipt time plus the appropriate + * delay variable. Delay of -1 means wait forever. + */ + GetXLogReceiptTime(&rtime, &fromStream); + if (fromStream) + { + if (max_standby_streaming_delay < 0) + return 0; /* wait forever */ + return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay); + } + else + { + if (max_standby_archive_delay < 0) + return 0; /* wait forever */ + return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay); + } +} + +#define STANDBY_INITIAL_WAIT_US 1000 +static int standbyWait_us = STANDBY_INITIAL_WAIT_US; + +/* + * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs. + * We wait here for a while then return. If we decide we can't wait any + * more then we return true, if we can wait some more return false. + */ +static bool +WaitExceedsMaxStandbyDelay(uint32 wait_event_info) +{ + TimestampTz ltime; + + CHECK_FOR_INTERRUPTS(); + + /* Are we past the limit time? */ + ltime = GetStandbyLimitTime(); + if (ltime && GetCurrentTimestamp() >= ltime) + return true; + + /* + * Sleep a bit (this is essential to avoid busy-waiting). + */ + pgstat_report_wait_start(wait_event_info); + pg_usleep(standbyWait_us); + pgstat_report_wait_end(); + + /* + * Progressively increase the sleep times, but not to more than 1s, since + * pg_usleep isn't interruptible on some platforms. + */ + standbyWait_us *= 2; + if (standbyWait_us > 1000000) + standbyWait_us = 1000000; + + return false; +} + +/* + * Log the recovery conflict. + * + * wait_start is the timestamp when the caller started to wait. + * now is the timestamp when this function has been called. + * wait_list is the list of virtual transaction ids assigned to + * conflicting processes. still_waiting indicates whether + * the startup process is still waiting for the recovery conflict + * to be resolved or not. 
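The wait policy used by WaitExceedsMaxStandbyDelay() (start at 1ms, double each round, cap at 1s, give up once the cutoff passes) is easy to exercise outside the server. A stand-alone sketch follows, with invented names and clock_gettime()/usleep() standing in for the backend's timestamp and wait primitives; a limit of 0 means "wait forever", as with GetStandbyLimitTime().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static int64_t
now_us(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (int64_t) ts.tv_sec * 1000000 + ts.tv_nsec / 1000;
}

/*
 * Return true once the limit has been exceeded, sleeping a little on each
 * call.  limit_us == 0 means "wait forever".
 */
static bool
wait_exceeds_limit(int64_t limit_us, int *sleep_us)
{
    if (limit_us != 0 && now_us() >= limit_us)
        return true;

    usleep(*sleep_us);                  /* essential to avoid busy-waiting */

    /* progressively back off, but never sleep longer than 1s at a time */
    *sleep_us *= 2;
    if (*sleep_us > 1000000)
        *sleep_us = 1000000;
    return false;
}

int
main(void)
{
    int64_t limit = now_us() + 50 * 1000;   /* pretend the delay GUC is 50ms */
    int     sleep_us = 1000;                /* like STANDBY_INITIAL_WAIT_US */
    int     rounds = 0;

    while (!wait_exceeds_limit(limit, &sleep_us))
        rounds++;
    printf("gave up after %d sleep rounds\n", rounds);
    return 0;
}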
+ */ +void +LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, + TimestampTz now, VirtualTransactionId *wait_list, + bool still_waiting) +{ + long secs; + int usecs; + long msecs; + StringInfoData buf; + int nprocs = 0; + + /* + * There must be no conflicting processes when the recovery conflict has + * already been resolved. + */ + Assert(still_waiting || wait_list == NULL); + + TimestampDifference(wait_start, now, &secs, &usecs); + msecs = secs * 1000 + usecs / 1000; + usecs = usecs % 1000; + + if (wait_list) + { + VirtualTransactionId *vxids; + + /* Construct a string of list of the conflicting processes */ + vxids = wait_list; + while (VirtualTransactionIdIsValid(*vxids)) + { + PGPROC *proc = BackendIdGetProc(vxids->backendId); + + /* proc can be NULL if the target backend is not active */ + if (proc) + { + if (nprocs == 0) + { + initStringInfo(&buf); + appendStringInfo(&buf, "%d", proc->pid); + } + else + appendStringInfo(&buf, ", %d", proc->pid); + + nprocs++; + } + + vxids++; + } + } + + /* + * If wait_list is specified, report the list of PIDs of active + * conflicting backends in a detail message. Note that if all the backends + * in the list are not active, no detail message is logged. + */ + if (still_waiting) + { + ereport(LOG, + errmsg("recovery still waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason)), + nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.", + "Conflicting processes: %s.", + nprocs, buf.data) : 0); + } + else + { + ereport(LOG, + errmsg("recovery finished waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason))); + } + + if (nprocs > 0) + pfree(buf.data); +} + +/* + * This is the main executioner for any query backend that conflicts with + * recovery processing. Judgement has already been passed on it within + * a specific rmgr. Here we just issue the orders to the procs. The procs + * then throw the required error as instructed. + * + * If report_waiting is true, "waiting" is reported in PS display and the + * wait for recovery conflict is reported in the log, if necessary. If + * the caller is responsible for reporting them, report_waiting should be + * false. Otherwise, both the caller and this function report the same + * thing unexpectedly. + */ +static void +ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, uint32 wait_event_info, + bool report_waiting) +{ + TimestampTz waitStart = 0; + bool waiting = false; + bool logged_recovery_conflict = false; + + /* Fast exit, to avoid a kernel call if there's no work to be done. */ + if (!VirtualTransactionIdIsValid(*waitlist)) + return; + + /* Set the wait start timestamp for reporting */ + if (report_waiting && (log_recovery_conflict_waits || update_process_title)) + waitStart = GetCurrentTimestamp(); + + while (VirtualTransactionIdIsValid(*waitlist)) + { + /* reset standbyWait_us for each xact we wait for */ + standbyWait_us = STANDBY_INITIAL_WAIT_US; + + /* wait until the virtual xid is gone */ + while (!VirtualXactLock(*waitlist, false)) + { + /* Is it time to kill it? */ + if (WaitExceedsMaxStandbyDelay(wait_event_info)) + { + pid_t pid; + + /* + * Now find out who to throw out of the balloon. + */ + Assert(VirtualTransactionIdIsValid(*waitlist)); + pid = CancelVirtualTransaction(*waitlist, reason); + + /* + * Wait a little bit for it to die so that we avoid flooding + * an unresponsive backend when system is heavily loaded. 
+ */ + if (pid != 0) + pg_usleep(5000L); + } + + if (waitStart != 0 && (!logged_recovery_conflict || !waiting)) + { + TimestampTz now = 0; + bool maybe_log_conflict; + bool maybe_update_title; + + maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict); + maybe_update_title = (update_process_title && !waiting); + + /* Get the current timestamp if not report yet */ + if (maybe_log_conflict || maybe_update_title) + now = GetCurrentTimestamp(); + + /* + * Report via ps if we have been waiting for more than 500 + * msec (should that be configurable?) + */ + if (maybe_update_title && + TimestampDifferenceExceeds(waitStart, now, 500)) + { + set_ps_display_suffix("waiting"); + waiting = true; + } + + /* + * Emit the log message if the startup process is waiting + * longer than deadlock_timeout for recovery conflict. + */ + if (maybe_log_conflict && + TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) + { + LogRecoveryConflict(reason, waitStart, now, waitlist, true); + logged_recovery_conflict = true; + } + } + } + + /* The virtual transaction is gone now, wait for the next one */ + waitlist++; + } + + /* + * Emit the log message if recovery conflict was resolved but the startup + * process waited longer than deadlock_timeout for it. + */ + if (logged_recovery_conflict) + LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(), + NULL, false); + + /* reset ps display to remove the suffix if we added one */ + if (waiting) + set_ps_display_remove_suffix(); + +} + +/* + * Generate whatever recovery conflicts are needed to eliminate snapshots that + * might see XIDs <= snapshotConflictHorizon as still running. + * + * snapshotConflictHorizon cutoffs are our standard approach to generating + * granular recovery conflicts. Note that InvalidTransactionId values are + * interpreted as "definitely don't need any conflicts" here, which is a + * general convention that WAL records can (and often do) depend on. + */ +void +ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon, + bool isCatalogRel, + RelFileLocator locator) +{ + VirtualTransactionId *backends; + + /* + * If we get passed InvalidTransactionId then we do nothing (no conflict). + * + * This can happen when replaying already-applied WAL records after a + * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE + * record that marks as frozen a page which was already all-visible. It's + * also quite common with records generated during index deletion + * (original execution of the deletion can reason that a recovery conflict + * which is sufficient for the deletion operation must take place before + * replay of the deletion record itself). + */ + if (!TransactionIdIsValid(snapshotConflictHorizon)) + return; + + Assert(TransactionIdIsNormal(snapshotConflictHorizon)); + backends = GetConflictingVirtualXIDs(snapshotConflictHorizon, + locator.dbOid); + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, + true); + + /* + * Note that WaitExceedsMaxStandbyDelay() is not taken into account here + * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That + * seems OK, given that this kind of conflict should not normally be + * reached, e.g. due to using a physical replication slot. 
+ */ + if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel) + InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid, + snapshotConflictHorizon); +} + +/* + * Variant of ResolveRecoveryConflictWithSnapshot that works with + * FullTransactionId values + */ +void +ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon, + bool isCatalogRel, + RelFileLocator locator) +{ + /* + * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, + * so truncate the logged FullTransactionId. If the logged value is very + * old, so that XID wrap-around already happened on it, there can't be any + * snapshots that still see it. + */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(snapshotConflictHorizon); + if (diff < MaxTransactionId / 2) + { + TransactionId truncated; + + truncated = XidFromFullTransactionId(snapshotConflictHorizon); + ResolveRecoveryConflictWithSnapshot(truncated, + isCatalogRel, + locator); + } +} + +void +ResolveRecoveryConflictWithTablespace(Oid tsid) +{ + VirtualTransactionId *temp_file_users; + + /* + * Standby users may be currently using this tablespace for their + * temporary files. We only care about current users because + * temp_tablespace parameter will just ignore tablespaces that no longer + * exist. + * + * Ask everybody to cancel their queries immediately so we can ensure no + * temp files remain and we can remove the tablespace. Nuke the entire + * site from orbit, it's the only way to be sure. + * + * XXX: We could work out the pids of active backends using this + * tablespace by examining the temp filenames in the directory. We would + * then convert the pids into VirtualXIDs before attempting to cancel + * them. + * + * We don't wait for commit because drop tablespace is non-transactional. + */ + temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, + InvalidOid); + ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, + PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, + true); +} + +void +ResolveRecoveryConflictWithDatabase(Oid dbid) +{ + /* + * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that + * only waits for transactions and completely idle sessions would block + * us. This is rare enough that we do this as simply as possible: no wait, + * just force them off immediately. + * + * No locking is required here because we already acquired + * AccessExclusiveLock. Anybody trying to connect while we do this will + * block during InitPostgres() and then disconnect when they see the + * database has been removed. + */ + while (CountDBBackends(dbid) > 0) + { + CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); + + /* + * Wait awhile for them to die so that we avoid flooding an + * unresponsive backend when system is heavily loaded. + */ + pg_usleep(10000); + } +} + +/* + * ResolveRecoveryConflictWithLock is called from ProcSleep() + * to resolve conflicts with other backends holding relation locks. + * + * The WaitLatch sleep normally done in ProcSleep() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by canceling to all backends holding a conflicting + * lock. As we are already queued to be granted the lock, no new lock + * requests conflicting with ours will be granted in the meantime. 
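The epoch check in ResolveRecoveryConflictWithSnapshotFullXid() reduces to unsigned subtraction in the 64-bit xid space: the horizon's low 32 bits are only usable while the horizon lies within half the 32-bit xid space of the next full xid. A stand-alone sketch with invented names (horizon_is_truncatable, MAX_TRANSACTION_ID standing in for MaxTransactionId):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_TRANSACTION_ID  0xFFFFFFFFu     /* 32-bit xid space */

/*
 * True if full_horizon is recent enough that its low 32 bits are still
 * unambiguous relative to full_next_xid.
 */
static bool
horizon_is_truncatable(uint64_t full_horizon, uint64_t full_next_xid)
{
    uint64_t diff = full_next_xid - full_horizon;

    return diff < MAX_TRANSACTION_ID / 2;
}

int
main(void)
{
    uint64_t next = (3ULL << 32) + 1000;        /* epoch 3, xid 1000 */
    uint64_t recent = next - 500;
    uint64_t ancient = next - (1ULL << 33);     /* two epochs ago */

    printf("recent horizon truncatable:  %d (low bits %u)\n",
           horizon_is_truncatable(recent, next), (uint32_t) recent);
    printf("ancient horizon truncatable: %d\n",
           horizon_is_truncatable(ancient, next));
    return 0;
}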
+ * + * We also must check for deadlocks involving the Startup process and + * hot-standby backend processes. If deadlock_timeout is reached in + * this function, all the backends holding the conflicting locks are + * requested to check themselves for deadlocks. + * + * logging_conflict should be true if the recovery conflict has not been + * logged yet even though logging is enabled. After deadlock_timeout is + * reached and the request for deadlock check is sent, we wait again to + * be signaled by the release of the lock if logging_conflict is false. + * Otherwise we return without waiting again so that the caller can report + * the recovery conflict. In this case, then, this function is called again + * with logging_conflict=false (because the recovery conflict has already + * been logged) and we will wait again for the lock to be released. + */ +void +ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) +{ + TimestampTz ltime; + TimestampTz now; + + Assert(InHotStandby); + + ltime = GetStandbyLimitTime(); + now = GetCurrentTimestamp(); + + /* + * Update waitStart if first time through after the startup process + * started waiting for the lock. It should not be updated every time + * ResolveRecoveryConflictWithLock() is called during the wait. + * + * Use the current time obtained for comparison with ltime as waitStart + * (i.e., the time when this process started waiting for the lock). Since + * getting the current time newly can cause overhead, we reuse the + * already-obtained time to avoid that overhead. + * + * Note that waitStart is updated without holding the lock table's + * partition lock, to avoid the overhead by additional lock acquisition. + * This can cause "waitstart" in pg_locks to become NULL for a very short + * period of time after the wait started even though "granted" is false. + * This is OK in practice because we can assume that users are likely to + * look at "waitstart" when waiting for the lock for a long time. + */ + if (pg_atomic_read_u64(&MyProc->waitStart) == 0) + pg_atomic_write_u64(&MyProc->waitStart, now); + + if (now >= ltime && ltime != 0) + { + /* + * We're already behind, so clear a path as quickly as possible. + */ + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* + * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting + * "waiting" in PS display by disabling its argument report_waiting + * because the caller, WaitOnLock(), has already reported that. + */ + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_LOCK, + PG_WAIT_LOCK | locktag.locktag_type, + false); + } + else + { + /* + * Wait (or wait again) until ltime, and check for deadlocks as well + * if we will be waiting longer than deadlock_timeout + */ + EnableTimeoutParams timeouts[2]; + int cnt = 0; + + if (ltime != 0) + { + got_standby_lock_timeout = false; + timeouts[cnt].id = STANDBY_LOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AT; + timeouts[cnt].fin_time = ltime; + cnt++; + } + + got_standby_deadlock_timeout = false; + timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AFTER; + timeouts[cnt].delay_ms = DeadlockTimeout; + cnt++; + + enable_timeouts(timeouts, cnt); + } + + /* Wait to be signaled by the release of the Relation Lock */ + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + + /* + * Exit if ltime is reached. 
Then all the backends holding conflicting + * locks will be canceled in the next ResolveRecoveryConflictWithLock() + * call. + */ + if (got_standby_lock_timeout) + goto cleanup; + + if (got_standby_deadlock_timeout) + { + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* Quick exit if there's no work to be done */ + if (!VirtualTransactionIdIsValid(*backends)) + goto cleanup; + + /* + * Send signals to all the backends holding the conflicting locks, to + * ask them to check themselves for deadlocks. + */ + while (VirtualTransactionIdIsValid(*backends)) + { + SignalVirtualTransaction(*backends, + PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, + false); + backends++; + } + + /* + * Exit if the recovery conflict has not been logged yet even though + * logging is enabled, so that the caller can log that. Then + * RecoveryConflictWithLock() is called again and we will wait again + * for the lock to be released. + */ + if (logging_conflict) + goto cleanup; + + /* + * Wait again here to be signaled by the release of the Relation Lock, + * to prevent the subsequent RecoveryConflictWithLock() from causing + * deadlock_timeout and sending a request for deadlocks check again. + * Otherwise the request continues to be sent every deadlock_timeout + * until the relation locks are released or ltime is reached. + */ + got_standby_deadlock_timeout = false; + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + } + +cleanup: + + /* + * Clear any timeout requests established above. We assume here that the + * Startup process doesn't have any other outstanding timeouts than those + * used by this function. If that stops being true, we could cancel the + * timeouts individually, but that'd be slower. + */ + disable_all_timeouts(false); + got_standby_lock_timeout = false; + got_standby_deadlock_timeout = false; +} + +/* + * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() + * to resolve conflicts with other backends holding buffer pins. + * + * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by sending a PROCSIG signal to all backends to check if + * they hold one of the buffer pins that is blocking Startup process. If so, + * those backends will take an appropriate error action, ERROR or FATAL. + * + * We also must check for deadlocks. Deadlocks occur because if queries + * wait on a lock, that must be behind an AccessExclusiveLock, which can only + * be cleared if the Startup process replays a transaction completion record. + * If Startup process is also waiting then that is a deadlock. The deadlock + * can occur if the query is waiting and then the Startup sleeps, or if + * Startup is sleeping and the query waits on a lock. We protect against + * only the former sequence here, the latter sequence is checked prior to + * the query sleeping, in CheckRecoveryConflictDeadlock(). + * + * Deadlocks are extremely rare, and relatively expensive to check for, + * so we don't do a deadlock check right away ... only if we have had to wait + * at least deadlock_timeout. 
+ */ +void +ResolveRecoveryConflictWithBufferPin(void) +{ + TimestampTz ltime; + + Assert(InHotStandby); + + ltime = GetStandbyLimitTime(); + + if (GetCurrentTimestamp() >= ltime && ltime != 0) + { + /* + * We're already behind, so clear a path as quickly as possible. + */ + SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + } + else + { + /* + * Wake up at ltime, and check for deadlocks as well if we will be + * waiting longer than deadlock_timeout + */ + EnableTimeoutParams timeouts[2]; + int cnt = 0; + + if (ltime != 0) + { + timeouts[cnt].id = STANDBY_TIMEOUT; + timeouts[cnt].type = TMPARAM_AT; + timeouts[cnt].fin_time = ltime; + cnt++; + } + + got_standby_deadlock_timeout = false; + timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AFTER; + timeouts[cnt].delay_ms = DeadlockTimeout; + cnt++; + + enable_timeouts(timeouts, cnt); + } + + /* + * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted + * by one of the timeouts established above. + * + * We assume that only UnpinBuffer() and the timeout requests established + * above can wake us up here. WakeupRecovery() called by walreceiver or + * SIGHUP signal handler, etc cannot do that because it uses the different + * latch from that ProcWaitForSignal() waits on. + */ + ProcWaitForSignal(PG_WAIT_BUFFER_PIN); + + if (got_standby_delay_timeout) + SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + else if (got_standby_deadlock_timeout) + { + /* + * Send out a request for hot-standby backends to check themselves for + * deadlocks. + * + * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait + * to be signaled by UnpinBuffer() again and send a request for + * deadlocks check if deadlock_timeout happens. This causes the + * request to continue to be sent every deadlock_timeout until the + * buffer is unpinned or ltime is reached. This would increase the + * workload in the startup process and backends. In practice it may + * not be so harmful because the period that the buffer is kept pinned + * is basically no so long. But we should fix this? + */ + SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + } + + /* + * Clear any timeout requests established above. We assume here that the + * Startup process doesn't have any other timeouts than what this function + * uses. If that stops being true, we could cancel the timeouts + * individually, but that'd be slower. + */ + disable_all_timeouts(false); + got_standby_delay_timeout = false; + got_standby_deadlock_timeout = false; +} + +static void +SendRecoveryConflictWithBufferPin(ProcSignalReason reason) +{ + Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || + reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + /* + * We send signal to all backends to ask them if they are holding the + * buffer pin which is delaying the Startup process. We must not set the + * conflict flag yet, since most backends will be innocent. Let the + * SIGUSR1 handling in each backend decide their own fate. + */ + CancelDBBackends(InvalidOid, reason, false); +} + +/* + * In Hot Standby perform early deadlock detection. We abort the lock + * wait if we are about to sleep while holding the buffer pin that Startup + * process is waiting for. + * + * Note: this code is pessimistic, because there is no way for it to + * determine whether an actual deadlock condition is present: the lock we + * need to wait for might be unrelated to any held by the Startup process. 
+ * Sooner or later, this mechanism should get ripped out in favor of somehow + * accounting for buffer locks in DeadLockCheck(). However, errors here + * seem to be very low-probability in practice, so for now it's not worth + * the trouble. + */ +void +CheckRecoveryConflictDeadlock(void) +{ + Assert(!InRecovery); /* do not call in Startup process */ + + if (!HoldingBufferPinThatDelaysRecovery()) + return; + + /* + * Error message should match ProcessInterrupts() but we avoid calling + * that because we aren't handling an interrupt at this point. Note that + * we only cancel the current transaction here, so if we are in a + * subtransaction and the pin is held by a parent, then the Startup + * process will continue to wait even though we have avoided deadlock. + */ + ereport(ERROR, + (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), + errmsg("canceling statement due to conflict with recovery"), + errdetail("User transaction caused buffer deadlock with recovery."))); +} + + +/* -------------------------------- + * timeout handler routines + * -------------------------------- + */ + +/* + * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is + * exceeded. + */ +void +StandbyDeadLockHandler(void) +{ + got_standby_deadlock_timeout = true; +} + +/* + * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded. + */ +void +StandbyTimeoutHandler(void) +{ + got_standby_delay_timeout = true; +} + +/* + * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded. + */ +void +StandbyLockTimeoutHandler(void) +{ + got_standby_lock_timeout = true; +} + +/* + * ----------------------------------------------------- + * Locking in Recovery Mode + * ----------------------------------------------------- + * + * All locks are held by the Startup process using a single virtual + * transaction. This implementation is both simpler and in some senses, + * more correct. The locks held mean "some original transaction held + * this lock, so query access is not allowed at this time". So the Startup + * process is the proxy by which the original locks are implemented. + * + * We only keep track of AccessExclusiveLocks, which are only ever held by + * one transaction on one relation. + * + * We keep a table of known locks in the RecoveryLockHash hash table. + * The point of that table is to let us efficiently de-duplicate locks, + * which is important because checkpoints will re-report the same locks + * already held. There is also a RecoveryLockXidHash table with one entry + * per xid, which allows us to efficiently find all the locks held by a + * given original transaction. + * + * We use session locks rather than normal locks so we don't need + * ResourceOwners. + */ + + +void +StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) +{ + RecoveryLockXidEntry *xidentry; + RecoveryLockEntry *lockentry; + xl_standby_lock key; + LOCKTAG locktag; + bool found; + + /* Already processed? */ + if (!TransactionIdIsValid(xid) || + TransactionIdDidCommit(xid) || + TransactionIdDidAbort(xid)) + return; + + elog(trace_recovery(DEBUG4), + "adding recovery lock: db %u rel %u", dbOid, relOid); + + /* dbOid is InvalidOid when we are locking a shared relation. */ + Assert(OidIsValid(relOid)); + + /* Create a hash entry for this xid, if we don't have one already. 
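The got_standby_*_timeout flags above follow the usual rule that a timeout or signal handler only sets a sig_atomic_t flag and the waiting code acts on it later. A stand-alone sketch of that shape, using a POSIX interval timer and a short polling sleep in place of the backend's timeout machinery and latch wait; the names and the 100ms value are arbitrary.

#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

/* The handler only records the event; the wait loop decides what to do. */
static volatile sig_atomic_t got_timeout = false;

static void
timeout_handler(int signo)
{
    (void) signo;
    got_timeout = true;
}

int
main(void)
{
    struct sigaction sa;
    struct itimerval t = {0};

    sa.sa_handler = timeout_handler;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;
    sigaction(SIGALRM, &sa, NULL);

    /* one-shot 100ms timer, standing in for a deadlock-timeout request */
    t.it_value.tv_usec = 100 * 1000;
    setitimer(ITIMER_REAL, &t, NULL);

    /* "wait for the lock" until the flag appears */
    while (!got_timeout)
        usleep(10 * 1000);

    printf("timeout flag seen; would now ask lock holders to check for deadlocks\n");
    return 0;
}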
*/ + xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found); + if (!found) + { + Assert(xidentry->xid == xid); /* dynahash should have set this */ + xidentry->head = NULL; + } + + /* Create a hash entry for this lock, unless we have one already. */ + key.xid = xid; + key.dbOid = dbOid; + key.relOid = relOid; + lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found); + if (!found) + { + /* It's new, so link it into the XID's list ... */ + lockentry->next = xidentry->head; + xidentry->head = lockentry; + + /* ... and acquire the lock locally. */ + SET_LOCKTAG_RELATION(locktag, dbOid, relOid); + + (void) LockAcquire(&locktag, AccessExclusiveLock, true, false); + } +} + +/* + * Release all the locks associated with this RecoveryLockXidEntry. + */ +static void +StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry) +{ + RecoveryLockEntry *entry; + RecoveryLockEntry *next; + + for (entry = xidentry->head; entry != NULL; entry = next) + { + LOCKTAG locktag; + + elog(trace_recovery(DEBUG4), + "releasing recovery lock: xid %u db %u rel %u", + entry->key.xid, entry->key.dbOid, entry->key.relOid); + /* Release the lock ... */ + SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid); + if (!LockRelease(&locktag, AccessExclusiveLock, true)) + { + elog(LOG, + "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u", + entry->key.xid, entry->key.dbOid, entry->key.relOid); + Assert(false); + } + /* ... and remove the per-lock hash entry */ + next = entry->next; + hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL); + } + + xidentry->head = NULL; /* just for paranoia */ +} + +/* + * Release locks for specific XID, or all locks if it's InvalidXid. + */ +static void +StandbyReleaseLocks(TransactionId xid) +{ + RecoveryLockXidEntry *entry; + + if (TransactionIdIsValid(xid)) + { + if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL))) + { + StandbyReleaseXidEntryLocks(entry); + hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); + } + } + else + StandbyReleaseAllLocks(); +} + +/* + * Release locks for a transaction tree, starting at xid down, from + * RecoveryLockXidHash. + * + * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode, + * to remove any AccessExclusiveLocks requested by a transaction. + */ +void +StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids) +{ + int i; + + StandbyReleaseLocks(xid); + + for (i = 0; i < nsubxids; i++) + StandbyReleaseLocks(subxids[i]); +} + +/* + * Called at end of recovery and when we see a shutdown checkpoint. + */ +void +StandbyReleaseAllLocks(void) +{ + HASH_SEQ_STATUS status; + RecoveryLockXidEntry *entry; + + elog(trace_recovery(DEBUG2), "release all standby locks"); + + hash_seq_init(&status, RecoveryLockXidHash); + while ((entry = hash_seq_search(&status))) + { + StandbyReleaseXidEntryLocks(entry); + hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); + } +} + +/* + * StandbyReleaseOldLocks + * Release standby locks held by top-level XIDs that aren't running, + * as long as they're not prepared transactions. + */ +void +StandbyReleaseOldLocks(TransactionId oldxid) +{ + HASH_SEQ_STATUS status; + RecoveryLockXidEntry *entry; + + hash_seq_init(&status, RecoveryLockXidHash); + while ((entry = hash_seq_search(&status))) + { + Assert(TransactionIdIsValid(entry->xid)); + + /* Skip if prepared transaction. */ + if (StandbyTransactionIdIsPrepared(entry->xid)) + continue; + + /* Skip if >= oldxid. 
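StandbyReleaseOldLocks() leans on TransactionIdPrecedes() rather than a plain "<", because 32-bit xids wrap around. A stand-alone sketch of that style of comparison (signed difference of the unsigned values), under an invented name and ignoring the special handling of permanent xids:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Wraparound-aware "xid1 logically precedes xid2": compare the signed
 * 32-bit difference instead of the raw values.
 */
static bool
xid_precedes(uint32_t xid1, uint32_t xid2)
{
    int32_t diff = (int32_t) (xid1 - xid2);

    return diff < 0;
}

int
main(void)
{
    uint32_t oldxid = 100;                  /* oldest xid still running */
    uint32_t before_wrap = 0xFFFFFF00u;     /* allocated just before wraparound */

    /* Numerically larger, but logically older than oldxid. */
    printf("0x%08X precedes %u: %d\n", before_wrap, oldxid,
           xid_precedes(before_wrap, oldxid));
    printf("%u precedes 0x%08X: %d\n", oldxid, before_wrap,
           xid_precedes(oldxid, before_wrap));
    return 0;
}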
*/ + if (!TransactionIdPrecedes(entry->xid, oldxid)) + continue; + + /* Remove all locks and hash table entry. */ + StandbyReleaseXidEntryLocks(entry); + hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL); + } +} + +/* + * -------------------------------------------------------------------- + * Recovery handling for Rmgr RM_STANDBY_ID + * + * These record types will only be created if XLogStandbyInfoActive() + * -------------------------------------------------------------------- + */ + +void +standby_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in standby records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + /* Do nothing if we're not in hot standby mode */ + if (standbyState == STANDBY_DISABLED) + return; + + if (info == XLOG_STANDBY_LOCK) + { + xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record); + int i; + + for (i = 0; i < xlrec->nlocks; i++) + StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid, + xlrec->locks[i].dbOid, + xlrec->locks[i].relOid); + } + else if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); + RunningTransactionsData running; + + running.xcnt = xlrec->xcnt; + running.subxcnt = xlrec->subxcnt; + running.subxid_overflow = xlrec->subxid_overflow; + running.nextXid = xlrec->nextXid; + running.latestCompletedXid = xlrec->latestCompletedXid; + running.oldestRunningXid = xlrec->oldestRunningXid; + running.xids = xlrec->xids; + + ProcArrayApplyRecoveryInfo(&running); + + /* + * The startup process currently has no convenient way to schedule + * stats to be reported. XLOG_RUNNING_XACTS records issued at a + * regular cadence, making this a convenient location to report stats. + * While these records aren't generated with wal_level=minimal, stats + * also cannot be accessed during WAL replay. + */ + pgstat_report_stat(true); + } + else if (info == XLOG_INVALIDATIONS) + { + xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record); + + ProcessCommittedInvalidationMessages(xlrec->msgs, + xlrec->nmsgs, + xlrec->relcacheInitFileInval, + xlrec->dbId, + xlrec->tsId); + } + else + elog(PANIC, "standby_redo: unknown op code %u", info); +} + +/* + * Log details of the current snapshot to WAL. This allows the snapshot state + * to be reconstructed on the standby and for logical decoding. + * + * This is used for Hot Standby as follows: + * + * We can move directly to STANDBY_SNAPSHOT_READY at startup if we + * start from a shutdown checkpoint because we know nothing was running + * at that time and our recovery snapshot is known empty. In the more + * typical case of an online checkpoint we need to jump through a few + * hoops to get a correct recovery snapshot and this requires a two or + * sometimes a three stage process. + * + * The initial snapshot must contain all running xids and all current + * AccessExclusiveLocks at a point in time on the standby. Assembling + * that information while the server is running requires many and + * various LWLocks, so we choose to derive that information piece by + * piece and then re-assemble that info on the standby. When that + * information is fully assembled we move to STANDBY_SNAPSHOT_READY. + * + * Since locking on the primary when we derive the information is not + * strict, we note that there is a time window between the derivation and + * writing to WAL of the derived information. 
That allows race conditions + * that we must resolve, since xids and locks may enter or leave the + * snapshot during that window. This creates the issue that an xid or + * lock may start *after* the snapshot has been derived yet *before* the + * snapshot is logged in the running xacts WAL record. We resolve this by + * starting to accumulate changes at a point just prior to when we derive + * the snapshot on the primary, then ignore duplicates when we later apply + * the snapshot from the running xacts record. This is implemented during + * CreateCheckPoint() where we use the logical checkpoint location as + * our starting point and then write the running xacts record immediately + * before writing the main checkpoint WAL record. Since we always start + * up from a checkpoint and are immediately at our starting point, we + * unconditionally move to STANDBY_INITIALIZED. After this point we + * must do 4 things: + * * move shared nextXid forwards as we see new xids + * * extend the clog and subtrans with each new xid + * * keep track of uncommitted known assigned xids + * * keep track of uncommitted AccessExclusiveLocks + * + * When we see a commit/abort we must remove known assigned xids and locks + * from the completing transaction. Attempted removals that cannot locate + * an entry are expected and must not cause an error when we are in state + * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and + * KnownAssignedXidsRemove(). + * + * Later, when we apply the running xact data we must be careful to ignore + * transactions already committed, since those commits raced ahead when + * making WAL entries. + * + * The loose timing also means that locks may be recorded that have a + * zero xid, since xids are removed from procs before locks are removed. + * So we must prune the lock list down to ensure we hold locks only for + * currently running xids, performed by StandbyReleaseOldLocks(). + * Zero xids should no longer be possible, but we may be replaying WAL + * from a time when they were possible. + * + * For logical decoding only the running xacts information is needed; + * there's no need to look at the locking information, but it's logged anyway, + * as there's no independent knob to just enable logical decoding. For + * details of how this is used, check snapbuild.c's introductory comment. + * + * + * Returns the RecPtr of the last inserted record. + */ +XLogRecPtr +LogStandbySnapshot(void) +{ + XLogRecPtr recptr; + RunningTransactions running; + xl_standby_lock *locks; + int nlocks; + + Assert(XLogStandbyInfoActive()); + + /* + * Get details of any AccessExclusiveLocks being held at the moment. + */ + locks = GetRunningTransactionLocks(&nlocks); + if (nlocks > 0) + LogAccessExclusiveLocks(nlocks, locks); + pfree(locks); + + /* + * Log details of all in-progress transactions. This should be the last + * record we write, because standby will open up when it sees this. + */ + running = GetRunningTransactionData(); + + /* + * GetRunningTransactionData() acquired ProcArrayLock, we must release it. + * For Hot Standby this can be done before inserting the WAL record + * because ProcArrayApplyRecoveryInfo() rechecks the commit status using + * the clog. For logical decoding, though, the lock can't be released + * early because the clog might be "in the future" from the POV of the + * historic snapshot. 
This would allow for situations where we're waiting + * for the end of a transaction listed in the xl_running_xacts record + * which, according to the WAL, has committed before the xl_running_xacts + * record. Fortunately this routine isn't executed frequently, and it's + * only a shared lock. + */ + if (wal_level < WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + recptr = LogCurrentRunningXacts(running); + + /* Release lock if we kept it longer ... */ + if (wal_level >= WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + /* GetRunningTransactionData() acquired XidGenLock, we must release it */ + LWLockRelease(XidGenLock); + + return recptr; +} + +/* + * Record an enhanced snapshot of running transactions into WAL. + * + * The definitions of RunningTransactionsData and xl_running_xacts are + * similar. We keep them separate because xl_running_xacts is a contiguous + * chunk of memory and never exists fully until it is assembled in WAL. + * The inserted records are marked as not being important for durability, + * to avoid triggering superfluous checkpoint / archiving activity. + */ +static XLogRecPtr +LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) +{ + xl_running_xacts xlrec; + XLogRecPtr recptr; + + xlrec.xcnt = CurrRunningXacts->xcnt; + xlrec.subxcnt = CurrRunningXacts->subxcnt; + xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; + xlrec.nextXid = CurrRunningXacts->nextXid; + xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; + xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + + /* Header */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); + + /* array of TransactionIds */ + if (xlrec.xcnt > 0) + XLogRegisterData((char *) CurrRunningXacts->xids, + (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); + + recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); + + if (CurrRunningXacts->subxid_overflow) + elog(trace_recovery(DEBUG2), + "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + else + elog(trace_recovery(DEBUG2), + "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + + /* + * Ensure running_xacts information is synced to disk not too far in the + * future. We don't want to stall anything though (i.e. use XLogFlush()), + * so we let the wal writer do it during normal operation. + * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced + * and nudge the WALWriter into action if sleeping. Check + * XLogBackgroundFlush() for details why a record might not be flushed + * without it. + */ + XLogSetAsyncXactLSN(recptr); + + return recptr; +} + +/* + * Wholesale logging of AccessExclusiveLocks. Other lock types need not be + * logged, as described in backend/storage/lmgr/README. 
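Both xl_running_xacts and xl_standby_locks are laid out as a fixed header followed by a trailing array, and are registered in two pieces (the header up to the array, then the array itself). A stand-alone sketch of that layout with an invented demo_standby_locks type, showing how offsetof() yields the header size and how header and payload end up contiguous once the record is assembled:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef uint32_t TransactionId;

/* A WAL-record-style struct: fixed fields, then a variable-length array. */
typedef struct demo_standby_locks
{
    int           nlocks;
    TransactionId locks[];      /* flexible array member, as in xl_standby_locks */
} demo_standby_locks;

int
main(void)
{
    TransactionId held[] = {101, 102, 107};
    int           nlocks = 3;

    /* Size of the fixed header: everything up to the flexible array. */
    size_t hdrsz = offsetof(demo_standby_locks, locks);
    size_t total = hdrsz + nlocks * sizeof(TransactionId);

    /* Assemble header + payload contiguously, as the record ultimately is in WAL. */
    demo_standby_locks *rec = malloc(total);

    rec->nlocks = nlocks;
    memcpy(rec->locks, held, nlocks * sizeof(TransactionId));

    printf("header %zu bytes, record %zu bytes, last lock xid %u\n",
           hdrsz, total, rec->locks[rec->nlocks - 1]);
    free(rec);
    return 0;
}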
+ */ +static void +LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) +{ + xl_standby_locks xlrec; + + xlrec.nlocks = nlocks; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); + XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + + (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); +} + +/* + * Individual logging of AccessExclusiveLocks for use during LockAcquire() + */ +void +LogAccessExclusiveLock(Oid dbOid, Oid relOid) +{ + xl_standby_lock xlrec; + + xlrec.xid = GetCurrentTransactionId(); + + xlrec.dbOid = dbOid; + xlrec.relOid = relOid; + + LogAccessExclusiveLocks(1, &xlrec); + MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK; +} + +/* + * Prepare to log an AccessExclusiveLock, for use during LockAcquire() + */ +void +LogAccessExclusiveLockPrepare(void) +{ + /* + * Ensure that a TransactionId has been assigned to this transaction, for + * two reasons, both related to lock release on the standby. First, we + * must assign an xid so that RecordTransactionCommit() and + * RecordTransactionAbort() do not optimise away the transaction + * completion record which recovery relies upon to release locks. It's a + * hack, but for a corner case not worth adding code for into the main + * commit path. Second, we must assign an xid before the lock is recorded + * in shared memory, otherwise a concurrently executing + * GetRunningTransactionLocks() might see a lock associated with an + * InvalidTransactionId which we later assert cannot happen. + */ + (void) GetCurrentTransactionId(); +} + +/* + * Emit WAL for invalidations. This currently is only used for commits without + * an xid but which contain invalidations. + */ +void +LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInitFileInval) +{ + xl_invalidations xlrec; + + /* prepare record */ + memset(&xlrec, 0, sizeof(xlrec)); + xlrec.dbId = MyDatabaseId; + xlrec.tsId = MyDatabaseTableSpace; + xlrec.relcacheInitFileInval = relcacheInitFileInval; + xlrec.nmsgs = nmsgs; + + /* perform insertion */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS); +} + +/* Return the description of recovery conflict */ +static const char * +get_recovery_conflict_desc(ProcSignalReason reason) +{ + const char *reasonDesc = _("unknown reason"); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + reasonDesc = _("recovery conflict on buffer pin"); + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + reasonDesc = _("recovery conflict on lock"); + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + reasonDesc = _("recovery conflict on tablespace"); + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + reasonDesc = _("recovery conflict on snapshot"); + break; + case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT: + reasonDesc = _("recovery conflict on replication slot"); + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = _("recovery conflict on buffer deadlock"); + break; + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + reasonDesc = _("recovery conflict on database"); + break; + default: + break; + } + + return reasonDesc; +} diff --git a/src/backend/storage/large_object/Makefile b/src/backend/storage/large_object/Makefile new file mode 100644 index 0000000..8a6bc36 --- /dev/null +++ b/src/backend/storage/large_object/Makefile @@ 
-0,0 +1,18 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/large_object +# +# IDENTIFICATION +# src/backend/storage/large_object/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/large_object +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + inv_api.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c new file mode 100644 index 0000000..cab47f8 --- /dev/null +++ b/src/backend/storage/large_object/inv_api.c @@ -0,0 +1,954 @@ +/*------------------------------------------------------------------------- + * + * inv_api.c + * routines for manipulating inversion fs large objects. This file + * contains the user-level large object application interface routines. + * + * + * Note: we access pg_largeobject.data using its C struct declaration. + * This is safe because it immediately follows pageno which is an int4 field, + * and therefore the data field will always be 4-byte aligned, even if it + * is in the short 1-byte-header format. We have to detoast it since it's + * quite likely to be in compressed or short format. We also need to check + * for NULLs, since initdb will mark loid and pageno but not data as NOT NULL. + * + * Note: many of these routines leak memory in CurrentMemoryContext, as indeed + * does most of the backend code. We expect that CurrentMemoryContext will + * be a short-lived context. Data that must persist across function calls + * is kept either in CacheMemoryContext (the Relation structs) or in the + * memory context given to inv_open (for LargeObjectDesc structs). + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/large_object/inv_api.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/detoast.h" +#include "access/genam.h" +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/table.h" +#include "access/xact.h" +#include "catalog/dependency.h" +#include "catalog/indexing.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_largeobject.h" +#include "catalog/pg_largeobject_metadata.h" +#include "libpq/libpq-fs.h" +#include "miscadmin.h" +#include "storage/large_object.h" +#include "utils/acl.h" +#include "utils/fmgroids.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + + +/* + * GUC: backwards-compatibility flag to suppress LO permission checks + */ +bool lo_compat_privileges; + +/* + * All accesses to pg_largeobject and its index make use of a single Relation + * reference, so that we only need to open pg_relation once per transaction. + * To avoid problems when the first such reference occurs inside a + * subtransaction, we execute a slightly klugy maneuver to assign ownership of + * the Relation reference to TopTransactionResourceOwner. 
+ */ +static Relation lo_heap_r = NULL; +static Relation lo_index_r = NULL; + + +/* + * Open pg_largeobject and its index, if not already done in current xact + */ +static void +open_lo_relation(void) +{ + ResourceOwner currentOwner; + + if (lo_heap_r && lo_index_r) + return; /* already open in current xact */ + + /* Arrange for the top xact to own these relation references */ + currentOwner = CurrentResourceOwner; + CurrentResourceOwner = TopTransactionResourceOwner; + + /* Use RowExclusiveLock since we might either read or write */ + if (lo_heap_r == NULL) + lo_heap_r = table_open(LargeObjectRelationId, RowExclusiveLock); + if (lo_index_r == NULL) + lo_index_r = index_open(LargeObjectLOidPNIndexId, RowExclusiveLock); + + CurrentResourceOwner = currentOwner; +} + +/* + * Clean up at main transaction end + */ +void +close_lo_relation(bool isCommit) +{ + if (lo_heap_r || lo_index_r) + { + /* + * Only bother to close if committing; else abort cleanup will handle + * it + */ + if (isCommit) + { + ResourceOwner currentOwner; + + currentOwner = CurrentResourceOwner; + CurrentResourceOwner = TopTransactionResourceOwner; + + if (lo_index_r) + index_close(lo_index_r, NoLock); + if (lo_heap_r) + table_close(lo_heap_r, NoLock); + + CurrentResourceOwner = currentOwner; + } + lo_heap_r = NULL; + lo_index_r = NULL; + } +} + + +/* + * Same as pg_largeobject.c's LargeObjectExists(), except snapshot to + * read with can be specified. + */ +static bool +myLargeObjectExists(Oid loid, Snapshot snapshot) +{ + Relation pg_lo_meta; + ScanKeyData skey[1]; + SysScanDesc sd; + HeapTuple tuple; + bool retval = false; + + ScanKeyInit(&skey[0], + Anum_pg_largeobject_metadata_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(loid)); + + pg_lo_meta = table_open(LargeObjectMetadataRelationId, + AccessShareLock); + + sd = systable_beginscan(pg_lo_meta, + LargeObjectMetadataOidIndexId, true, + snapshot, 1, skey); + + tuple = systable_getnext(sd); + if (HeapTupleIsValid(tuple)) + retval = true; + + systable_endscan(sd); + + table_close(pg_lo_meta, AccessShareLock); + + return retval; +} + + +/* + * Extract data field from a pg_largeobject tuple, detoasting if needed + * and verifying that the length is sane. Returns data pointer (a bytea *), + * data length, and an indication of whether to pfree the data pointer. + */ +static void +getdatafield(Form_pg_largeobject tuple, + bytea **pdatafield, + int *plen, + bool *pfreeit) +{ + bytea *datafield; + int len; + bool freeit; + + datafield = &(tuple->data); /* see note at top of file */ + freeit = false; + if (VARATT_IS_EXTENDED(datafield)) + { + datafield = (bytea *) + detoast_attr((struct varlena *) datafield); + freeit = true; + } + len = VARSIZE(datafield) - VARHDRSZ; + if (len < 0 || len > LOBLKSIZE) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("pg_largeobject entry for OID %u, page %d has invalid data field size %d", + tuple->loid, tuple->pageno, len))); + *pdatafield = datafield; + *plen = len; + *pfreeit = freeit; +} + + +/* + * inv_create -- create a new large object + * + * Arguments: + * lobjId - OID to use for new large object, or InvalidOid to pick one + * + * Returns: + * OID of new object + * + * If lobjId is not InvalidOid, then an error occurs if the OID is already + * in use. 
+ */ +Oid +inv_create(Oid lobjId) +{ + Oid lobjId_new; + + /* + * Create a new largeobject with empty data pages + */ + lobjId_new = LargeObjectCreate(lobjId); + + /* + * dependency on the owner of largeobject + * + * Note that LO dependencies are recorded using classId + * LargeObjectRelationId for backwards-compatibility reasons. Using + * LargeObjectMetadataRelationId instead would simplify matters for the + * backend, but it'd complicate pg_dump and possibly break other clients. + */ + recordDependencyOnOwner(LargeObjectRelationId, + lobjId_new, GetUserId()); + + /* Post creation hook for new large object */ + InvokeObjectPostCreateHook(LargeObjectRelationId, lobjId_new, 0); + + /* + * Advance command counter to make new tuple visible to later operations. + */ + CommandCounterIncrement(); + + return lobjId_new; +} + +/* + * inv_open -- access an existing large object. + * + * Returns a large object descriptor, appropriately filled in. + * The descriptor and subsidiary data are allocated in the specified + * memory context, which must be suitably long-lived for the caller's + * purposes. If the returned descriptor has a snapshot associated + * with it, the caller must ensure that it also lives long enough, + * e.g. by calling RegisterSnapshotOnOwner + */ +LargeObjectDesc * +inv_open(Oid lobjId, int flags, MemoryContext mcxt) +{ + LargeObjectDesc *retval; + Snapshot snapshot = NULL; + int descflags = 0; + + /* + * Historically, no difference is made between (INV_WRITE) and (INV_WRITE + * | INV_READ), the caller being allowed to read the large object + * descriptor in either case. + */ + if (flags & INV_WRITE) + descflags |= IFS_WRLOCK | IFS_RDLOCK; + if (flags & INV_READ) + descflags |= IFS_RDLOCK; + + if (descflags == 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid flags for opening a large object: %d", + flags))); + + /* Get snapshot. If write is requested, use an instantaneous snapshot. */ + if (descflags & IFS_WRLOCK) + snapshot = NULL; + else + snapshot = GetActiveSnapshot(); + + /* Can't use LargeObjectExists here because we need to specify snapshot */ + if (!myLargeObjectExists(lobjId, snapshot)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_OBJECT), + errmsg("large object %u does not exist", lobjId))); + + /* Apply permission checks, again specifying snapshot */ + if ((descflags & IFS_RDLOCK) != 0) + { + if (!lo_compat_privileges && + pg_largeobject_aclcheck_snapshot(lobjId, + GetUserId(), + ACL_SELECT, + snapshot) != ACLCHECK_OK) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for large object %u", + lobjId))); + } + if ((descflags & IFS_WRLOCK) != 0) + { + if (!lo_compat_privileges && + pg_largeobject_aclcheck_snapshot(lobjId, + GetUserId(), + ACL_UPDATE, + snapshot) != ACLCHECK_OK) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for large object %u", + lobjId))); + } + + /* OK to create a descriptor */ + retval = (LargeObjectDesc *) MemoryContextAlloc(mcxt, + sizeof(LargeObjectDesc)); + retval->id = lobjId; + retval->offset = 0; + retval->flags = descflags; + + /* caller sets if needed, not used by the functions in this file */ + retval->subid = InvalidSubTransactionId; + + /* + * The snapshot (if any) is just the currently active snapshot. The + * caller will replace it with a longer-lived copy if needed. 
+ */ + retval->snapshot = snapshot; + + return retval; +} + +/* + * Closes a large object descriptor previously made by inv_open(), and + * releases the long-term memory used by it. + */ +void +inv_close(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + pfree(obj_desc); +} + +/* + * Destroys an existing large object (not to be confused with a descriptor!) + * + * Note we expect caller to have done any required permissions check. + */ +int +inv_drop(Oid lobjId) +{ + ObjectAddress object; + + /* + * Delete any comments and dependencies on the large object + */ + object.classId = LargeObjectRelationId; + object.objectId = lobjId; + object.objectSubId = 0; + performDeletion(&object, DROP_CASCADE, 0); + + /* + * Advance command counter so that tuple removal will be seen by later + * large-object operations in this transaction. + */ + CommandCounterIncrement(); + + /* For historical reasons, we always return 1 on success. */ + return 1; +} + +/* + * Determine size of a large object + * + * NOTE: LOs can contain gaps, just like Unix files. We actually return + * the offset of the last byte + 1. + */ +static uint64 +inv_getsize(LargeObjectDesc *obj_desc) +{ + uint64 lastbyte = 0; + ScanKeyData skey[1]; + SysScanDesc sd; + HeapTuple tuple; + + Assert(PointerIsValid(obj_desc)); + + open_lo_relation(); + + ScanKeyInit(&skey[0], + Anum_pg_largeobject_loid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(obj_desc->id)); + + sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, + obj_desc->snapshot, 1, skey); + + /* + * Because the pg_largeobject index is on both loid and pageno, but we + * constrain only loid, a backwards scan should visit all pages of the + * large object in reverse pageno order. So, it's sufficient to examine + * the first valid tuple (== last valid page). + */ + tuple = systable_getnext_ordered(sd, BackwardScanDirection); + if (HeapTupleIsValid(tuple)) + { + Form_pg_largeobject data; + bytea *datafield; + int len; + bool pfreeit; + + if (HeapTupleHasNulls(tuple)) /* paranoia */ + elog(ERROR, "null field found in pg_largeobject"); + data = (Form_pg_largeobject) GETSTRUCT(tuple); + getdatafield(data, &datafield, &len, &pfreeit); + lastbyte = (uint64) data->pageno * LOBLKSIZE + len; + if (pfreeit) + pfree(datafield); + } + + systable_endscan_ordered(sd); + + return lastbyte; +} + +int64 +inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence) +{ + int64 newoffset; + + Assert(PointerIsValid(obj_desc)); + + /* + * We allow seek/tell if you have either read or write permission, so no + * need for a permission check here. + */ + + /* + * Note: overflow in the additions is possible, but since we will reject + * negative results, we don't need any extra test for that. 
+ */ + switch (whence) + { + case SEEK_SET: + newoffset = offset; + break; + case SEEK_CUR: + newoffset = obj_desc->offset + offset; + break; + case SEEK_END: + newoffset = inv_getsize(obj_desc) + offset; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid whence setting: %d", whence))); + newoffset = 0; /* keep compiler quiet */ + break; + } + + /* + * use errmsg_internal here because we don't want to expose INT64_FORMAT + * in translatable strings; doing better is not worth the trouble + */ + if (newoffset < 0 || newoffset > MAX_LARGE_OBJECT_SIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("invalid large object seek target: " INT64_FORMAT, + newoffset))); + + obj_desc->offset = newoffset; + return newoffset; +} + +int64 +inv_tell(LargeObjectDesc *obj_desc) +{ + Assert(PointerIsValid(obj_desc)); + + /* + * We allow seek/tell if you have either read or write permission, so no + * need for a permission check here. + */ + + return obj_desc->offset; +} + +int +inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) +{ + int nread = 0; + int64 n; + int64 off; + int len; + int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); + uint64 pageoff; + ScanKeyData skey[2]; + SysScanDesc sd; + HeapTuple tuple; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + if ((obj_desc->flags & IFS_RDLOCK) == 0) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for large object %u", + obj_desc->id))); + + if (nbytes <= 0) + return 0; + + open_lo_relation(); + + ScanKeyInit(&skey[0], + Anum_pg_largeobject_loid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(obj_desc->id)); + + ScanKeyInit(&skey[1], + Anum_pg_largeobject_pageno, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(pageno)); + + sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, + obj_desc->snapshot, 2, skey); + + while ((tuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) + { + Form_pg_largeobject data; + bytea *datafield; + bool pfreeit; + + if (HeapTupleHasNulls(tuple)) /* paranoia */ + elog(ERROR, "null field found in pg_largeobject"); + data = (Form_pg_largeobject) GETSTRUCT(tuple); + + /* + * We expect the indexscan will deliver pages in order. However, + * there may be missing pages if the LO contains unwritten "holes". We + * want missing sections to read out as zeroes. + */ + pageoff = ((uint64) data->pageno) * LOBLKSIZE; + if (pageoff > obj_desc->offset) + { + n = pageoff - obj_desc->offset; + n = (n <= (nbytes - nread)) ? n : (nbytes - nread); + MemSet(buf + nread, 0, n); + nread += n; + obj_desc->offset += n; + } + + if (nread < nbytes) + { + Assert(obj_desc->offset >= pageoff); + off = (int) (obj_desc->offset - pageoff); + Assert(off >= 0 && off < LOBLKSIZE); + + getdatafield(data, &datafield, &len, &pfreeit); + if (len > off) + { + n = len - off; + n = (n <= (nbytes - nread)) ? 
n : (nbytes - nread); + memcpy(buf + nread, VARDATA(datafield) + off, n); + nread += n; + obj_desc->offset += n; + } + if (pfreeit) + pfree(datafield); + } + + if (nread >= nbytes) + break; + } + + systable_endscan_ordered(sd); + + return nread; +} + +int +inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) +{ + int nwritten = 0; + int n; + int off; + int len; + int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); + ScanKeyData skey[2]; + SysScanDesc sd; + HeapTuple oldtuple; + Form_pg_largeobject olddata; + bool neednextpage; + bytea *datafield; + bool pfreeit; + union + { + bytea hdr; + /* this is to make the union big enough for a LO data chunk: */ + char data[LOBLKSIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } workbuf; + char *workb = VARDATA(&workbuf.hdr); + HeapTuple newtup; + Datum values[Natts_pg_largeobject]; + bool nulls[Natts_pg_largeobject]; + bool replace[Natts_pg_largeobject]; + CatalogIndexState indstate; + + Assert(PointerIsValid(obj_desc)); + Assert(buf != NULL); + + /* enforce writability because snapshot is probably wrong otherwise */ + if ((obj_desc->flags & IFS_WRLOCK) == 0) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for large object %u", + obj_desc->id))); + + if (nbytes <= 0) + return 0; + + /* this addition can't overflow because nbytes is only int32 */ + if ((nbytes + obj_desc->offset) > MAX_LARGE_OBJECT_SIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid large object write request size: %d", + nbytes))); + + open_lo_relation(); + + indstate = CatalogOpenIndexes(lo_heap_r); + + ScanKeyInit(&skey[0], + Anum_pg_largeobject_loid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(obj_desc->id)); + + ScanKeyInit(&skey[1], + Anum_pg_largeobject_pageno, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(pageno)); + + sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, + obj_desc->snapshot, 2, skey); + + oldtuple = NULL; + olddata = NULL; + neednextpage = true; + + while (nwritten < nbytes) + { + /* + * If possible, get next pre-existing page of the LO. We expect the + * indexscan will deliver these in order --- but there may be holes. + */ + if (neednextpage) + { + if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) + { + if (HeapTupleHasNulls(oldtuple)) /* paranoia */ + elog(ERROR, "null field found in pg_largeobject"); + olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); + Assert(olddata->pageno >= pageno); + } + neednextpage = false; + } + + /* + * If we have a pre-existing page, see if it is the page we want to + * write, or a later one. + */ + if (olddata != NULL && olddata->pageno == pageno) + { + /* + * Update an existing page with fresh data. + * + * First, load old data into workbuf + */ + getdatafield(olddata, &datafield, &len, &pfreeit); + memcpy(workb, VARDATA(datafield), len); + if (pfreeit) + pfree(datafield); + + /* + * Fill any hole + */ + off = (int) (obj_desc->offset % LOBLKSIZE); + if (off > len) + MemSet(workb + len, 0, off - len); + + /* + * Insert appropriate portion of new data + */ + n = LOBLKSIZE - off; + n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); + memcpy(workb + off, buf + nwritten, n); + nwritten += n; + obj_desc->offset += n; + off += n; + /* compute valid length of new page */ + len = (len >= off) ? 
len : off; + SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); + + /* + * Form and insert updated tuple + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replace, false, sizeof(replace)); + values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); + replace[Anum_pg_largeobject_data - 1] = true; + newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), + values, nulls, replace); + CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup, + indstate); + heap_freetuple(newtup); + + /* + * We're done with this old page. + */ + oldtuple = NULL; + olddata = NULL; + neednextpage = true; + } + else + { + /* + * Write a brand new page. + * + * First, fill any hole + */ + off = (int) (obj_desc->offset % LOBLKSIZE); + if (off > 0) + MemSet(workb, 0, off); + + /* + * Insert appropriate portion of new data + */ + n = LOBLKSIZE - off; + n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); + memcpy(workb + off, buf + nwritten, n); + nwritten += n; + obj_desc->offset += n; + /* compute valid length of new page */ + len = off + n; + SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); + + /* + * Form and insert updated tuple + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); + values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); + values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); + newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); + CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate); + heap_freetuple(newtup); + } + pageno++; + } + + systable_endscan_ordered(sd); + + CatalogCloseIndexes(indstate); + + /* + * Advance command counter so that my tuple updates will be seen by later + * large-object operations in this transaction. 
+ */ + CommandCounterIncrement(); + + return nwritten; +} + +void +inv_truncate(LargeObjectDesc *obj_desc, int64 len) +{ + int32 pageno = (int32) (len / LOBLKSIZE); + int32 off; + ScanKeyData skey[2]; + SysScanDesc sd; + HeapTuple oldtuple; + Form_pg_largeobject olddata; + union + { + bytea hdr; + /* this is to make the union big enough for a LO data chunk: */ + char data[LOBLKSIZE + VARHDRSZ]; + /* ensure union is aligned well enough: */ + int32 align_it; + } workbuf; + char *workb = VARDATA(&workbuf.hdr); + HeapTuple newtup; + Datum values[Natts_pg_largeobject]; + bool nulls[Natts_pg_largeobject]; + bool replace[Natts_pg_largeobject]; + CatalogIndexState indstate; + + Assert(PointerIsValid(obj_desc)); + + /* enforce writability because snapshot is probably wrong otherwise */ + if ((obj_desc->flags & IFS_WRLOCK) == 0) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("permission denied for large object %u", + obj_desc->id))); + + /* + * use errmsg_internal here because we don't want to expose INT64_FORMAT + * in translatable strings; doing better is not worth the trouble + */ + if (len < 0 || len > MAX_LARGE_OBJECT_SIZE) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg_internal("invalid large object truncation target: " INT64_FORMAT, + len))); + + open_lo_relation(); + + indstate = CatalogOpenIndexes(lo_heap_r); + + /* + * Set up to find all pages with desired loid and pageno >= target + */ + ScanKeyInit(&skey[0], + Anum_pg_largeobject_loid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(obj_desc->id)); + + ScanKeyInit(&skey[1], + Anum_pg_largeobject_pageno, + BTGreaterEqualStrategyNumber, F_INT4GE, + Int32GetDatum(pageno)); + + sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, + obj_desc->snapshot, 2, skey); + + /* + * If possible, get the page the truncation point is in. The truncation + * point may be beyond the end of the LO or in a hole. + */ + olddata = NULL; + if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) + { + if (HeapTupleHasNulls(oldtuple)) /* paranoia */ + elog(ERROR, "null field found in pg_largeobject"); + olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); + Assert(olddata->pageno >= pageno); + } + + /* + * If we found the page of the truncation point we need to truncate the + * data in it. Otherwise if we're in a hole, we need to create a page to + * mark the end of data. 
+ */ + if (olddata != NULL && olddata->pageno == pageno) + { + /* First, load old data into workbuf */ + bytea *datafield; + int pagelen; + bool pfreeit; + + getdatafield(olddata, &datafield, &pagelen, &pfreeit); + memcpy(workb, VARDATA(datafield), pagelen); + if (pfreeit) + pfree(datafield); + + /* + * Fill any hole + */ + off = len % LOBLKSIZE; + if (off > pagelen) + MemSet(workb + pagelen, 0, off - pagelen); + + /* compute length of new page */ + SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); + + /* + * Form and insert updated tuple + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + memset(replace, false, sizeof(replace)); + values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); + replace[Anum_pg_largeobject_data - 1] = true; + newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), + values, nulls, replace); + CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup, + indstate); + heap_freetuple(newtup); + } + else + { + /* + * If the first page we found was after the truncation point, we're in + * a hole that we'll fill, but we need to delete the later page + * because the loop below won't visit it again. + */ + if (olddata != NULL) + { + Assert(olddata->pageno > pageno); + CatalogTupleDelete(lo_heap_r, &oldtuple->t_self); + } + + /* + * Write a brand new page. + * + * Fill the hole up to the truncation point + */ + off = len % LOBLKSIZE; + if (off > 0) + MemSet(workb, 0, off); + + /* compute length of new page */ + SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); + + /* + * Form and insert new tuple + */ + memset(values, 0, sizeof(values)); + memset(nulls, false, sizeof(nulls)); + values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); + values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); + values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); + newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); + CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate); + heap_freetuple(newtup); + } + + /* + * Delete any pages after the truncation point. If the initial search + * didn't find a page, then of course there's nothing more to do. + */ + if (olddata != NULL) + { + while ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) + { + CatalogTupleDelete(lo_heap_r, &oldtuple->t_self); + } + } + + systable_endscan_ordered(sd); + + CatalogCloseIndexes(indstate); + + /* + * Advance command counter so that tuple updates will be seen by later + * large-object operations in this transaction. 
+ */ + CommandCounterIncrement(); +} diff --git a/src/backend/storage/large_object/meson.build b/src/backend/storage/large_object/meson.build new file mode 100644 index 0000000..4d9d893 --- /dev/null +++ b/src/backend/storage/large_object/meson.build @@ -0,0 +1,5 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'inv_api.c', +) diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore new file mode 100644 index 0000000..dab4c3f --- /dev/null +++ b/src/backend/storage/lmgr/.gitignore @@ -0,0 +1,3 @@ +/lwlocknames.c +/lwlocknames.h +/s_lock_test diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile new file mode 100644 index 0000000..b25b7ee --- /dev/null +++ b/src/backend/storage/lmgr/Makefile @@ -0,0 +1,52 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/lmgr +# +# IDENTIFICATION +# src/backend/storage/lmgr/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/lmgr +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + condition_variable.o \ + deadlock.o \ + lmgr.o \ + lock.o \ + lwlock.o \ + lwlocknames.o \ + predicate.o \ + proc.o \ + s_lock.o \ + spin.o + +include $(top_srcdir)/src/backend/common.mk + +ifdef TAS +TASPATH = $(top_builddir)/src/backend/port/tas.o +endif + +s_lock_test: s_lock.c $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a + $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \ + $(TASPATH) -L $(top_builddir)/src/common -lpgcommon \ + -L $(top_builddir)/src/port -lpgport -o s_lock_test + +# see notes in src/backend/parser/Makefile +lwlocknames.c: lwlocknames.h + touch $@ + +lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl + $(PERL) $(srcdir)/generate-lwlocknames.pl $< + +check: s_lock_test + ./s_lock_test + +clean distclean: + rm -f s_lock_test + +maintainer-clean: clean + rm -f lwlocknames.h lwlocknames.c diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README new file mode 100644 index 0000000..45de0fd --- /dev/null +++ b/src/backend/storage/lmgr/README @@ -0,0 +1,731 @@ +src/backend/storage/lmgr/README + +Locking Overview +================ + +Postgres uses four types of interprocess locks: + +* Spinlocks. These are intended for *very* short-term locks. If a lock +is to be held more than a few dozen instructions, or across any sort of +kernel call (or even a call to a nontrivial subroutine), don't use a +spinlock. Spinlocks are primarily used as infrastructure for lightweight +locks. They are implemented using a hardware atomic-test-and-set +instruction, if available. Waiting processes busy-loop until they can +get the lock. There is no provision for deadlock detection, automatic +release on error, or any other nicety. There is a timeout if the lock +cannot be gotten after a minute or so (which is approximately forever in +comparison to the intended lock hold time, so this is certainly an error +condition). + +* Lightweight locks (LWLocks). These locks are typically used to +interlock access to datastructures in shared memory. LWLocks support +both exclusive and shared lock modes (for read/write and read-only +access to a shared object). 
There is no provision for deadlock +detection, but the LWLock manager will automatically release held +LWLocks during elog() recovery, so it is safe to raise an error while +holding LWLocks. Obtaining or releasing an LWLock is quite fast (a few +dozen instructions) when there is no contention for the lock. When a +process has to wait for an LWLock, it blocks on a SysV semaphore so as +to not consume CPU time. Waiting processes will be granted the lock in +arrival order. There is no timeout. + +* Regular locks (a/k/a heavyweight locks). The regular lock manager +supports a variety of lock modes with table-driven semantics, and it has +full deadlock detection and automatic release at transaction end. +Regular locks should be used for all user-driven lock requests. + +* SIReadLock predicate locks. See separate README-SSI file for details. + +Acquisition of either a spinlock or a lightweight lock causes query +cancel and die() interrupts to be held off until all such locks are +released. No such restriction exists for regular locks, however. Also +note that we can accept query cancel and die() interrupts while waiting +for a regular lock, but we will not accept them while waiting for +spinlocks or LW locks. It is therefore not a good idea to use LW locks +when the wait time might exceed a few seconds. + +The rest of this README file discusses the regular lock manager in detail. + + +Lock Data Structures +-------------------- + +Lock methods describe the overall locking behavior. Currently there are +two lock methods: DEFAULT and USER. + +Lock modes describe the type of the lock (read/write or shared/exclusive). +In principle, each lock method can have its own set of lock modes with +different conflict rules, but currently DEFAULT and USER methods use +identical lock mode sets. See src/include/storage/lock.h for more details. +(Lock modes are also called lock types in some places in the code and +documentation.) + +There are two main methods for recording locks in shared memory. The primary +mechanism uses two main structures: the per-lockable-object LOCK struct, and +the per-lock-and-requestor PROCLOCK struct. A LOCK object exists for each +lockable object that currently has locks held or requested on it. A PROCLOCK +struct exists for each backend that is holding or requesting lock(s) on each +LOCK object. + +There is also a special "fast path" mechanism which backends may use to +record a limited number of locks with very specific characteristics: they must +use the DEFAULT lockmethod; they must represent a lock on a database relation +(not a shared relation), they must be a "weak" lock which is unlikely to +conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system +must be able to quickly verify that no conflicting locks could possibly be +present. See "Fast Path Locking", below, for more details. + +Each backend also maintains an unshared LOCALLOCK structure for each lockable +object and lock mode that it is currently holding or requesting. The shared +lock structures only allow a single lock grant to be made per lockable +object/lock mode/backend. Internally to a backend, however, the same lock may +be requested and perhaps released multiple times in a transaction, and it can +also be held both transactionally and session-wide. The internal request +counts are held in LOCALLOCK so that the shared data structures need not be +accessed to alter them. 
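
As a rough, much-simplified sketch of how these three kinds of bookkeeping
relate, the fragment below shows one illustrative possibility.  The names,
types, and bounds here are invented for the example; the real declarations
live in src/include/storage/lock.h, and the field lists that follow describe
their actual contents precisely.

    /* Illustrative sketch only -- not the declarations from lock.h. */
    #include <stdint.h>

    #define EXAMPLE_MAX_LOCKMODES 10        /* illustrative bound */

    typedef struct ExampleLockTag           /* identifies one lockable object */
    {
        uint32_t    locktag_field1;         /* e.g. database OID */
        uint32_t    locktag_field2;         /* e.g. relation OID */
    } ExampleLockTag;

    typedef struct ExampleLock              /* shared: one per lockable object */
    {
        ExampleLockTag tag;                 /* hash key */
        int         grantMask;              /* bit i set iff granted[i] > 0 */
        int         waitMask;               /* bit i set iff requested[i] > granted[i] */
        int         nRequested;             /* total requests, including waiters */
        int         requested[EXAMPLE_MAX_LOCKMODES];
        int         nGranted;               /* total granted requests */
        int         granted[EXAMPLE_MAX_LOCKMODES];
        /* plus shared-memory queues of PROCLOCKs and waiting PGPROCs */
    } ExampleLock;

    typedef struct ExampleProcLock          /* shared: one per (object, backend) */
    {
        ExampleLock *myLock;                /* the LOCK this entry belongs to */
        void        *myProc;                /* the owning backend's PGPROC */
        int          holdMask;              /* modes this backend has been granted */
    } ExampleProcLock;

    typedef struct ExampleLocalLock         /* backend-private, per object and mode */
    {
        ExampleLockTag tag;                 /* which object */
        int          lockmode;              /* which lock mode */
        long         nLocks;                /* how many times acquired locally */
    } ExampleLocalLock;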
+ +--------------------------------------------------------------------------- + +The lock manager's LOCK objects contain: + +tag - + The key fields that are used for hashing locks in the shared memory + lock hash table. The contents of the tag essentially define an + individual lockable object. See include/storage/lock.h for details + about the supported types of lockable objects. This is declared as + a separate struct to ensure that we always zero out the correct number + of bytes. It is critical that any alignment-padding bytes the compiler + might insert in the struct be zeroed out, else the hash computation + will be random. (Currently, we are careful to define struct LOCKTAG + so that there are no padding bytes.) + +grantMask - + This bitmask indicates what types of locks are currently held on the + given lockable object. It is used (against the lock table's conflict + table) to determine if a new lock request will conflict with existing + lock types held. Conflicts are determined by bitwise AND operations + between the grantMask and the conflict table entry for the requested + lock type. Bit i of grantMask is 1 if and only if granted[i] > 0. + +waitMask - + This bitmask shows the types of locks being waited for. Bit i of waitMask + is 1 if and only if requested[i] > granted[i]. + +procLocks - + This is a shared memory queue of all the PROCLOCK structs associated with + the lock object. Note that both granted and waiting PROCLOCKs are in this + list (indeed, the same PROCLOCK might have some already-granted locks and + be waiting for more!). + +waitProcs - + This is a shared memory queue of all PGPROC structures corresponding to + backends that are waiting (sleeping) until another backend releases this + lock. The process structure holds the information needed to determine + if it should be woken up when the lock is released. + +nRequested - + Keeps a count of how many times this lock has been attempted to be + acquired. The count includes attempts by processes which were put + to sleep due to conflicts. It also counts the same backend twice + if, for example, a backend process first acquires a read and then + acquires a write. (But multiple acquisitions of the same lock/lock mode + within a backend are not multiply counted here; they are recorded + only in the backend's LOCALLOCK structure.) + +requested - + Keeps a count of how many locks of each type have been attempted. Only + elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock + type defined constants. Summing the values of requested[] should come out + equal to nRequested. + +nGranted - + Keeps count of how many times this lock has been successfully acquired. + This count does not include attempts that are waiting due to conflicts. + Otherwise the counting rules are the same as for nRequested. + +granted - + Keeps count of how many locks of each type are currently held. Once again + only elements 1 through MAX_LOCKMODES-1 are used (0 is not). Also, like + requested[], summing the values of granted[] should total to the value + of nGranted. + +We should always have 0 <= nGranted <= nRequested, and +0 <= granted[i] <= requested[i] for each i. When all the request counts +go to zero, the LOCK object is no longer needed and can be freed. + +--------------------------------------------------------------------------- + +The lock manager's PROCLOCK objects contain: + +tag - + The key fields that are used for hashing entries in the shared memory + PROCLOCK hash table. 
This is declared as a separate struct to ensure that + we always zero out the correct number of bytes. It is critical that any + alignment-padding bytes the compiler might insert in the struct be zeroed + out, else the hash computation will be random. (Currently, we are careful + to define struct PROCLOCKTAG so that there are no padding bytes.) + + tag.myLock + Pointer to the shared LOCK object this PROCLOCK is for. + + tag.myProc + Pointer to the PGPROC of backend process that owns this PROCLOCK. + + Note: it's OK to use pointers here because a PROCLOCK never outlives + either its lock or its proc. The tag is therefore unique for as long + as it needs to be, even though the same tag values might mean something + else at other times. + +holdMask - + A bitmask for the lock modes successfully acquired by this PROCLOCK. + This should be a subset of the LOCK object's grantMask, and also a + subset of the PGPROC object's heldLocks mask (if the PGPROC is + currently waiting for another lock mode on this lock). + +releaseMask - + A bitmask for the lock modes due to be released during LockReleaseAll. + This must be a subset of the holdMask. Note that it is modified without + taking the partition LWLock, and therefore it is unsafe for any + backend except the one owning the PROCLOCK to examine/change it. + +lockLink - + List link for shared memory queue of all the PROCLOCK objects for the + same LOCK. + +procLink - + List link for shared memory queue of all the PROCLOCK objects for the + same backend. + +--------------------------------------------------------------------------- + + +Lock Manager Internal Locking +----------------------------- + +Before PostgreSQL 8.2, all of the shared-memory data structures used by +the lock manager were protected by a single LWLock, the LockMgrLock; +any operation involving these data structures had to exclusively lock +LockMgrLock. Not too surprisingly, this became a contention bottleneck. +To reduce contention, the lock manager's data structures have been split +into multiple "partitions", each protected by an independent LWLock. +Most operations only need to lock the single partition they are working in. +Here are the details: + +* Each possible lock is assigned to one partition according to a hash of +its LOCKTAG value. The partition's LWLock is considered to protect all the +LOCK objects of that partition as well as their subsidiary PROCLOCKs. + +* The shared-memory hash tables for LOCKs and PROCLOCKs are organized +so that different partitions use different hash chains, and thus there +is no conflict in working with objects in different partitions. This +is supported directly by dynahash.c's "partitioned table" mechanism +for the LOCK table: we need only ensure that the partition number is +taken from the low-order bits of the dynahash hash value for the LOCKTAG. +To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash +value has the same low-order bits as its associated LOCK. This requires +a specialized hash function (see proclock_hash). + +* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it. +This has now been split into per-partition lists, so that access to a +particular PROCLOCK list can be protected by the associated partition's +LWLock. (This rule allows one backend to manipulate another backend's +PROCLOCK lists, which was not originally necessary but is now required in +connection with fast-path locking; see below.) 
+ +* The other lock-related fields of a PGPROC are only interesting when +the PGPROC is waiting for a lock, so we consider that they are protected +by the partition LWLock of the awaited lock. + +For normal lock acquisition and release, it is sufficient to lock the +partition containing the desired lock. Deadlock checking needs to touch +multiple partitions in general; for simplicity, we just make it lock all +the partitions in partition-number order. (To prevent LWLock deadlock, +we establish the rule that any backend needing to lock more than one +partition at once must lock them in partition-number order.) It's +possible that deadlock checking could be done without touching every +partition in typical cases, but since in a properly functioning system +deadlock checking should not occur often enough to be performance-critical, +trying to make this work does not seem a productive use of effort. + +A backend's internal LOCALLOCK hash table is not partitioned. We do store +a copy of the locktag hash code in LOCALLOCK table entries, from which the +partition number can be computed, but this is a straight speed-for-space +tradeoff: we could instead recalculate the partition number from the LOCKTAG +when needed. + + +Fast Path Locking +----------------- + +Fast path locking is a special purpose mechanism designed to reduce the +overhead of taking and releasing certain types of locks which are taken +and released very frequently but rarely conflict. Currently, this includes +two categories of locks: + +(1) Weak relation locks. SELECT, INSERT, UPDATE, and DELETE must acquire a +lock on every relation they operate on, as well as various system catalogs +that can be used internally. Many DML operations can proceed in parallel +against the same table at the same time; only DDL operations such as +CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE +-- will create lock conflicts with the "weak" locks (AccessShareLock, +RowShareLock, RowExclusiveLock) acquired by DML operations. + +(2) VXID locks. Every transaction takes a lock on its own virtual +transaction ID. Currently, the only operations that wait for these locks +are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict), +so most VXID locks are taken and released by the owner without anyone else +needing to care. + +The primary locking mechanism does not cope well with this workload. Even +though the lock manager locks are partitioned, the locktag for any given +relation still falls in one, and only one, partition. Thus, if many short +queries are accessing the same relation, the lock manager partition lock for +that partition becomes a contention bottleneck. This effect is measurable +even on 2-core servers, and becomes very pronounced as core count increases. + +To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is +permitted to record a limited number of locks on unshared relations in an +array within its PGPROC structure, rather than using the primary lock table. +This mechanism can only be used when the locker can verify that no conflicting +locks exist at the time of taking the lock. + +A key point of this algorithm is that it must be possible to verify the +absence of possibly conflicting locks without fighting over a shared LWLock or +spinlock. Otherwise, this effort would simply move the contention bottleneck +from one place to another. We accomplish this using an array of 1024 integer +counters, which are in effect a 1024-way partitioning of the lock space. 
+Each counter records the number of "strong" locks (that is, ShareLock, +ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared +relations that fall into that partition. When this counter is non-zero, the +fast path mechanism may not be used to take new relation locks within that +partition. A strong locker bumps the counter and then scans each per-backend +array for matching fast-path locks; any which are found must be transferred to +the primary lock table before attempting to acquire the lock, to ensure proper +lock conflict and deadlock detection. + +On an SMP system, we must guarantee proper memory synchronization. Here we +rely on the fact that LWLock acquisition acts as a memory sequence point: if +A performs a store, A and B both acquire an LWLock in either order, and B +then performs a load on the same memory location, it is guaranteed to see +A's store. In this case, each backend's fast-path lock queue is protected +by an LWLock. A backend wishing to acquire a fast-path lock grabs this +LWLock before examining FastPathStrongRelationLocks to check for the presence +of a conflicting strong lock. And the backend attempting to acquire a strong +lock, because it must transfer any matching weak locks taken via the fast-path +mechanism to the shared lock table, will acquire every LWLock protecting a +backend fast-path queue in turn. So, if we examine +FastPathStrongRelationLocks and see a zero, then either the value is truly +zero, or if it is a stale value, the strong locker has yet to acquire the +per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock) +and will notice any weak lock we take when it does. + +Fast-path VXID locks do not use the FastPathStrongRelationLocks table. The +first lock taken on a VXID is always the ExclusiveLock taken by its owner. +Any subsequent lockers are share lockers waiting for the VXID to terminate. +Indeed, the only reason VXID locks use the lock manager at all (rather than +waiting for the VXID to terminate via some other method) is for deadlock +detection. Thus, the initial VXID lock can *always* be taken via the fast +path without checking for conflicts. Any subsequent locker must check +whether the lock has been transferred to the main lock table, and if not, +do so. The backend owning the VXID must be careful to clean up any entry +made in the main lock table at end of transaction. + +Deadlock detection does not need to examine the fast-path data structures, +because any lock that could possibly be involved in a deadlock must have +been transferred to the main tables beforehand. + + +The Deadlock Detection Algorithm +-------------------------------- + +Since we allow user transactions to request locks in any order, deadlock +is possible. We use a deadlock detection/breaking algorithm that is +fairly standard in essence, but there are many special considerations +needed to deal with Postgres' generalized locking model. + +A key design consideration is that we want to make routine operations +(lock grant and release) run quickly when there is no deadlock, and +avoid the overhead of deadlock handling as much as possible. We do this +using an "optimistic waiting" approach: if a process cannot acquire the +lock it wants immediately, it goes to sleep without any deadlock check. +But it also sets a delay timer, with a delay of DeadlockTimeout +milliseconds (typically set to one second). If the delay expires before +the process is granted the lock it wants, it runs the deadlock +detection/breaking code. 
Normally this code will determine that there is +no deadlock condition, and then the process will go back to sleep and +wait quietly until it is granted the lock. But if a deadlock condition +does exist, it will be resolved, usually by aborting the detecting +process' transaction. In this way, we avoid deadlock handling overhead +whenever the wait time for a lock is less than DeadlockTimeout, while +not imposing an unreasonable delay of detection when there is an error. + +Lock acquisition (routines LockAcquire and ProcSleep) follows these rules: + +1. A lock request is granted immediately if it does not conflict with +any existing or waiting lock request, or if the process already holds an +instance of the same lock type (eg, there's no penalty to acquire a read +lock twice). Note that a process never conflicts with itself, eg one +can obtain read lock when one already holds exclusive lock. + +2. Otherwise the process joins the lock's wait queue. Normally it will +be added to the end of the queue, but there is an exception: if the +process already holds locks on this same lockable object that conflict +with the request of any pending waiter, then the process will be +inserted in the wait queue just ahead of the first such waiter. (If we +did not make this check, the deadlock detection code would adjust the +queue order to resolve the conflict, but it's relatively cheap to make +the check in ProcSleep and avoid a deadlock timeout delay in this case.) +Note special case when inserting before the end of the queue: if the +process's request does not conflict with any existing lock nor any +waiting request before its insertion point, then go ahead and grant the +lock without waiting. + +When a lock is released, the lock release routine (ProcLockWakeup) scans +the lock object's wait queue. Each waiter is awoken if (a) its request +does not conflict with already-granted locks, and (b) its request does +not conflict with the requests of prior un-wakable waiters. Rule (b) +ensures that conflicting requests are granted in order of arrival. There +are cases where a later waiter must be allowed to go in front of +conflicting earlier waiters to avoid deadlock, but it is not +ProcLockWakeup's responsibility to recognize these cases; instead, the +deadlock detection code will re-order the wait queue when necessary. + +To perform deadlock checking, we use the standard method of viewing the +various processes as nodes in a directed graph (the waits-for graph or +WFG). There is a graph edge leading from process A to process B if A +waits for B, ie, A is waiting for some lock and B holds a conflicting +lock. There is a deadlock condition if and only if the WFG contains a +cycle. We detect cycles by searching outward along waits-for edges to +see if we return to our starting point. There are three possible +outcomes: + +1. All outgoing paths terminate at a running process (which has no +outgoing edge). + +2. A deadlock is detected by looping back to the start point. We +resolve such a deadlock by canceling the start point's lock request and +reporting an error in that transaction, which normally leads to +transaction abort and release of that transaction's held locks. Note +that it's sufficient to cancel one request to remove the cycle; we don't +need to kill all the transactions involved. + +3. Some path(s) loop back to a node other than the start point. This +indicates a deadlock, but one that does not involve our starting +process. 
We ignore this condition on the grounds that resolving such a +deadlock is the responsibility of the processes involved --- killing our +start-point process would not resolve the deadlock. So, cases 1 and 3 +both report "no deadlock". + +Postgres' situation is a little more complex than the standard discussion +of deadlock detection, for two reasons: + +1. A process can be waiting for more than one other process, since there +might be multiple PROCLOCKs of (non-conflicting) lock types that all +conflict with the waiter's request. This creates no real difficulty +however; we simply need to be prepared to trace more than one outgoing +edge. + +2. If a process A is behind a process B in some lock's wait queue, and +their requested locks conflict, then we must say that A waits for B, since +ProcLockWakeup will never awaken A before B. This creates additional +edges in the WFG. We call these "soft" edges, as opposed to the "hard" +edges induced by locks already held. Note that if B already holds any +locks conflicting with A's request, then their relationship is a hard edge +not a soft edge. + +A "soft" block, or wait-priority block, has the same potential for +inducing deadlock as a hard block. However, we may be able to resolve +a soft block without aborting the transactions involved: we can instead +rearrange the order of the wait queue. This rearrangement reverses the +direction of the soft edge between two processes with conflicting requests +whose queue order is reversed. If we can find a rearrangement that +eliminates a cycle without creating new ones, then we can avoid an abort. +Checking for such possible rearrangements is the trickiest part of the +algorithm. + +The workhorse of the deadlock detector is a routine FindLockCycle() which +is given a starting point process (which must be a waiting process). +It recursively scans outward across waits-for edges as discussed above. +If it finds no cycle involving the start point, it returns "false". +(As discussed above, we can ignore cycles not involving the start point.) +When such a cycle is found, FindLockCycle() returns "true", and as it +unwinds it also builds a list of any "soft" edges involved in the cycle. +If the resulting list is empty then there is a hard deadlock and the +configuration cannot succeed. However, if the list is not empty, then +reversing any one of the listed edges through wait-queue rearrangement +will eliminate that cycle. Since such a reversal might create cycles +elsewhere, we may need to try every possibility. Therefore, we need to +be able to invoke FindLockCycle() on hypothetical configurations (wait +orders) as well as the current real order. + +The easiest way to handle this seems to be to have a lookaside table that +shows the proposed new queue order for each wait queue that we are +considering rearranging. This table is checked by FindLockCycle, and it +believes the proposed queue order rather than the real order for each lock +that has an entry in the lookaside table. + +We build a proposed new queue order by doing a "topological sort" of the +existing entries. Each soft edge that we are currently considering +reversing creates a property of the partial order that the topological sort +has to enforce. We must use a sort method that preserves the input +ordering as much as possible, so as not to gratuitously break arrival +order for processes not involved in a deadlock. 
(This is not true of the +tsort method shown in Knuth, for example, but it's easily done by a simple +doubly-nested-loop method that emits the first legal candidate at each +step. Fortunately, we don't need a highly efficient sort algorithm, since +the number of partial order constraints is not likely to be large.) Note +that failure of the topological sort tells us we have conflicting ordering +constraints, and therefore that the last-added soft edge reversal +conflicts with a prior edge reversal. We need to detect this case to +avoid an infinite loop in the case where no possible rearrangement will +work: otherwise, we might try a reversal, find that it still leads to +a cycle, then try to un-reverse the reversal while trying to get rid of +that cycle, etc etc. Topological sort failure tells us the un-reversal +is not a legitimate move in this context. + +So, the basic step in our rearrangement method is to take a list of +soft edges in a cycle (as returned by FindLockCycle()) and successively +try the reversal of each one as a topological-sort constraint added to +whatever constraints we are already considering. We recursively search +through all such sets of constraints to see if any one eliminates all +the deadlock cycles at once. Although this might seem impossibly +inefficient, it shouldn't be a big problem in practice, because there +will normally be very few, and not very large, deadlock cycles --- if +any at all. So the combinatorial inefficiency isn't going to hurt us. +Besides, it's better to spend some time to guarantee that we've checked +all possible escape routes than to abort a transaction when we didn't +really have to. + +Each edge reversal constraint can be viewed as requesting that the waiting +process A be moved to before the blocking process B in the wait queue they +are both in. This action will reverse the desired soft edge, as well as +any other soft edges between A and other processes it is advanced over. +No other edges will be affected (note this is actually a constraint on our +topological sort method to not re-order the queue more than necessary.) +Therefore, we can be sure we have not created any new deadlock cycles if +neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle. Given +the above-defined behavior of FindLockCycle, each of these searches is +necessary as well as sufficient, since FindLockCycle starting at the +original start point will not complain about cycles that include A or B +but not the original start point. + +In short then, a proposed rearrangement of the wait queue(s) is determined +by one or more broken soft edges A->B, fully specified by the output of +topological sorts of each wait queue involved, and then tested by invoking +FindLockCycle() starting at the original start point as well as each of +the mentioned processes (A's and B's). If none of the tests detect a +cycle, then we have a valid configuration and can implement it by +reordering the wait queues per the sort outputs (and then applying +ProcLockWakeup on each reordered queue, in case a waiter has become wakable). +If any test detects a soft cycle, we can try to resolve it by adding each +soft link in that cycle, in turn, to the proposed rearrangement list. +This is repeated recursively until we either find a workable rearrangement +or determine that none exists. In the latter case, the outer level +resolves the deadlock by aborting the original start-point transaction. 
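
To make the "doubly-nested-loop method that emits the first legal candidate
at each step" a bit more concrete, here is a rough, self-contained sketch of
such an order-preserving topological sort.  It is only an illustration, not
the implementation in deadlock.c: the array representation of the wait queue
and the Constraint type are invented for the example.

    #include <stdbool.h>

    #define EXAMPLE_MAX_QUEUE 16            /* illustrative bound */

    typedef struct Constraint
    {
        int first;                          /* this proc must appear ... */
        int second;                         /* ... before this one */
    } Constraint;

    /*
     * Order-preserving topological sort: at each step, emit the earliest
     * not-yet-emitted queue entry that no constraint forces behind a
     * still-unemitted entry.  Returns false if the constraints are
     * contradictory (no legal candidate exists at some step).
     */
    static bool
    TopoSortQueue(const int *queue, int n,
                  const Constraint *constraints, int nconstraints,
                  int *output)
    {
        bool emitted[EXAMPLE_MAX_QUEUE] = {false};

        for (int slot = 0; slot < n; slot++)
        {
            int chosen = -1;

            for (int i = 0; i < n && chosen < 0; i++)
            {
                bool legal = true;

                if (emitted[i])
                    continue;
                for (int c = 0; c < nconstraints && legal; c++)
                {
                    if (constraints[c].second != queue[i])
                        continue;
                    /* illegal if the required predecessor is still unemitted */
                    for (int j = 0; j < n; j++)
                    {
                        if (j != i && !emitted[j] &&
                            queue[j] == constraints[c].first)
                        {
                            legal = false;
                            break;
                        }
                    }
                }
                if (legal)
                    chosen = i;
            }

            if (chosen < 0)
                return false;               /* conflicting ordering constraints */
            emitted[chosen] = true;
            output[slot] = queue[chosen];
        }
        return true;
    }

With this sketch, sorting the queue A,B,C under the constraints "C before A"
and "B before C" emits B, then C, then A -- the only order satisfying both
constraints -- while a queue with no applicable constraints comes out in its
original arrival order.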
+ +The particular order in which rearrangements are tried depends on the +order FindLockCycle() happens to scan in, so if there are multiple +workable rearrangements of the wait queues, then it is unspecified which +one will be chosen. What's more important is that we guarantee to try +every queue rearrangement that could lead to success. (For example, +if we have A before B before C and the needed order constraints are +C before A and B before C, we would first discover that A before C +doesn't work and try the rearrangement C before A before B. This would +eventually lead to the discovery of the additional constraint B before C.) + +Got that? + +Miscellaneous Notes +------------------- + +1. It is easily proven that no deadlock will be missed due to our +asynchronous invocation of deadlock checking. A deadlock cycle in the WFG +is formed when the last edge in the cycle is added; therefore the last +process in the cycle to wait (the one from which that edge is outgoing) is +certain to detect and resolve the cycle when it later runs CheckDeadLock. +This holds even if that edge addition created multiple cycles; the process +may indeed abort without ever noticing those additional cycles, but we +don't particularly care. The only other possible creation of deadlocks is +during deadlock resolution's rearrangement of wait queues, and we already +saw that that algorithm will prove that it creates no new deadlocks before +it attempts to actually execute any rearrangement. + +2. It is not certain that a deadlock will be resolved by aborting the +last-to-wait process. If earlier waiters in the cycle have not yet run +CheckDeadLock, then the first one to do so will be the victim. + +3. No live (wakable) process can be missed by ProcLockWakeup, since it +examines every member of the wait queue (this was not true in the 7.0 +implementation, BTW). Therefore, if ProcLockWakeup is always invoked +after a lock is released or a wait queue is rearranged, there can be no +failure to wake a wakable process. One should also note that +LockErrorCleanup (abort a waiter due to outside factors) must run +ProcLockWakeup, in case the canceled waiter was soft-blocking other +waiters. + +4. We can minimize excess rearrangement-trial work by being careful to +scan the wait queue from the front when looking for soft edges. For +example, if we have queue order A,B,C and C has deadlock conflicts with +both A and B, we want to generate the "C before A" constraint first, +rather than wasting time with "C before B", which won't move C far +enough up. So we look for soft edges outgoing from C starting at the +front of the wait queue. + +5. The working data structures needed by the deadlock detection code can +be limited to numbers of entries computed from MaxBackends. Therefore, +we can allocate the worst-case space needed during backend startup. This +seems a safer approach than trying to allocate workspace on the fly; we +don't want to risk having the deadlock detector run out of memory, else +we really have no guarantees at all that deadlock will be detected. + +6. We abuse the deadlock detector to implement autovacuum cancellation. +When we run the detector and we find that there's an autovacuum worker +involved in the waits-for graph, we store a pointer to its PGPROC, and +return a special return code (unless a hard deadlock has been detected). +The caller can then send a cancellation signal. This implements the +principle that autovacuum has a low locking priority (eg it must not block +DDL on the table). 
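
Before moving on to group locking, here is a rough sketch of the kind of
waits-for cycle search described in the preceding sections.  Again this is
only an illustration: the adjacency-matrix representation is invented for the
example, and the real FindLockCycle() walks the shared lock tables,
distinguishes hard from soft edges, and records the soft edges it crosses.

    #include <stdbool.h>

    #define EXAMPLE_MAX_PROCS 8             /* illustrative bound */

    /* waitsFor[a][b] is true if process a waits for process b */
    static bool waitsFor[EXAMPLE_MAX_PROCS][EXAMPLE_MAX_PROCS];
    static bool onPath[EXAMPLE_MAX_PROCS];  /* nodes on the current search path */

    /*
     * Report whether some chain of waits-for edges leading out of 'cur'
     * returns to 'start'; call as FindCycleFrom(p, p).  Cycles that do not
     * pass through 'start' are deliberately not reported, matching the rule
     * described earlier; 'onPath' merely keeps the search from looping
     * forever inside such a cycle.
     */
    static bool
    FindCycleFrom(int start, int cur)
    {
        onPath[cur] = true;
        for (int next = 0; next < EXAMPLE_MAX_PROCS; next++)
        {
            if (!waitsFor[cur][next])
                continue;
            if (next == start ||
                (!onPath[next] && FindCycleFrom(start, next)))
            {
                onPath[cur] = false;
                return true;                /* found a path back to the start point */
            }
        }
        onPath[cur] = false;
        return false;
    }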
+ +Group Locking +------------- + +As if all of that weren't already complicated enough, PostgreSQL now supports +parallelism (see src/backend/access/transam/README.parallel), which means that +we might need to resolve deadlocks that occur between gangs of related +processes rather than individual processes. This doesn't change the basic +deadlock detection algorithm very much, but it makes the bookkeeping more +complicated. + +We choose to regard locks held by processes in the same parallel group as +non-conflicting with the exception of relation extension lock. This means that +two processes in a parallel group can hold a self-exclusive lock on the same +relation at the same time, or one process can acquire an AccessShareLock while +the other already holds AccessExclusiveLock. This might seem dangerous and +could be in some cases (more on that below), but if we didn't do this then +parallel query would be extremely prone to self-deadlock. For example, a +parallel query against a relation on which the leader already had +AccessExclusiveLock would hang, because the workers would try to lock the same +relation and be blocked by the leader; yet the leader can't finish until it +receives completion indications from all workers. An undetected deadlock +results. This is far from the only scenario where such a problem happens. The +same thing will occur if the leader holds only AccessShareLock, the worker +seeks AccessShareLock, but between the time the leader attempts to acquire the +lock and the time the worker attempts to acquire it, some other process queues +up waiting for an AccessExclusiveLock. In this case, too, an indefinite hang +results. + +It might seem that we could predict which locks the workers will attempt to +acquire and ensure before going parallel that those locks would be acquired +successfully. But this is very difficult to make work in a general way. For +example, a parallel worker's portion of the query plan could involve an +SQL-callable function which generates a query dynamically, and that query +might happen to hit a table on which the leader happens to hold +AccessExclusiveLock. By imposing enough restrictions on what workers can do, +we could eventually create a situation where their behavior can be adequately +restricted, but these restrictions would be fairly onerous, and even then, the +system required to decide whether the workers will succeed at acquiring the +necessary locks would be complex and possibly buggy. + +So, instead, we take the approach of deciding that locks within a lock group +do not conflict. This eliminates the possibility of an undetected deadlock, +but also opens up some problem cases: if the leader and worker try to do some +operation at the same time which would ordinarily be prevented by the +heavyweight lock mechanism, undefined behavior might result. In practice, the +dangers are modest. The leader and worker share the same transaction, +snapshot, and combo CID hash, and neither can perform any DDL or, indeed, +write any data at all. Thus, for either to read a table locked exclusively by +the other is safe enough. Problems would occur if the leader initiated +parallelism from a point in the code at which it had some backend-private +state that made table access from another process unsafe, for example after +calling SetReindexProcessing and before calling ResetReindexProcessing, +catastrophe could ensue, because the worker won't have that state. 
Similarly, +problems could occur with certain kinds of non-relation locks, such as +GIN page locks. It's no safer for two related processes to perform GIN clean +up at the same time than for unrelated processes to do the same. +However, since parallel mode is strictly read-only at present, neither this +nor most of the similar cases can arise at present. To allow parallel writes, +we'll either need to (1) further enhance the deadlock detector to handle those +types of locks in a different way than other types; or (2) have parallel +workers use some other mutual exclusion method for such cases. + +Group locking adds three new members to each PGPROC: lockGroupLeader, +lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for +processes not involved in parallel query. When a process wants to cooperate +with parallel workers, it becomes a lock group leader, which means setting +this field to point to its own PGPROC. When a parallel worker starts up, it +points this field at the leader. The lockGroupMembers field is only used in +the leader; it is a list of the member PGPROCs of the lock group (the leader +and all workers). The lockGroupLink field is the list link for this list. + +All three of these fields are considered to be protected by a lock manager +partition lock. The partition lock that protects these fields within a given +lock group is chosen by taking the leader's pgprocno modulo the number of lock +manager partitions. This unusual arrangement has a major advantage: the +deadlock detector can count on the fact that no lockGroupLeader field can +change while the deadlock detector is running, because it knows that it holds +all the lock manager locks. Also, holding this single lock allows safe +manipulation of the lockGroupMembers list for the lock group. + +We need an additional interlock when setting these fields, because a newly +started parallel worker has to try to join the leader's lock group, but it +has no guarantee that the group leader is still alive by the time it gets +started. We try to ensure that the parallel leader dies after all workers +in normal cases, but also that the system could survive relatively intact +if that somehow fails to happen. This is one of the precautions against +such a scenario: the leader relays its PGPROC and also its PID to the +worker, and the worker fails to join the lock group unless the given PGPROC +still has the same PID and is still a lock group leader. We assume that +PIDs are not recycled quickly enough for this interlock to fail. + + +User Locks (Advisory Locks) +--------------------------- + +User locks are handled totally on the application side as long term +cooperative locks which may extend beyond the normal transaction boundaries. +Their purpose is to indicate to an application that someone is `working' +on an item. So it is possible to put a user lock on a tuple's oid, +retrieve the tuple, work on it for an hour and then update it and remove +the lock. While the lock is active other clients can still read and write +the tuple but they can be aware that it has been locked at the application +level by someone. + +User locks and normal locks are completely orthogonal and they don't +interfere with each other. + +User locks can be acquired either at session level or transaction level. +A session-level lock request is not automatically released at transaction +end, but must be explicitly released by the application. (However, any +remaining locks are always released at session end.) 
Transaction-level
+user lock requests behave the same as normal lock requests, in that they
+are released at transaction end and do not need explicit unlocking.
+
+Locking during Hot Standby
+--------------------------
+
+The Startup process is the only backend that can make changes during
+recovery; all other backends are read only. As a result, the Startup
+process does not acquire locks on relations or objects except when the lock
+level is AccessExclusiveLock.
+
+Regular backends are only allowed to take locks on relations or objects
+at RowExclusiveLock or lower. This ensures that they do not conflict with
+each other or with the Startup process, unless AccessExclusiveLocks are
+requested by the Startup process.
+
+Deadlocks involving AccessExclusiveLocks are not possible, so we need
+not be concerned that a user-initiated deadlock can prevent recovery from
+progressing.
+
+AccessExclusiveLocks on the primary node generate WAL records
+that are then applied by the Startup process. Locks are released at end
+of transaction just as they are in normal processing. These locks are
+held by the Startup process, acting as a proxy for the backends that
+originally acquired these locks. Again, these locks cannot conflict with
+one another, so the Startup process cannot deadlock itself either.
+
+Although deadlock is not possible, a regular backend's weak lock can
+prevent the Startup process from making progress in applying WAL, which is
+usually not something that should be tolerated for very long. Mechanisms
+exist to forcibly cancel a regular backend's query if it blocks the
+Startup process for too long.
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
new file mode 100644
index 0000000..50d2ecc
--- /dev/null
+++ b/src/backend/storage/lmgr/README-SSI
@@ -0,0 +1,646 @@
+src/backend/storage/lmgr/README-SSI
+
+Serializable Snapshot Isolation (SSI) and Predicate Locking
+===========================================================
+
+This code is in the lmgr directory because about 90% of it is an
+implementation of predicate locking, which is required for SSI,
+rather than being directly related to SSI itself. When another use
+for predicate locking justifies the effort to tease these two things
+apart, this README file should probably be split.
+
+
+Credits
+-------
+
+This feature was developed by Kevin Grittner and Dan R. K. Ports,
+with review and suggestions from Joe Conway, Heikki Linnakangas, and
+Jeff Davis. It is based on work published in these papers:
+
+	Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+	Serializable isolation for snapshot databases.
+	In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+	international conference on Management of data,
+	pages 729-738, New York, NY, USA. ACM.
+	http://doi.acm.org/10.1145/1376616.1376690
+
+	Michael James Cahill. 2009.
+	Serializable Isolation for Snapshot Databases.
+	Sydney Digital Theses.
+	University of Sydney, School of Information Technologies.
+	http://hdl.handle.net/2123/5353
+
+
+Overview
+--------
+
+With true serializable transactions, if you can show that your
+transaction will do the right thing if there are no concurrent
+transactions, it will do the right thing in any mix of serializable
+transactions or be rolled back with a serialization failure. This
+feature has been implemented in PostgreSQL using SSI.
+ + +Serializable and Snapshot Transaction Isolation Levels +------------------------------------------------------ + +Serializable transaction isolation is attractive for shops with +active development by many programmers against a complex schema +because it guarantees data integrity with very little staff time -- +if a transaction can be shown to always do the right thing when it is +run alone (before or after any other transaction), it will always do +the right thing in any mix of concurrent serializable transactions. +Where conflicts with other transactions would result in an +inconsistent state within the database or an inconsistent view of +the data, a serializable transaction will block or roll back to +prevent the anomaly. The SQL standard provides a specific SQLSTATE +for errors generated when a transaction rolls back for this reason, +so that transactions can be retried automatically. + +Before version 9.1, PostgreSQL did not support a full serializable +isolation level. A request for serializable transaction isolation +actually provided snapshot isolation. This has well known anomalies +which can allow data corruption or inconsistent views of the data +during concurrent transactions; although these anomalies only occur +when certain patterns of read-write dependencies exist within a set +of concurrent transactions. Where these patterns exist, the anomalies +can be prevented by introducing conflicts through explicitly +programmed locks or otherwise unnecessary writes to the database. +Snapshot isolation is popular because performance is better than +serializable isolation and the integrity guarantees which it does +provide allow anomalies to be avoided or managed with reasonable +effort in many environments. + + +Serializable Isolation Implementation Strategies +------------------------------------------------ + +Techniques for implementing full serializable isolation have been +published and in use in many database products for decades. The +primary technique which has been used is Strict Two-Phase Locking +(S2PL), which operates by blocking writes against data which has been +read by concurrent transactions and blocking any access (read or +write) against data which has been written by concurrent +transactions. A cycle in a graph of blocking indicates a deadlock, +requiring a rollback. Blocking and deadlocks under S2PL in high +contention workloads can be debilitating, crippling throughput and +response time. + +A new technique for implementing full serializable isolation in an +MVCC database appears in the literature beginning in 2008. This +technique, known as Serializable Snapshot Isolation (SSI) has many of +the advantages of snapshot isolation. In particular, reads don't +block anything and writes don't block reads. Essentially, it runs +snapshot isolation but monitors the read-write conflicts between +transactions to identify dangerous structures in the transaction +graph which indicate that a set of concurrent transactions might +produce an anomaly, and rolls back transactions to ensure that no +anomalies occur. It will produce some false positives (where a +transaction is rolled back even though there would not have been an +anomaly), but will never let an anomaly occur. In the two known +prototype implementations, performance for many workloads (even with +the need to restart transactions which are rolled back) is very close +to snapshot isolation and generally far better than an S2PL +implementation. 
+ + +Apparent Serial Order of Execution +---------------------------------- + +One way to understand when snapshot anomalies can occur, and to +visualize the difference between the serializable implementations +described above, is to consider that among transactions executing at +the serializable transaction isolation level, the results are +required to be consistent with some serial (one-at-a-time) execution +of the transactions [1]. How is that order determined in each? + +In S2PL, each transaction locks any data it accesses. It holds the +locks until committing, preventing other transactions from making +conflicting accesses to the same data in the interim. Some +transactions may have to be rolled back to prevent deadlock. But +successful transactions can always be viewed as having occurred +sequentially, in the order they committed. + +With snapshot isolation, reads never block writes, nor vice versa, so +more concurrency is possible. The order in which transactions appear +to have executed is determined by something more subtle than in S2PL: +read/write dependencies. If a transaction reads data, it appears to +execute after the transaction that wrote the data it is reading. +Similarly, if it updates data, it appears to execute after the +transaction that wrote the previous version. These dependencies, which +we call "wr-dependencies" and "ww-dependencies", are consistent with +the commit order, because the first transaction must have committed +before the second starts. However, there can also be dependencies +between two *concurrent* transactions, i.e. where one was running when +the other acquired its snapshot. These "rw-conflicts" occur when one +transaction attempts to read data which is not visible to it because +the transaction which wrote it (or will later write it) is +concurrent. The reading transaction appears to have executed first, +regardless of the actual sequence of transaction starts or commits, +because it sees a database state prior to that in which the other +transaction leaves it. + +Anomalies occur when a cycle is created in the graph of dependencies: +when a dependency or series of dependencies causes transaction A to +appear to have executed before transaction B, but another series of +dependencies causes B to appear before A. If that's the case, then +the results can't be consistent with any serial execution of the +transactions. + + +SSI Algorithm +------------- + +As of 9.1, serializable transactions in PostgreSQL are implemented using +Serializable Snapshot Isolation (SSI), based on the work of Cahill +et al. Fundamentally, this allows snapshot isolation to run as it +previously did, while monitoring for conditions which could create a +serialization anomaly. + +SSI is based on the observation [2] that each snapshot isolation +anomaly corresponds to a cycle that contains a "dangerous structure" +of two adjacent rw-conflict edges: + + Tin ------> Tpivot ------> Tout + rw rw + +SSI works by watching for this dangerous structure, and rolling +back a transaction when needed to prevent any anomaly. This means it +only needs to track rw-conflicts between concurrent transactions, not +wr- and ww-dependencies. It also means there is a risk of false +positives, because not every dangerous structure is embedded in an +actual cycle. The number of false positives is low in practice, so +this represents an acceptable tradeoff for keeping the detection +overhead low. 
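+
+A minimal sketch of the check this implies is shown below. It is not the
+real code (predicate.c tracks considerably more state, and the structure
+and field names here are invented); it only shows the shape of the test:
+a transaction is interesting exactly when it has both an inbound and an
+outbound rw-conflict, i.e. when it could be the Tpivot above. The commit
+ordering refinement corresponds to the first optimization listed just
+below:
+
+    #include <stdbool.h>
+
+    typedef struct SketchSerializableXact
+    {
+        bool    has_rw_conflict_in;     /* some Tin read what we wrote */
+        bool    has_rw_conflict_out;    /* we read what some Tout wrote */
+        bool    out_committed_first;    /* Tout committed before Tin and us */
+    } SketchSerializableXact;
+
+    /* Does continuing this transaction risk a serialization anomaly? */
+    bool
+    sketch_is_dangerous_pivot(const SketchSerializableXact *xact)
+    {
+        if (!xact->has_rw_conflict_in || !xact->has_rw_conflict_out)
+            return false;       /* at most one rw edge: not a pivot */
+
+        /* only dangerous if Tout committed first (see optimization below) */
+        return xact->out_committed_first;
+    }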
+ +The PostgreSQL implementation uses two additional optimizations: + +* Tout must commit before any other transaction in the cycle + (see proof of Theorem 2.1 of [2]). We only roll back a transaction + if Tout commits before Tpivot and Tin. + +* if Tin is read-only, there can only be an anomaly if Tout committed + before Tin takes its snapshot. This optimization is an original + one. Proof: + + - Because there is a cycle, there must be some transaction T0 that + precedes Tin in the cycle. (T0 might be the same as Tout.) + + - The edge between T0 and Tin can't be a rw-conflict or ww-dependency, + because Tin was read-only, so it must be a wr-dependency. + Those can only occur if T0 committed before Tin took its snapshot, + else Tin would have ignored T0's output. + + - Because Tout must commit before any other transaction in the + cycle, it must commit before T0 commits -- and thus before Tin + starts. + + +PostgreSQL Implementation +------------------------- + + * Since this technique is based on Snapshot Isolation (SI), those +areas in PostgreSQL which don't use SI can't be brought under SSI. +This includes system tables, temporary tables, sequences, hint bit +rewrites, etc. SSI can not eliminate existing anomalies in these +areas. + + * Any transaction which is run at a transaction isolation level +other than SERIALIZABLE will not be affected by SSI. If you want to +enforce business rules through SSI, all transactions should be run at +the SERIALIZABLE transaction isolation level, and that should +probably be set as the default. + + * If all transactions are run at the SERIALIZABLE transaction +isolation level, business rules can be enforced in triggers or +application code without ever having a need to acquire an explicit +lock or to use SELECT FOR SHARE or SELECT FOR UPDATE. + + * Those who want to continue to use snapshot isolation without +the additional protections of SSI (and the associated costs of +enforcing those protections), can use the REPEATABLE READ transaction +isolation level. This level retains its legacy behavior, which +is identical to the old SERIALIZABLE implementation and fully +consistent with the standard's requirements for the REPEATABLE READ +transaction isolation level. + + * Performance under this SSI implementation will be significantly +improved if transactions which don't modify permanent tables are +declared to be READ ONLY before they begin reading data. + + * Performance under SSI will tend to degrade more rapidly with a +large number of active database transactions than under less strict +isolation levels. Limiting the number of active transactions through +use of a connection pool or similar techniques may be necessary to +maintain good performance. + + * Any transaction which must be rolled back to prevent +serialization anomalies will fail with SQLSTATE 40001, which has a +standard meaning of "serialization failure". + + * This SSI implementation makes an effort to choose the +transaction to be canceled such that an immediate retry of the +transaction will not fail due to conflicts with exactly the same +transactions. Pursuant to this goal, no transaction is canceled +until one of the other transactions in the set of conflicts which +could generate an anomaly has successfully committed. This is +conceptually similar to how write conflicts are handled. To fully +implement this guarantee there needs to be a way to roll back the +active transaction for another process with a serialization failure +SQLSTATE, even if it is "idle in transaction". 
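+
+The last two points above are what make automatic retry practical. The
+following is a sketch, not part of the server code, of the client-side
+retry loop they enable, written against libpq; error handling and the
+transaction body are placeholders:
+
+    #include <stdbool.h>
+    #include <string.h>
+    #include <libpq-fe.h>
+
+    /* Run "sql" in a SERIALIZABLE transaction, retrying on SQLSTATE 40001. */
+    int
+    run_serializable_with_retry(PGconn *conn, const char *sql)
+    {
+        for (;;)
+        {
+            PGresult   *res;
+            const char *sqlstate;
+            bool        ok;
+            bool        retry = false;
+
+            PQclear(PQexec(conn, "BEGIN ISOLATION LEVEL SERIALIZABLE"));
+
+            res = PQexec(conn, sql);
+            ok = (PQresultStatus(res) == PGRES_COMMAND_OK ||
+                  PQresultStatus(res) == PGRES_TUPLES_OK);
+            sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+            if (!ok && sqlstate && strcmp(sqlstate, "40001") == 0)
+                retry = true;
+            PQclear(res);
+
+            if (ok)
+            {
+                /* under SSI, the COMMIT itself can fail with 40001 */
+                res = PQexec(conn, "COMMIT");
+                ok = (PQresultStatus(res) == PGRES_COMMAND_OK);
+                sqlstate = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+                if (!ok && sqlstate && strcmp(sqlstate, "40001") == 0)
+                    retry = true;
+                PQclear(res);
+                if (ok)
+                    return 0;           /* committed successfully */
+            }
+
+            PQclear(PQexec(conn, "ROLLBACK"));  /* end any aborted xact */
+            if (!retry)
+                return -1;              /* a non-serialization error */
+            /* otherwise loop and run the transaction again */
+        }
+    }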
+ + +Predicate Locking +----------------- + +Both S2PL and SSI require some form of predicate locking to handle +situations where reads conflict with later inserts or with later +updates which move data into the selected range. PostgreSQL didn't +already have predicate locking, so it needed to be added to support +full serializable transactions under either strategy. Practical +implementations of predicate locking generally involve acquiring +locks against data as it is accessed, using multiple granularities +(tuple, page, table, etc.) with escalation as needed to keep the lock +count to a number which can be tracked within RAM structures. This +approach was used in PostgreSQL. Coarse granularities can cause some +false positive indications of conflict. The number of false positives +can be influenced by plan choice. + + +Implementation overview +----------------------- + +New RAM structures, inspired by those used to track traditional locks +in PostgreSQL, but tailored to the needs of SIREAD predicate locking, +are used. These refer to physical objects actually accessed in the +course of executing the query, to model the predicates through +inference. Anyone interested in this subject should review the +Hellerstein, Stonebraker and Hamilton paper [3], along with the +locking papers referenced from that and the Cahill papers. + +Because the SIREAD locks don't block, traditional locking techniques +have to be modified. Intent locking (locking higher level objects +before locking lower level objects) doesn't work with non-blocking +"locks" (which are, in some respects, more like flags than locks). + +A configurable amount of shared memory is reserved at postmaster +start-up to track predicate locks. This size cannot be changed +without a restart. + +To prevent resource exhaustion, multiple fine-grained locks may +be promoted to a single coarser-grained lock as needed. + +An attempt to acquire an SIREAD lock on a tuple when the same +transaction already holds an SIREAD lock on the page or the relation +will be ignored. Likewise, an attempt to lock a page when the +relation is locked will be ignored, and the acquisition of a coarser +lock will result in the automatic release of all finer-grained locks +it covers. + + +Heap locking +------------ + +Predicate locks will be acquired for the heap based on the following: + + * For a table scan, the entire relation will be locked. + + * Each tuple read which is visible to the reading transaction +will be locked, whether or not it meets selection criteria; except +that there is no need to acquire an SIREAD lock on a tuple when the +transaction already holds a write lock on any tuple representing the +row, since a rw-conflict would also create a ww-dependency which +has more aggressive enforcement and thus will prevent any anomaly. + + * Modifying a heap tuple creates a rw-conflict with any transaction +that holds a SIREAD lock on that tuple, or on the page or relation +that contains it. + + * Inserting a new tuple creates a rw-conflict with any transaction +holding a SIREAD lock on the entire relation. It doesn't conflict with +page-level locks, because page-level locks are only used to aggregate +tuple locks. Unlike index page locks, they don't lock "gaps" on the page. 
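+
+The promotion described under "Implementation overview" above can be
+pictured with the following self-contained sketch. The thresholds, names,
+and bookkeeping are all invented for illustration; the real logic in
+predicate.c uses configurable limits and shared hash tables:
+
+    #include <stdbool.h>
+
+    #define SKETCH_MAX_TUPLE_LOCKS_PER_PAGE   16
+    #define SKETCH_MAX_PAGE_LOCKS_PER_REL      8
+
+    typedef struct SketchLocalLockCounts
+    {
+        bool    rel_locked;     /* whole-relation SIREAD lock held? */
+        int     pages_locked;   /* page locks held in this relation */
+        int     tuples_on_page; /* tuple locks held on the current page */
+    } SketchLocalLockCounts;
+
+    /*
+     * Called before acquiring one more tuple-level SIREAD lock.  Returns
+     * the granularity actually locked, promoting (and conceptually
+     * releasing the finer locks covered) when the counts get too high,
+     * or "none" when a coarser lock already covers the tuple.
+     */
+    const char *
+    sketch_choose_granularity(SketchLocalLockCounts *c)
+    {
+        if (c->rel_locked)
+            return "none";      /* relation lock already covers the tuple */
+
+        if (c->pages_locked + 1 > SKETCH_MAX_PAGE_LOCKS_PER_REL)
+        {
+            c->rel_locked = true;   /* promote: one relation lock for all */
+            c->pages_locked = 0;
+            c->tuples_on_page = 0;
+            return "relation";
+        }
+
+        if (c->tuples_on_page + 1 > SKETCH_MAX_TUPLE_LOCKS_PER_PAGE)
+        {
+            c->pages_locked++;      /* promote: one page lock for tuples */
+            c->tuples_on_page = 0;
+            return "page";
+        }
+
+        c->tuples_on_page++;
+        return "tuple";
+    }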
+ + +Index AM implementations +------------------------ + +Since predicate locks only exist to detect writes which conflict with +earlier reads, and heap tuple locks are acquired to cover all heap +tuples actually read, including those read through indexes, the index +tuples which were actually scanned are not of interest in themselves; +we only care about their "new neighbors" -- later inserts into the +index which would have been included in the scan had they existed at +the time. Conceptually, we want to lock the gaps between and +surrounding index entries within the scanned range. + +Correctness requires that any insert into an index generates a +rw-conflict with a concurrent serializable transaction if, after that +insert, re-execution of any index scan of the other transaction would +access the heap for a row not accessed during the previous execution. +Note that a non-HOT update which expires an old index entry covered +by the scan and adds a new entry for the modified row's new tuple +need not generate a conflict, although an update which "moves" a row +into the scan must generate a conflict. While correctness allows +false positives, they should be minimized for performance reasons. + +Several optimizations are possible, though not all are implemented yet: + + * An index scan which is just finding the right position for an +index insertion or deletion need not acquire a predicate lock. + + * An index scan which is comparing for equality on the entire key +for a unique index need not acquire a predicate lock as long as a key +is found corresponding to a visible tuple which has not been modified +by another transaction -- there are no "between or around" gaps to +cover. + + * As long as built-in foreign key enforcement continues to use +its current "special tricks" to deal with MVCC issues, predicate +locks should not be needed for scans done by enforcement code. + + * If a search determines that no rows can be found regardless of +index contents because the search conditions are contradictory (e.g., +x = 1 AND x = 2), then no predicate lock is needed. + +Other index AM implementation considerations: + + * For an index AM that doesn't have support for predicate locking, +we just acquire a predicate lock on the whole index for any search. + + * B-tree index searches acquire predicate locks only on the +index *leaf* pages needed to lock the appropriate index range. If, +however, a search discovers that no root page has yet been created, a +predicate lock on the index relation is required. + + * Like a B-tree, GIN searches acquire predicate locks only on the +leaf pages of entry tree. When performing an equality scan, and an +entry has a posting tree, the posting tree root is locked instead, to +lock only that key value. However, fastupdate=on postpones the +insertion of tuples into index structure by temporarily storing them +into pending list. That makes us unable to detect r-w conflicts using +page-level locks. To cope with that, insertions to the pending list +conflict with all scans. + + * GiST searches can determine that there are no matches at any +level of the index, so we acquire predicate lock at each index +level during a GiST search. An index insert at the leaf level can +then be trusted to ripple up to all levels and locations where +conflicting predicate locks may exist. In case there is a page split, +we need to copy predicate lock from the original page to all the new +pages. + + * Hash index searches acquire predicate locks on the primary +page of a bucket. 
It acquires a lock on both the old and new buckets +for scans that happen concurrently with page splits. During a bucket +split, a predicate lock is copied from the primary page of an old +bucket to the primary page of a new bucket. + + * The effects of page splits, overflows, consolidations, and +removals must be carefully reviewed to ensure that predicate locks +aren't "lost" during those operations, or kept with pages which could +get re-used for different parts of the index. + + +Innovations +----------- + +The PostgreSQL implementation of Serializable Snapshot Isolation +differs from what is described in the cited papers for several +reasons: + + 1. PostgreSQL didn't have any existing predicate locking. It had +to be added from scratch. + + 2. The existing in-memory lock structures were not suitable for +tracking SIREAD locks. + * In PostgreSQL, tuple level locks are not held in RAM for +any length of time; lock information is written to the tuples +involved in the transactions. + * In PostgreSQL, existing lock structures have pointers to +memory which is related to a session. SIREAD locks need to persist +past the end of the originating transaction and even the session +which ran it. + * PostgreSQL needs to be able to tolerate a large number of +transactions executing while one long-running transaction stays open +-- the in-RAM techniques discussed in the papers wouldn't support +that. + + 3. Unlike the database products used for the prototypes described +in the papers, PostgreSQL didn't already have a true serializable +isolation level distinct from snapshot isolation. + + 4. PostgreSQL supports subtransactions -- an issue not mentioned +in the papers. + + 5. PostgreSQL doesn't assign a transaction number to a database +transaction until and unless necessary (normally, when the transaction +attempts to modify data). + + 6. PostgreSQL has pluggable data types with user-definable +operators, as well as pluggable index types, not all of which are +based around data types which support ordering. + + 7. Some possible optimizations became apparent during development +and testing. + +Differences from the implementation described in the papers are +listed below. + + * New structures needed to be created in shared memory to track +the proper information for serializable transactions and their SIREAD +locks. + + * Because PostgreSQL does not have the same concept of an "oldest +transaction ID" for all serializable transactions as assumed in the +Cahill thesis, we track the oldest snapshot xmin among serializable +transactions, and a count of how many active transactions use that +xmin. When the count hits zero we find the new oldest xmin and run a +clean-up based on that. + + * Because reads in a subtransaction may cause that subtransaction +to roll back, thereby affecting what is written by the top level +transaction, predicate locks must survive a subtransaction rollback. +As a consequence, all xid usage in SSI, including predicate locking, +is based on the top level xid. When looking at an xid that comes +from a tuple's xmin or xmax, for example, we always call +SubTransGetTopmostTransaction() before doing much else with it. + + * PostgreSQL does not use "update in place" with a rollback log +for its MVCC implementation. Where possible it uses "HOT" updates on +the same page (if there is room and no indexed value is changed). +For non-HOT updates the old tuple is expired in place and a new tuple +is inserted at a new location. 
Because of this difference, a tuple +lock in PostgreSQL doesn't automatically lock any other versions of a +row. We don't try to copy or expand a tuple lock to any other +versions of the row, based on the following proof that any additional +serialization failures we would get from that would be false +positives: + + o If transaction T1 reads a row version (thus acquiring a +predicate lock on it) and a second transaction T2 updates that row +version (thus creating a rw-conflict graph edge from T1 to T2), must a +third transaction T3 which re-updates the new version of the row also +have a rw-conflict in from T1 to prevent anomalies? In other words, +does it matter whether we recognize the edge T1 -> T3? + + o If T1 has a conflict in, it certainly doesn't. Adding the +edge T1 -> T3 would create a dangerous structure, but we already had +one from the edge T1 -> T2, so we would have aborted something anyway. +(T2 has already committed, else T3 could not have updated its output; +but we would have aborted either T1 or T1's predecessor(s). Hence +no cycle involving T1 and T3 can survive.) + + o Now let's consider the case where T1 doesn't have a +rw-conflict in. If that's the case, for this edge T1 -> T3 to make a +difference, T3 must have a rw-conflict out that induces a cycle in the +dependency graph, i.e. a conflict out to some transaction preceding T1 +in the graph. (A conflict out to T1 itself would be problematic too, +but that would mean T1 has a conflict in, the case we already +eliminated.) + + o So now we're trying to figure out if there can be an +rw-conflict edge T3 -> T0, where T0 is some transaction that precedes +T1. For T0 to precede T1, there has to be some edge, or sequence of +edges, from T0 to T1. At least the last edge has to be a wr-dependency +or ww-dependency rather than a rw-conflict, because T1 doesn't have a +rw-conflict in. And that gives us enough information about the order +of transactions to see that T3 can't have a rw-conflict to T0: + - T0 committed before T1 started (the wr/ww-dependency implies this) + - T1 started before T2 committed (the T1->T2 rw-conflict implies this) + - T2 committed before T3 started (otherwise, T3 would get aborted + because of an update conflict) + + o That means T0 committed before T3 started, and therefore +there can't be a rw-conflict from T3 to T0. + + o So in all cases, we don't need the T1 -> T3 edge to +recognize cycles. Therefore it's not necessary for T1's SIREAD lock +on the original tuple version to cover later versions as well. + + * Predicate locking in PostgreSQL starts at the tuple level +when possible. Multiple fine-grained locks are promoted to a single +coarser-granularity lock as needed to avoid resource exhaustion. The +amount of memory used for these structures is configurable, to balance +RAM usage against SIREAD lock granularity. + + * Each backend keeps a process-local table of the locks it holds. +To support granularity promotion decisions with low CPU and locking +overhead, this table also includes the coarser covering locks and the +number of finer-granularity locks they cover. + + * Conflicts are identified by looking for predicate locks +when tuples are written, and by looking at the MVCC information when +tuples are read. There is no matching between two RAM-based locks. 
+ + * Because write locks are stored in the heap tuples rather than a +RAM-based lock table, the optimization described in the Cahill thesis +which eliminates an SIREAD lock where there is a write lock is +implemented by the following: + 1. When checking a heap write for conflicts against existing +predicate locks, a tuple lock on the tuple being written is removed. + 2. When acquiring a predicate lock on a heap tuple, we +return quickly without doing anything if it is a tuple written by the +reading transaction. + + * Rather than using conflictIn and conflictOut pointers which use +NULL to indicate no conflict and a self-reference to indicate +multiple conflicts or conflicts with committed transactions, we use a +list of rw-conflicts. With the more complete information, false +positives are reduced and we have sufficient data for more aggressive +clean-up and other optimizations: + + o We can avoid ever rolling back a transaction until and +unless there is a pivot where a transaction on the conflict *out* +side of the pivot committed before either of the other transactions. + + o We can avoid ever rolling back a transaction when the +transaction on the conflict *in* side of the pivot is explicitly or +implicitly READ ONLY unless the transaction on the conflict *out* +side of the pivot committed before the READ ONLY transaction acquired +its snapshot. (An implicit READ ONLY transaction is one which +committed without writing, even though it was not explicitly declared +to be READ ONLY.) + + o We can more aggressively clean up conflicts, predicate +locks, and SSI transaction information. + + * We allow a READ ONLY transaction to "opt out" of SSI if there are +no READ WRITE transactions which could cause the READ ONLY +transaction to ever become part of a "dangerous structure" of +overlapping transaction dependencies. + + * We allow the user to request that a READ ONLY transaction wait +until the conditions are right for it to start in the "opt out" state +described above. We add a DEFERRABLE state to transactions, which is +specified and maintained in a way similar to READ ONLY. It is +ignored for transactions that are not SERIALIZABLE and READ ONLY. + + * When a transaction must be rolled back, we pick among the +active transactions such that an immediate retry will not fail again +on conflicts with the same transactions. + + * We use the PostgreSQL SLRU system to hold summarized +information about older committed transactions to put an upper bound +on RAM used. Beyond that limit, information spills to disk. +Performance can degrade in a pessimal situation, but it should be +tolerable, and transactions won't need to be canceled or blocked +from starting. + + +R&D Issues +---------- + +This is intended to be the place to record specific issues which need +more detailed review or analysis. + + * WAL file replay. While serializable implementations using S2PL +can guarantee that the write-ahead log contains commits in a sequence +consistent with some serial execution of serializable transactions, +SSI cannot make that guarantee. While the WAL replay is no less +consistent than under snapshot isolation, it is possible that under +PITR recovery or hot standby a database could reach a readable state +where some transactions appear before other transactions which would +have had to precede them to maintain serializable consistency. In +essence, if we do nothing, WAL replay will be at snapshot isolation +even for serializable transactions. Is this OK? If not, how do we +address it? 
+ + * External replication. Look at how this impacts external +replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc. +This is related to the "WAL file replay" issue. + + * UNIQUE btree search for equality on all columns. Since a search +of a UNIQUE index using equality tests on all columns will lock the +heap tuple if an entry is found, it appears that there is no need to +get a predicate lock on the index in that case. A predicate lock is +still needed for such a search if a matching index entry which points +to a visible tuple is not found. + + * Minimize touching of shared memory. Should lists in shared +memory push entries which have just been returned to the front of the +available list, so they will be popped back off soon and some memory +might never be touched, or should we keep adding returned items to +the end of the available list? + + +References +---------- + +[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt +Search for serial execution to find the relevant section. + +[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM +Transactions on Database Systems 30:2, Jun. 2005. +http://dx.doi.org/10.1145/1071610.1071615 + +[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007. +Architecture of a Database System. Foundations and Trends(R) in +Databases Vol. 1, No. 2 (2007) 141-259. +http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf + Of particular interest: + * 6.1 A Note on ACID + * 6.2 A Brief Review of Serializability + * 6.3 Locking and Latching + * 6.3.1 Transaction Isolation Levels + * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier new file mode 100644 index 0000000..f78e5ac --- /dev/null +++ b/src/backend/storage/lmgr/README.barrier @@ -0,0 +1,197 @@ +Memory Barriers +=============== + +Modern CPUs make extensive use of pipe-lining and out-of-order execution, +meaning that the CPU is often executing more than one instruction at a +time, and not necessarily in the order that the source code would suggest. +Furthermore, even before the CPU gets a chance to reorder operations, the +compiler may (and often does) reorganize the code for greater efficiency, +particularly at higher optimization levels. Optimizing compilers and +out-of-order execution are both critical for good performance, but they +can lead to surprising results when multiple processes access the same +memory space. + +Example +======= + +Suppose x is a pointer to a structure stored in shared memory, and that the +entire structure has been initialized to zero bytes. One backend executes +the following code fragment: + + x->foo = 1; + x->bar = 1; + +Meanwhile, at approximately the same time, another backend executes this +code fragment: + + bar = x->bar; + foo = x->foo; + +The second backend might end up with foo = 1 and bar = 1 (if it executes +both statements after the first backend), or with foo = 0 and bar = 0 (if +it executes both statements before the first backend), or with foo = 1 and +bar = 0 (if the first backend executes the first statement, the second +backend executes both statements, and then the first backend executes the +second statement). + +Surprisingly, however, the second backend could also end up with foo = 0 +and bar = 1. The compiler might swap the order of the two stores performed +by the first backend, or the two loads performed by the second backend. 
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or ARM) the CPU might choose to execute either the loads or the stores
+out of order. This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue. The writer does this:
+
+    q->items[q->num_items] = new_item;
+    ++q->num_items;
+
+The reader does this:
+
+    num_items = q->num_items;
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items. Thus, there's still a bug here *even if the
+writer does everything in the order we expect*. We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment. A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock. For more details, see
+src/backend/storage/lmgr/README. For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it. If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks. In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires. In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result. In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+    q->items[q->num_items] = new_item;
+    pg_memory_barrier();
+    ++q->num_items;
+
+And by having the reader do this:
+
+    num_items = q->num_items;
+    pg_memory_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order. Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient. On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores. It can, however, allow a load to be
+performed before a subsequent store. To avoid emitting unnecessary memory
+instructions, we provide two additional primitives: pg_read_barrier(), and
+pg_write_barrier(). When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier(). pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+    q->items[q->num_items] = new_item;
+    pg_write_barrier();
+    ++q->num_items;
+
+And the reader can do this:
+
+    num_items = q->num_items;
+    pg_read_barrier();
+    for (i = 0; i < num_items; ++i)
+        /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required. Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks. Here are some of the problems.
+
+1. Concurrent writers are unsafe. In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time. If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+    x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does their writes. Such a case could be made safe by using an
+atomic variable and an atomic add, as shown in the sketch after this list.
+See port/atomics.h.
+
+2. Eight-byte loads and stores aren't necessarily atomic. We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for an eight-byte value: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations. If
+you need an atomic eight-byte read or write, you must either serialize access
+with a lock or use an atomic variable.
+
+3. No ordering guarantees. While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization. In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer. If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation. Many algorithms that at first seem appealing
+require multiple barriers. If the number of barriers required is more than
+one or two, you may be better off just using a lock. Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock. This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
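+
+The sketch referred to in item 1 follows. It uses the real pg_atomic_*
+API from port/atomics.h, but the surrounding struct and functions are
+illustrative only; in the backend the counter would live in shared memory
+and be initialized exactly once:
+
+    #include "postgres.h"
+
+    #include "port/atomics.h"
+
+    typedef struct SketchSharedCounter
+    {
+        pg_atomic_uint32 foo;
+    } SketchSharedCounter;
+
+    void
+    sketch_counter_init(SketchSharedCounter *x)
+    {
+        /* must run once, before any concurrent access */
+        pg_atomic_init_u32(&x->foo, 0);
+    }
+
+    void
+    sketch_counter_bump(SketchSharedCounter *x)
+    {
+        /* safe even if many backends execute this at the same time */
+        (void) pg_atomic_fetch_add_u32(&x->foo, 1);
+    }
+
+    uint32
+    sketch_counter_read(SketchSharedCounter *x)
+    {
+        return pg_atomic_read_u32(&x->foo);
+    }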
+ +Further Reading +=============== + +Much of the documentation about memory barriers appears to be quite +Linux-specific. The following papers may be helpful: + +Memory Ordering in Modern Microprocessors, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf + +Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney +* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf + +The Linux kernel also has some useful documentation on this topic. Start +with Documentation/memory-barriers.txt diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c new file mode 100644 index 0000000..910a768 --- /dev/null +++ b/src/backend/storage/lmgr/condition_variable.c @@ -0,0 +1,360 @@ +/*------------------------------------------------------------------------- + * + * condition_variable.c + * Implementation of condition variables. Condition variables provide + * a way for one process to wait until a specific condition occurs, + * without needing to know the specific identity of the process for + * which they are waiting. Waits for condition variables can be + * interrupted, unlike LWLock waits. Condition variables are safe + * to use within dynamic shared memory segments. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/lmgr/condition_variable.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "portability/instr_time.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +/* Initially, we are not prepared to sleep on any condition variable. */ +static ConditionVariable *cv_sleep_target = NULL; + +/* + * Initialize a condition variable. + */ +void +ConditionVariableInit(ConditionVariable *cv) +{ + SpinLockInit(&cv->mutex); + proclist_init(&cv->wakeup); +} + +/* + * Prepare to wait on a given condition variable. + * + * This can optionally be called before entering a test/sleep loop. + * Doing so is more efficient if we'll need to sleep at least once. + * However, if the first test of the exit condition is likely to succeed, + * it's more efficient to omit the ConditionVariablePrepareToSleep call. + * See comments in ConditionVariableSleep for more detail. + * + * Caution: "before entering the loop" means you *must* test the exit + * condition between calling ConditionVariablePrepareToSleep and calling + * ConditionVariableSleep. If that is inconvenient, omit calling + * ConditionVariablePrepareToSleep. + */ +void +ConditionVariablePrepareToSleep(ConditionVariable *cv) +{ + int pgprocno = MyProc->pgprocno; + + /* + * If some other sleep is already prepared, cancel it; this is necessary + * because we have just one static variable tracking the prepared sleep, + * and also only one cvWaitLink in our PGPROC. It's okay to do this + * because whenever control does return to the other test-and-sleep loop, + * its ConditionVariableSleep call will just re-establish that sleep as + * the prepared one. + */ + if (cv_sleep_target != NULL) + ConditionVariableCancelSleep(); + + /* Record the condition variable on which we will sleep. */ + cv_sleep_target = cv; + + /* Add myself to the wait queue. 
*/ + SpinLockAcquire(&cv->mutex); + proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink); + SpinLockRelease(&cv->mutex); +} + +/* + * Wait for the given condition variable to be signaled. + * + * This should be called in a predicate loop that tests for a specific exit + * condition and otherwise sleeps, like so: + * + * ConditionVariablePrepareToSleep(cv); // optional + * while (condition for which we are waiting is not true) + * ConditionVariableSleep(cv, wait_event_info); + * ConditionVariableCancelSleep(); + * + * wait_event_info should be a value from one of the WaitEventXXX enums + * defined in pgstat.h. This controls the contents of pg_stat_activity's + * wait_event_type and wait_event columns while waiting. + */ +void +ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info) +{ + (void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ , + wait_event_info); +} + +/* + * Wait for a condition variable to be signaled or a timeout to be reached. + * + * Returns true when timeout expires, otherwise returns false. + * + * See ConditionVariableSleep() for general usage. + */ +bool +ConditionVariableTimedSleep(ConditionVariable *cv, long timeout, + uint32 wait_event_info) +{ + long cur_timeout = -1; + instr_time start_time; + instr_time cur_time; + int wait_events; + + /* + * If the caller didn't prepare to sleep explicitly, then do so now and + * return immediately. The caller's predicate loop should immediately + * call again if its exit condition is not yet met. This will result in + * the exit condition being tested twice before we first sleep. The extra + * test can be prevented by calling ConditionVariablePrepareToSleep(cv) + * first. Whether it's worth doing that depends on whether you expect the + * exit condition to be met initially, in which case skipping the prepare + * is recommended because it avoids manipulations of the wait list, or not + * met initially, in which case preparing first is better because it + * avoids one extra test of the exit condition. + * + * If we are currently prepared to sleep on some other CV, we just cancel + * that and prepare this one; see ConditionVariablePrepareToSleep. + */ + if (cv_sleep_target != cv) + { + ConditionVariablePrepareToSleep(cv); + return false; + } + + /* + * Record the current time so that we can calculate the remaining timeout + * if we are woken up spuriously. + */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH; + } + else + wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH; + + while (true) + { + bool done = false; + + /* + * Wait for latch to be set. (If we're awakened for some other + * reason, the code below will cope anyway.) + */ + (void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info); + + /* Reset latch before examining the state of the wait list. */ + ResetLatch(MyLatch); + + /* + * If this process has been taken out of the wait list, then we know + * that it has been signaled by ConditionVariableSignal (or + * ConditionVariableBroadcast), so we should return to the caller. But + * that doesn't guarantee that the exit condition is met, only that we + * ought to check it. So we must put the process back into the wait + * list, to ensure we don't miss any additional wakeup occurring while + * the caller checks its exit condition. We can take ourselves out of + * the wait list only when the caller calls + * ConditionVariableCancelSleep. 
+ * + * If we're still in the wait list, then the latch must have been set + * by something other than ConditionVariableSignal; though we don't + * guarantee not to return spuriously, we'll avoid this obvious case. + */ + SpinLockAcquire(&cv->mutex); + if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink)) + { + done = true; + proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink); + } + SpinLockRelease(&cv->mutex); + + /* + * Check for interrupts, and return spuriously if that caused the + * current sleep target to change (meaning that interrupt handler code + * waited for a different condition variable). + */ + CHECK_FOR_INTERRUPTS(); + if (cv != cv_sleep_target) + done = true; + + /* We were signaled, so return */ + if (done) + return false; + + /* If we're not done, update cur_timeout for next iteration */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + + /* Have we crossed the timeout threshold? */ + if (cur_timeout <= 0) + return true; + } + } +} + +/* + * Cancel any pending sleep operation. + * + * We just need to remove ourselves from the wait queue of any condition + * variable for which we have previously prepared a sleep. + * + * Do nothing if nothing is pending; this allows this function to be called + * during transaction abort to clean up any unfinished CV sleep. + * + * Return true if we've been signaled. + */ +bool +ConditionVariableCancelSleep(void) +{ + ConditionVariable *cv = cv_sleep_target; + bool signaled = false; + + if (cv == NULL) + return false; + + SpinLockAcquire(&cv->mutex); + if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink)) + proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink); + else + signaled = true; + SpinLockRelease(&cv->mutex); + + cv_sleep_target = NULL; + + return signaled; +} + +/* + * Wake up the oldest process sleeping on the CV, if there is any. + * + * Note: it's difficult to tell whether this has any real effect: we know + * whether we took an entry off the list, but the entry might only be a + * sentinel. Hence, think twice before proposing that this should return + * a flag telling whether it woke somebody. + */ +void +ConditionVariableSignal(ConditionVariable *cv) +{ + PGPROC *proc = NULL; + + /* Remove the first process from the wakeup queue (if any). */ + SpinLockAcquire(&cv->mutex); + if (!proclist_is_empty(&cv->wakeup)) + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + SpinLockRelease(&cv->mutex); + + /* If we found someone sleeping, set their latch to wake them up. */ + if (proc != NULL) + SetLatch(&proc->procLatch); +} + +/* + * Wake up all processes sleeping on the given CV. + * + * This guarantees to wake all processes that were sleeping on the CV + * at time of call, but processes that add themselves to the list mid-call + * will typically not get awakened. + */ +void +ConditionVariableBroadcast(ConditionVariable *cv) +{ + int pgprocno = MyProc->pgprocno; + PGPROC *proc = NULL; + bool have_sentinel = false; + + /* + * In some use-cases, it is common for awakened processes to immediately + * re-queue themselves. If we just naively try to reduce the wakeup list + * to empty, we'll get into a potentially-indefinite loop against such a + * process. The semantics we really want are just to be sure that we have + * wakened all processes that were in the list at entry. We can use our + * own cvWaitLink as a sentinel to detect when we've finished. 
+ * + * A seeming flaw in this approach is that someone else might signal the + * CV and in doing so remove our sentinel entry. But that's fine: since + * CV waiters are always added and removed in order, that must mean that + * every previous waiter has been wakened, so we're done. We'll get an + * extra "set" on our latch from the someone else's signal, which is + * slightly inefficient but harmless. + * + * We can't insert our cvWaitLink as a sentinel if it's already in use in + * some other proclist. While that's not expected to be true for typical + * uses of this function, we can deal with it by simply canceling any + * prepared CV sleep. The next call to ConditionVariableSleep will take + * care of re-establishing the lost state. + */ + if (cv_sleep_target != NULL) + ConditionVariableCancelSleep(); + + /* + * Inspect the state of the queue. If it's empty, we have nothing to do. + * If there's exactly one entry, we need only remove and signal that + * entry. Otherwise, remove the first entry and insert our sentinel. + */ + SpinLockAcquire(&cv->mutex); + /* While we're here, let's assert we're not in the list. */ + Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink)); + + if (!proclist_is_empty(&cv->wakeup)) + { + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + if (!proclist_is_empty(&cv->wakeup)) + { + proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink); + have_sentinel = true; + } + } + SpinLockRelease(&cv->mutex); + + /* Awaken first waiter, if there was one. */ + if (proc != NULL) + SetLatch(&proc->procLatch); + + while (have_sentinel) + { + /* + * Each time through the loop, remove the first wakeup list entry, and + * signal it unless it's our sentinel. Repeat as long as the sentinel + * remains in the list. + * + * Notice that if someone else removes our sentinel, we will waken one + * additional process before exiting. That's intentional, because if + * someone else signals the CV, they may be intending to waken some + * third process that added itself to the list after we added the + * sentinel. Better to give a spurious wakeup (which should be + * harmless beyond wasting some cycles) than to lose a wakeup. + */ + proc = NULL; + SpinLockAcquire(&cv->mutex); + if (!proclist_is_empty(&cv->wakeup)) + proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink); + have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink); + SpinLockRelease(&cv->mutex); + + if (proc != NULL && proc != MyProc) + SetLatch(&proc->procLatch); + } +} diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c new file mode 100644 index 0000000..2bdd20b --- /dev/null +++ b/src/backend/storage/lmgr/deadlock.c @@ -0,0 +1,1159 @@ +/*------------------------------------------------------------------------- + * + * deadlock.c + * POSTGRES deadlock detection code + * + * See src/backend/storage/lmgr/README for a description of the deadlock + * detection and resolution algorithms. 
+ * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/deadlock.c + * + * Interface: + * + * DeadLockCheck() + * DeadLockReport() + * RememberSimpleDeadLock() + * InitDeadLockChecking() + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "utils/memutils.h" + + +/* + * One edge in the waits-for graph. + * + * waiter and blocker may or may not be members of a lock group, but if either + * is, it will be the leader rather than any other member of the lock group. + * The group leaders act as representatives of the whole group even though + * those particular processes need not be waiting at all. There will be at + * least one member of the waiter's lock group on the wait queue for the given + * lock, maybe more. + */ +typedef struct +{ + PGPROC *waiter; /* the leader of the waiting lock group */ + PGPROC *blocker; /* the leader of the group it is waiting for */ + LOCK *lock; /* the lock being waited for */ + int pred; /* workspace for TopoSort */ + int link; /* workspace for TopoSort */ +} EDGE; + +/* One potential reordering of a lock's wait queue */ +typedef struct +{ + LOCK *lock; /* the lock whose wait queue is described */ + PGPROC **procs; /* array of PGPROC *'s in new wait order */ + int nProcs; +} WAIT_ORDER; + +/* + * Information saved about each edge in a detected deadlock cycle. This + * is used to print a diagnostic message upon failure. + * + * Note: because we want to examine this info after releasing the lock + * manager's partition locks, we can't just store LOCK and PGPROC pointers; + * we must extract out all the info we want to be able to print. 
+ */ +typedef struct +{ + LOCKTAG locktag; /* ID of awaited lock object */ + LOCKMODE lockmode; /* type of lock we're waiting for */ + int pid; /* PID of blocked backend */ +} DEADLOCK_INFO; + + +static bool DeadLockCheckRecurse(PGPROC *proc); +static int TestConfiguration(PGPROC *startProc); +static bool FindLockCycle(PGPROC *checkProc, + EDGE *softEdges, int *nSoftEdges); +static bool FindLockCycleRecurse(PGPROC *checkProc, int depth, + EDGE *softEdges, int *nSoftEdges); +static bool FindLockCycleRecurseMember(PGPROC *checkProc, + PGPROC *checkProcLeader, + int depth, EDGE *softEdges, int *nSoftEdges); +static bool ExpandConstraints(EDGE *constraints, int nConstraints); +static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints, + PGPROC **ordering); + +#ifdef DEBUG_DEADLOCK +static void PrintLockQueue(LOCK *lock, const char *info); +#endif + + +/* + * Working space for the deadlock detector + */ + +/* Workspace for FindLockCycle */ +static PGPROC **visitedProcs; /* Array of visited procs */ +static int nVisitedProcs; + +/* Workspace for TopoSort */ +static PGPROC **topoProcs; /* Array of not-yet-output procs */ +static int *beforeConstraints; /* Counts of remaining before-constraints */ +static int *afterConstraints; /* List head for after-constraints */ + +/* Output area for ExpandConstraints */ +static WAIT_ORDER *waitOrders; /* Array of proposed queue rearrangements */ +static int nWaitOrders; +static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */ + +/* Current list of constraints being considered */ +static EDGE *curConstraints; +static int nCurConstraints; +static int maxCurConstraints; + +/* Storage space for results from FindLockCycle */ +static EDGE *possibleConstraints; +static int nPossibleConstraints; +static int maxPossibleConstraints; +static DEADLOCK_INFO *deadlockDetails; +static int nDeadlockDetails; + +/* PGPROC pointer of any blocking autovacuum worker found */ +static PGPROC *blocking_autovacuum_proc = NULL; + + +/* + * InitDeadLockChecking -- initialize deadlock checker during backend startup + * + * This does per-backend initialization of the deadlock checker; primarily, + * allocation of working memory for DeadLockCheck. We do this per-backend + * since there's no percentage in making the kernel do copy-on-write + * inheritance of workspace from the postmaster. We want to allocate the + * space at startup because (a) the deadlock checker might be invoked when + * there's no free memory left, and (b) the checker is normally run inside a + * signal handler, which is a very dangerous place to invoke palloc from. + */ +void +InitDeadLockChecking(void) +{ + MemoryContext oldcxt; + + /* Make sure allocations are permanent */ + oldcxt = MemoryContextSwitchTo(TopMemoryContext); + + /* + * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and + * deadlockDetails[]. + */ + visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *)); + deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO)); + + /* + * TopoSort needs to consider at most MaxBackends wait-queue entries, and + * it needn't run concurrently with FindLockCycle. 
+ */ + topoProcs = visitedProcs; /* re-use this space */ + beforeConstraints = (int *) palloc(MaxBackends * sizeof(int)); + afterConstraints = (int *) palloc(MaxBackends * sizeof(int)); + + /* + * We need to consider rearranging at most MaxBackends/2 wait queues + * (since it takes at least two waiters in a queue to create a soft edge), + * and the expanded form of the wait queues can't involve more than + * MaxBackends total waiters. + */ + waitOrders = (WAIT_ORDER *) + palloc((MaxBackends / 2) * sizeof(WAIT_ORDER)); + waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *)); + + /* + * Allow at most MaxBackends distinct constraints in a configuration. (Is + * this enough? In practice it seems it should be, but I don't quite see + * how to prove it. If we run out, we might fail to find a workable wait + * queue rearrangement even though one exists.) NOTE that this number + * limits the maximum recursion depth of DeadLockCheckRecurse. Making it + * really big might potentially allow a stack-overflow problem. + */ + maxCurConstraints = MaxBackends; + curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE)); + + /* + * Allow up to 3*MaxBackends constraints to be saved without having to + * re-run TestConfiguration. (This is probably more than enough, but we + * can survive if we run low on space by doing excess runs of + * TestConfiguration to re-compute constraint lists each time needed.) The + * last MaxBackends entries in possibleConstraints[] are reserved as + * output workspace for FindLockCycle. + */ + maxPossibleConstraints = MaxBackends * 4; + possibleConstraints = + (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE)); + + MemoryContextSwitchTo(oldcxt); +} + +/* + * DeadLockCheck -- Checks for deadlocks for a given process + * + * This code looks for deadlocks involving the given process. If any + * are found, it tries to rearrange lock wait queues to resolve the + * deadlock. If resolution is impossible, return DS_HARD_DEADLOCK --- + * the caller is then expected to abort the given proc's transaction. + * + * Caller must already have locked all partitions of the lock tables. + * + * On failure, deadlock details are recorded in deadlockDetails[] for + * subsequent printing by DeadLockReport(). That activity is separate + * because (a) we don't want to do it while holding all those LWLocks, + * and (b) we are typically invoked inside a signal handler. + */ +DeadLockState +DeadLockCheck(PGPROC *proc) +{ + /* Initialize to "no constraints" */ + nCurConstraints = 0; + nPossibleConstraints = 0; + nWaitOrders = 0; + + /* Initialize to not blocked by an autovacuum worker */ + blocking_autovacuum_proc = NULL; + + /* Search for deadlocks and possible fixes */ + if (DeadLockCheckRecurse(proc)) + { + /* + * Call FindLockCycle one more time, to record the correct + * deadlockDetails[] for the basic state with no rearrangements. 
+ */ + int nSoftEdges; + + TRACE_POSTGRESQL_DEADLOCK_FOUND(); + + nWaitOrders = 0; + if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges)) + elog(FATAL, "deadlock seems to have disappeared"); + + return DS_HARD_DEADLOCK; /* cannot find a non-deadlocked state */ + } + + /* Apply any needed rearrangements of wait queues */ + for (int i = 0; i < nWaitOrders; i++) + { + LOCK *lock = waitOrders[i].lock; + PGPROC **procs = waitOrders[i].procs; + int nProcs = waitOrders[i].nProcs; + dclist_head *waitQueue = &lock->waitProcs; + + Assert(nProcs == dclist_count(waitQueue)); + +#ifdef DEBUG_DEADLOCK + PrintLockQueue(lock, "DeadLockCheck:"); +#endif + + /* Reset the queue and re-add procs in the desired order */ + dclist_init(waitQueue); + for (int j = 0; j < nProcs; j++) + dclist_push_tail(waitQueue, &procs[j]->links); + +#ifdef DEBUG_DEADLOCK + PrintLockQueue(lock, "rearranged to:"); +#endif + + /* See if any waiters for the lock can be woken up now */ + ProcLockWakeup(GetLocksMethodTable(lock), lock); + } + + /* Return code tells caller if we had to escape a deadlock or not */ + if (nWaitOrders > 0) + return DS_SOFT_DEADLOCK; + else if (blocking_autovacuum_proc != NULL) + return DS_BLOCKED_BY_AUTOVACUUM; + else + return DS_NO_DEADLOCK; +} + +/* + * Return the PGPROC of the autovacuum that's blocking a process. + * + * We reset the saved pointer as soon as we pass it back. + */ +PGPROC * +GetBlockingAutoVacuumPgproc(void) +{ + PGPROC *ptr; + + ptr = blocking_autovacuum_proc; + blocking_autovacuum_proc = NULL; + + return ptr; +} + +/* + * DeadLockCheckRecurse -- recursively search for valid orderings + * + * curConstraints[] holds the current set of constraints being considered + * by an outer level of recursion. Add to this each possible solution + * constraint for any cycle detected at this level. + * + * Returns true if no solution exists. Returns false if a deadlock-free + * state is attainable, in which case waitOrders[] shows the required + * rearrangements of lock wait queues (if any). + */ +static bool +DeadLockCheckRecurse(PGPROC *proc) +{ + int nEdges; + int oldPossibleConstraints; + bool savedList; + int i; + + nEdges = TestConfiguration(proc); + if (nEdges < 0) + return true; /* hard deadlock --- no solution */ + if (nEdges == 0) + return false; /* good configuration found */ + if (nCurConstraints >= maxCurConstraints) + return true; /* out of room for active constraints? */ + oldPossibleConstraints = nPossibleConstraints; + if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints) + { + /* We can save the edge list in possibleConstraints[] */ + nPossibleConstraints += nEdges; + savedList = true; + } + else + { + /* Not room; will need to regenerate the edges on-the-fly */ + savedList = false; + } + + /* + * Try each available soft edge as an addition to the configuration. + */ + for (i = 0; i < nEdges; i++) + { + if (!savedList && i > 0) + { + /* Regenerate the list of possible added constraints */ + if (nEdges != TestConfiguration(proc)) + elog(FATAL, "inconsistent results during deadlock check"); + } + curConstraints[nCurConstraints] = + possibleConstraints[oldPossibleConstraints + i]; + nCurConstraints++; + if (!DeadLockCheckRecurse(proc)) + return false; /* found a valid solution! */ + /* give up on that added constraint, try again */ + nCurConstraints--; + } + nPossibleConstraints = oldPossibleConstraints; + return true; /* no solution found */ +} + + +/*-------------------- + * Test a configuration (current set of constraints) for validity. 
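[Editor's sketch] A hedged sketch of how the deadlock-timeout handler consumes DeadLockCheck()'s result; the real logic lives in CheckDeadLock() in proc.c and is more involved, so treat the surrounding control flow as illustrative only:

    /* caller must already hold every lock-table partition lock */
    DeadLockState state = DeadLockCheck(MyProc);

    if (state == DS_HARD_DEADLOCK)
    {
        /* unresolvable cycle: the waiting transaction gets aborted */
    }
    else if (state == DS_BLOCKED_BY_AUTOVACUUM)
    {
        /* the direct blocker is a plain autovacuum; caller may cancel it */
        PGPROC *autovac = GetBlockingAutoVacuumPgproc();

        (void) autovac;     /* illustrative: a real caller signals it if policy allows */
    }
    /* DS_SOFT_DEADLOCK: the wait queues were already rearranged above.
     * DS_NO_DEADLOCK: nothing to do. */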
+ * + * Returns: + * 0: the configuration is good (no deadlocks) + * -1: the configuration has a hard deadlock or is not self-consistent + * >0: the configuration has one or more soft deadlocks + * + * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily + * and a list of its soft edges is returned beginning at + * possibleConstraints+nPossibleConstraints. The return value is the + * number of soft edges. + *-------------------- + */ +static int +TestConfiguration(PGPROC *startProc) +{ + int softFound = 0; + EDGE *softEdges = possibleConstraints + nPossibleConstraints; + int nSoftEdges; + int i; + + /* + * Make sure we have room for FindLockCycle's output. + */ + if (nPossibleConstraints + MaxBackends > maxPossibleConstraints) + return -1; + + /* + * Expand current constraint set into wait orderings. Fail if the + * constraint set is not self-consistent. + */ + if (!ExpandConstraints(curConstraints, nCurConstraints)) + return -1; + + /* + * Check for cycles involving startProc or any of the procs mentioned in + * constraints. We check startProc last because if it has a soft cycle + * still to be dealt with, we want to deal with that first. + */ + for (i = 0; i < nCurConstraints; i++) + { + if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + } + if (FindLockCycle(startProc, softEdges, &nSoftEdges)) + { + if (nSoftEdges == 0) + return -1; /* hard deadlock detected */ + softFound = nSoftEdges; + } + return softFound; +} + + +/* + * FindLockCycle -- basic check for deadlock cycles + * + * Scan outward from the given proc to see if there is a cycle in the + * waits-for graph that includes this proc. Return true if a cycle + * is found, else false. If a cycle is found, we return a list of + * the "soft edges", if any, included in the cycle. These edges could + * potentially be eliminated by rearranging wait queues. We also fill + * deadlockDetails[] with information about the detected cycle; this info + * is not used by the deadlock algorithm itself, only to print a useful + * message after failing. + * + * Since we need to be able to check hypothetical configurations that would + * exist after wait queue rearrangement, the routine pays attention to the + * table of hypothetical queue orders in waitOrders[]. These orders will + * be believed in preference to the actual ordering seen in the locktable. + */ +static bool +FindLockCycle(PGPROC *checkProc, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + nVisitedProcs = 0; + nDeadlockDetails = 0; + *nSoftEdges = 0; + return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges); +} + +static bool +FindLockCycleRecurse(PGPROC *checkProc, + int depth, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + int i; + dlist_iter iter; + + /* + * If this process is a lock group member, check the leader instead. (Note + * that we might be the leader, in which case this is a no-op.) + */ + if (checkProc->lockGroupLeader != NULL) + checkProc = checkProc->lockGroupLeader; + + /* + * Have we already seen this proc? 
+ */ + for (i = 0; i < nVisitedProcs; i++) + { + if (visitedProcs[i] == checkProc) + { + /* If we return to starting point, we have a deadlock cycle */ + if (i == 0) + { + /* + * record total length of cycle --- outer levels will now fill + * deadlockDetails[] + */ + Assert(depth <= MaxBackends); + nDeadlockDetails = depth; + + return true; + } + + /* + * Otherwise, we have a cycle but it does not include the start + * point, so say "no deadlock". + */ + return false; + } + } + /* Mark proc as seen */ + Assert(nVisitedProcs < MaxBackends); + visitedProcs[nVisitedProcs++] = checkProc; + + /* + * If the process is waiting, there is an outgoing waits-for edge to each + * process that blocks it. + */ + if (checkProc->links.next != NULL && checkProc->waitLock != NULL && + FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges, + nSoftEdges)) + return true; + + /* + * If the process is not waiting, there could still be outgoing waits-for + * edges if it is part of a lock group, because other members of the lock + * group might be waiting even though this process is not. (Given lock + * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2, + * that is a deadlock even neither of B1 and A2 are waiting for anything.) + */ + dlist_foreach(iter, &checkProc->lockGroupMembers) + { + PGPROC *memberProc; + + memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur); + + if (memberProc->links.next != NULL && memberProc->waitLock != NULL && + memberProc != checkProc && + FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges, + nSoftEdges)) + return true; + } + + return false; +} + +static bool +FindLockCycleRecurseMember(PGPROC *checkProc, + PGPROC *checkProcLeader, + int depth, + EDGE *softEdges, /* output argument */ + int *nSoftEdges) /* output argument */ +{ + PGPROC *proc; + LOCK *lock = checkProc->waitLock; + dlist_iter proclock_iter; + LockMethod lockMethodTable; + int conflictMask; + int i; + int numLockModes, + lm; + + /* + * The relation extension lock can never participate in actual deadlock + * cycle. See Assert in LockAcquireExtended. So, there is no advantage + * in checking wait edges from it. + */ + if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND) + return false; + + lockMethodTable = GetLocksMethodTable(lock); + numLockModes = lockMethodTable->numLockModes; + conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode]; + + /* + * Scan for procs that already hold conflicting locks. These are "hard" + * edges in the waits-for graph. + */ + dlist_foreach(proclock_iter, &lock->procLocks) + { + PROCLOCK *proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur); + PGPROC *leader; + + proc = proclock->tag.myProc; + leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader; + + /* A proc never blocks itself or any other lock group member */ + if (leader != checkProcLeader) + { + for (lm = 1; lm <= numLockModes; lm++) + { + if ((proclock->holdMask & LOCKBIT_ON(lm)) && + (conflictMask & LOCKBIT_ON(lm))) + { + /* This proc hard-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + return true; + } + + /* + * No deadlock here, but see if this proc is an autovacuum + * that is directly hard-blocking our own proc. If so, + * report it so that the caller can send a cancel signal + * to it, if appropriate. 
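[Editor's sketch] To make the holdMask/conflictMask test above concrete, here is a hedged, stand-alone restatement against the default lock method's conflict table: a waiter requesting ShareLock is hard-blocked by any holder of RowExclusiveLock, because the RowExclusiveLock bit is set in conflictTab[ShareLock].

    int     shareConflicts = lockMethodTable->conflictTab[ShareLock];
    bool    hard_edge = (proclock->holdMask & LOCKBIT_ON(RowExclusiveLock)) != 0 &&
                        (shareConflicts & LOCKBIT_ON(RowExclusiveLock)) != 0;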
If there's more than one such + * proc, it's indeterminate which one will be reported. + * + * We don't touch autovacuums that are indirectly blocking + * us; it's up to the direct blockee to take action. This + * rule simplifies understanding the behavior and ensures + * that an autovacuum won't be canceled with less than + * deadlock_timeout grace period. + * + * Note we read statusFlags without any locking. This is + * OK only for checking the PROC_IS_AUTOVACUUM flag, + * because that flag is set at process start and never + * reset. There is logic elsewhere to avoid canceling an + * autovacuum that is working to prevent XID wraparound + * problems (which needs to read a different statusFlags + * bit), but we don't do that here to avoid grabbing + * ProcArrayLock. + */ + if (checkProc == MyProc && + proc->statusFlags & PROC_IS_AUTOVACUUM) + blocking_autovacuum_proc = proc; + + /* We're done looking at this proclock */ + break; + } + } + } + } + + /* + * Scan for procs that are ahead of this one in the lock's wait queue. + * Those that have conflicting requests soft-block this one. This must be + * done after the hard-block search, since if another proc both hard- and + * soft-blocks this one, we want to call it a hard edge. + * + * If there is a proposed re-ordering of the lock's wait order, use that + * rather than the current wait order. + */ + for (i = 0; i < nWaitOrders; i++) + { + if (waitOrders[i].lock == lock) + break; + } + + if (i < nWaitOrders) + { + /* Use the given hypothetical wait queue order */ + PGPROC **procs = waitOrders[i].procs; + int queue_size = waitOrders[i].nProcs; + + for (i = 0; i < queue_size; i++) + { + PGPROC *leader; + + proc = procs[i]; + leader = proc->lockGroupLeader == NULL ? proc : + proc->lockGroupLeader; + + /* + * TopoSort will always return an ordering with group members + * adjacent to each other in the wait queue (see comments + * therein). So, as soon as we reach a process in the same lock + * group as checkProc, we know we've found all the conflicts that + * precede any member of the lock group lead by checkProcLeader. + */ + if (leader == checkProcLeader) + break; + + /* Is there a conflict with this guy's request? */ + if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0) + { + /* This proc soft-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + /* + * Add this edge to the list of soft edges in the cycle + */ + Assert(*nSoftEdges < MaxBackends); + softEdges[*nSoftEdges].waiter = checkProcLeader; + softEdges[*nSoftEdges].blocker = leader; + softEdges[*nSoftEdges].lock = lock; + (*nSoftEdges)++; + return true; + } + } + } + } + else + { + PGPROC *lastGroupMember = NULL; + dlist_iter proc_iter; + dclist_head *waitQueue; + + /* Use the true lock wait queue order */ + waitQueue = &lock->waitProcs; + + /* + * Find the last member of the lock group that is present in the wait + * queue. Anything after this is not a soft lock conflict. If group + * locking is not in use, then we know immediately which process we're + * looking for, but otherwise we've got to search the wait queue to + * find the last process actually present. 
+ */ + if (checkProc->lockGroupLeader == NULL) + lastGroupMember = checkProc; + else + { + dclist_foreach(proc_iter, waitQueue) + { + proc = dlist_container(PGPROC, links, proc_iter.cur); + + if (proc->lockGroupLeader == checkProcLeader) + lastGroupMember = proc; + } + Assert(lastGroupMember != NULL); + } + + /* + * OK, now rescan (or scan) the queue to identify the soft conflicts. + */ + dclist_foreach(proc_iter, waitQueue) + { + PGPROC *leader; + + proc = dlist_container(PGPROC, links, proc_iter.cur); + + leader = proc->lockGroupLeader == NULL ? proc : + proc->lockGroupLeader; + + /* Done when we reach the target proc */ + if (proc == lastGroupMember) + break; + + /* Is there a conflict with this guy's request? */ + if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 && + leader != checkProcLeader) + { + /* This proc soft-blocks checkProc */ + if (FindLockCycleRecurse(proc, depth + 1, + softEdges, nSoftEdges)) + { + /* fill deadlockDetails[] */ + DEADLOCK_INFO *info = &deadlockDetails[depth]; + + info->locktag = lock->tag; + info->lockmode = checkProc->waitLockMode; + info->pid = checkProc->pid; + + /* + * Add this edge to the list of soft edges in the cycle + */ + Assert(*nSoftEdges < MaxBackends); + softEdges[*nSoftEdges].waiter = checkProcLeader; + softEdges[*nSoftEdges].blocker = leader; + softEdges[*nSoftEdges].lock = lock; + (*nSoftEdges)++; + return true; + } + } + } + } + + /* + * No conflict detected here. + */ + return false; +} + + +/* + * ExpandConstraints -- expand a list of constraints into a set of + * specific new orderings for affected wait queues + * + * Input is a list of soft edges to be reversed. The output is a list + * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array + * workspace in waitOrderProcs[]. + * + * Returns true if able to build an ordering that satisfies all the + * constraints, false if not (there are contradictory constraints). + */ +static bool +ExpandConstraints(EDGE *constraints, + int nConstraints) +{ + int nWaitOrderProcs = 0; + int i, + j; + + nWaitOrders = 0; + + /* + * Scan constraint list backwards. This is because the last-added + * constraint is the only one that could fail, and so we want to test it + * for inconsistency first. + */ + for (i = nConstraints; --i >= 0;) + { + LOCK *lock = constraints[i].lock; + + /* Did we already make a list for this lock? */ + for (j = nWaitOrders; --j >= 0;) + { + if (waitOrders[j].lock == lock) + break; + } + if (j >= 0) + continue; + /* No, so allocate a new list */ + waitOrders[nWaitOrders].lock = lock; + waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs; + waitOrders[nWaitOrders].nProcs = dclist_count(&lock->waitProcs); + nWaitOrderProcs += dclist_count(&lock->waitProcs); + Assert(nWaitOrderProcs <= MaxBackends); + + /* + * Do the topo sort. TopoSort need not examine constraints after this + * one, since they must be for different locks. + */ + if (!TopoSort(lock, constraints, i + 1, + waitOrders[nWaitOrders].procs)) + return false; + nWaitOrders++; + } + return true; +} + + +/* + * TopoSort -- topological sort of a wait queue + * + * Generate a re-ordering of a lock's wait queue that satisfies given + * constraints about certain procs preceding others. (Each such constraint + * is a fact of a partial ordering.) Minimize rearrangement of the queue + * not needed to achieve the partial ordering. + * + * This is a lot simpler and slower than, for example, the topological sort + * algorithm shown in Knuth's Volume 1. 
However, Knuth's method doesn't + * try to minimize the damage to the existing order. In practice we are + * not likely to be working with more than a few constraints, so the apparent + * slowness of the algorithm won't really matter. + * + * The initial queue ordering is taken directly from the lock's wait queue. + * The output is an array of PGPROC pointers, of length equal to the lock's + * wait queue length (the caller is responsible for providing this space). + * The partial order is specified by an array of EDGE structs. Each EDGE + * is one that we need to reverse, therefore the "waiter" must appear before + * the "blocker" in the output array. The EDGE array may well contain + * edges associated with other locks; these should be ignored. + * + * Returns true if able to build an ordering that satisfies all the + * constraints, false if not (there are contradictory constraints). + */ +static bool +TopoSort(LOCK *lock, + EDGE *constraints, + int nConstraints, + PGPROC **ordering) /* output argument */ +{ + dclist_head *waitQueue = &lock->waitProcs; + int queue_size = dclist_count(waitQueue); + PGPROC *proc; + int i, + j, + jj, + k, + kk, + last; + dlist_iter proc_iter; + + /* First, fill topoProcs[] array with the procs in their current order */ + i = 0; + dclist_foreach(proc_iter, waitQueue) + { + proc = dlist_container(PGPROC, links, proc_iter.cur); + topoProcs[i++] = proc; + } + Assert(i == queue_size); + + /* + * Scan the constraints, and for each proc in the array, generate a count + * of the number of constraints that say it must be before something else, + * plus a list of the constraints that say it must be after something + * else. The count for the j'th proc is stored in beforeConstraints[j], + * and the head of its list in afterConstraints[j]. Each constraint + * stores its list link in constraints[i].link (note any constraint will + * be in just one list). The array index for the before-proc of the i'th + * constraint is remembered in constraints[i].pred. + * + * Note that it's not necessarily the case that every constraint affects + * this particular wait queue. Prior to group locking, a process could be + * waiting for at most one lock. But a lock group can be waiting for + * zero, one, or multiple locks. Since topoProcs[] is an array of the + * processes actually waiting, while constraints[] is an array of group + * leaders, we've got to scan through topoProcs[] for each constraint, + * checking whether both a waiter and a blocker for that group are + * present. If so, the constraint is relevant to this wait queue; if not, + * it isn't. + */ + MemSet(beforeConstraints, 0, queue_size * sizeof(int)); + MemSet(afterConstraints, 0, queue_size * sizeof(int)); + for (i = 0; i < nConstraints; i++) + { + /* + * Find a representative process that is on the lock queue and part of + * the waiting lock group. This may or may not be the leader, which + * may or may not be waiting at all. If there are any other processes + * in the same lock group on the queue, set their number of + * beforeConstraints to -1 to indicate that they should be emitted + * with their groupmates rather than considered separately. + * + * In this loop and the similar one just below, it's critical that we + * consistently select the same representative member of any one lock + * group, so that all the constraints are associated with the same + * proc, and the -1's are only associated with not-representative + * members. We select the last one in the topoProcs array. 
+ */ + proc = constraints[i].waiter; + Assert(proc != NULL); + jj = -1; + for (j = queue_size; --j >= 0;) + { + PGPROC *waiter = topoProcs[j]; + + if (waiter == proc || waiter->lockGroupLeader == proc) + { + Assert(waiter->waitLock == lock); + if (jj == -1) + jj = j; + else + { + Assert(beforeConstraints[j] <= 0); + beforeConstraints[j] = -1; + } + } + } + + /* If no matching waiter, constraint is not relevant to this lock. */ + if (jj < 0) + continue; + + /* + * Similarly, find a representative process that is on the lock queue + * and waiting for the blocking lock group. Again, this could be the + * leader but does not need to be. + */ + proc = constraints[i].blocker; + Assert(proc != NULL); + kk = -1; + for (k = queue_size; --k >= 0;) + { + PGPROC *blocker = topoProcs[k]; + + if (blocker == proc || blocker->lockGroupLeader == proc) + { + Assert(blocker->waitLock == lock); + if (kk == -1) + kk = k; + else + { + Assert(beforeConstraints[k] <= 0); + beforeConstraints[k] = -1; + } + } + } + + /* If no matching blocker, constraint is not relevant to this lock. */ + if (kk < 0) + continue; + + Assert(beforeConstraints[jj] >= 0); + beforeConstraints[jj]++; /* waiter must come before */ + /* add this constraint to list of after-constraints for blocker */ + constraints[i].pred = jj; + constraints[i].link = afterConstraints[kk]; + afterConstraints[kk] = i + 1; + } + + /*-------------------- + * Now scan the topoProcs array backwards. At each step, output the + * last proc that has no remaining before-constraints plus any other + * members of the same lock group; then decrease the beforeConstraints + * count of each of the procs it was constrained against. + * i = index of ordering[] entry we want to output this time + * j = search index for topoProcs[] + * k = temp for scanning constraint list for proc j + * last = last non-null index in topoProcs (avoid redundant searches) + *-------------------- + */ + last = queue_size - 1; + for (i = queue_size - 1; i >= 0;) + { + int c; + int nmatches = 0; + + /* Find next candidate to output */ + while (topoProcs[last] == NULL) + last--; + for (j = last; j >= 0; j--) + { + if (topoProcs[j] != NULL && beforeConstraints[j] == 0) + break; + } + + /* If no available candidate, topological sort fails */ + if (j < 0) + return false; + + /* + * Output everything in the lock group. There's no point in + * outputting an ordering where members of the same lock group are not + * consecutive on the wait queue: if some other waiter is between two + * requests that belong to the same group, then either it conflicts + * with both of them and is certainly not a solution; or it conflicts + * with at most one of them and is thus isomorphic to an ordering + * where the group members are consecutive. 
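[Editor's sketch] A small worked example of the backward scan described above, with no lock groups involved and illustrative process names: given wait queue (A, B, C) and one constraint saying C (the waiter) must precede A (the blocker), beforeConstraints ends up as {A:0, B:0, C:1} and afterConstraints[A] points at that constraint. The scan emits B into the last slot, then A (which releases C's before-constraint), then C, yielding (C, A, B). The constraint chains themselves are one-based, with 0 terminating a list:

    /* A: 0 before-constraints   B: 0   C: 1  (C must precede A)
     * emit right-to-left: B -> slot 2, A -> slot 1 (decrements C's count), C -> slot 0
     * reordered queue: C, A, B
     */
    for (int c = afterConstraints[j]; c > 0; c = constraints[c - 1].link)
        beforeConstraints[constraints[c - 1].pred]--;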
+ */ + proc = topoProcs[j]; + if (proc->lockGroupLeader != NULL) + proc = proc->lockGroupLeader; + Assert(proc != NULL); + for (c = 0; c <= last; ++c) + { + if (topoProcs[c] == proc || (topoProcs[c] != NULL && + topoProcs[c]->lockGroupLeader == proc)) + { + ordering[i - nmatches] = topoProcs[c]; + topoProcs[c] = NULL; + ++nmatches; + } + } + Assert(nmatches > 0); + i -= nmatches; + + /* Update beforeConstraints counts of its predecessors */ + for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link) + beforeConstraints[constraints[k - 1].pred]--; + } + + /* Done */ + return true; +} + +#ifdef DEBUG_DEADLOCK +static void +PrintLockQueue(LOCK *lock, const char *info) +{ + dclist_head *waitQueue = &lock->waitProcs; + dlist_iter proc_iter; + + printf("%s lock %p queue ", info, lock); + + dclist_foreach(proc_iter, waitQueue) + { + PGPROC *proc = dlist_container(PGPROC, links, proc_iter.cur); + + printf(" %d", proc->pid); + } + printf("\n"); + fflush(stdout); +} +#endif + +/* + * Report a detected deadlock, with available details. + */ +void +DeadLockReport(void) +{ + StringInfoData clientbuf; /* errdetail for client */ + StringInfoData logbuf; /* errdetail for server log */ + StringInfoData locktagbuf; + int i; + + initStringInfo(&clientbuf); + initStringInfo(&logbuf); + initStringInfo(&locktagbuf); + + /* Generate the "waits for" lines sent to the client */ + for (i = 0; i < nDeadlockDetails; i++) + { + DEADLOCK_INFO *info = &deadlockDetails[i]; + int nextpid; + + /* The last proc waits for the first one... */ + if (i < nDeadlockDetails - 1) + nextpid = info[1].pid; + else + nextpid = deadlockDetails[0].pid; + + /* reset locktagbuf to hold next object description */ + resetStringInfo(&locktagbuf); + + DescribeLockTag(&locktagbuf, &info->locktag); + + if (i > 0) + appendStringInfoChar(&clientbuf, '\n'); + + appendStringInfo(&clientbuf, + _("Process %d waits for %s on %s; blocked by process %d."), + info->pid, + GetLockmodeName(info->locktag.locktag_lockmethodid, + info->lockmode), + locktagbuf.data, + nextpid); + } + + /* Duplicate all the above for the server ... */ + appendBinaryStringInfo(&logbuf, clientbuf.data, clientbuf.len); + + /* ... and add info about query strings */ + for (i = 0; i < nDeadlockDetails; i++) + { + DEADLOCK_INFO *info = &deadlockDetails[i]; + + appendStringInfoChar(&logbuf, '\n'); + + appendStringInfo(&logbuf, + _("Process %d: %s"), + info->pid, + pgstat_get_backend_current_activity(info->pid, false)); + } + + pgstat_report_deadlock(); + + ereport(ERROR, + (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), + errmsg("deadlock detected"), + errdetail_internal("%s", clientbuf.data), + errdetail_log("%s", logbuf.data), + errhint("See server log for query details."))); +} + +/* + * RememberSimpleDeadLock: set up info for DeadLockReport when ProcSleep + * detects a trivial (two-way) deadlock. proc1 wants to block for lockmode + * on lock, but proc2 is already waiting and would be blocked by proc1. 
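[Editor's sketch] For reference, the client-side errdetail built above renders roughly like the following (PIDs and lock identities are made up; the object description comes from DescribeLockTag(), which is defined elsewhere):

    ERROR:  deadlock detected
    DETAIL:  Process 12345 waits for ShareLock on transaction 1001; blocked by process 12346.
    Process 12346 waits for ShareLock on transaction 1000; blocked by process 12345.
    HINT:  See server log for query details.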
+ */ +void +RememberSimpleDeadLock(PGPROC *proc1, + LOCKMODE lockmode, + LOCK *lock, + PGPROC *proc2) +{ + DEADLOCK_INFO *info = &deadlockDetails[0]; + + info->locktag = lock->tag; + info->lockmode = lockmode; + info->pid = proc1->pid; + info++; + info->locktag = proc2->waitLock->tag; + info->lockmode = proc2->waitLockMode; + info->pid = proc2->pid; + nDeadlockDetails = 2; +} diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl new file mode 100644 index 0000000..863c882 --- /dev/null +++ b/src/backend/storage/lmgr/generate-lwlocknames.pl @@ -0,0 +1,77 @@ +#!/usr/bin/perl +# +# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt +# Copyright (c) 2000-2023, PostgreSQL Global Development Group + +use strict; +use warnings; +use Getopt::Long; + +my $output_path = '.'; + +my $lastlockidx = -1; +my $continue = "\n"; + +GetOptions('outdir:s' => \$output_path); + +open my $lwlocknames, '<', $ARGV[0] or die; + +# Include PID in suffix in case parallel make runs this multiple times. +my $htmp = "$output_path/lwlocknames.h.tmp$$"; +my $ctmp = "$output_path/lwlocknames.c.tmp$$"; +open my $h, '>', $htmp or die "Could not open $htmp: $!"; +open my $c, '>', $ctmp or die "Could not open $ctmp: $!"; + +my $autogen = + "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n"; +print $h $autogen; +print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n"; +print $c $autogen, "\n"; + +print $c "const char *const IndividualLWLockNames[] = {"; + +while (<$lwlocknames>) +{ + chomp; + + # Skip comments + next if /^#/; + next if /^\s*$/; + + die "unable to parse lwlocknames.txt" + unless /^(\w+)\s+(\d+)$/; + + (my $lockname, my $lockidx) = ($1, $2); + + my $trimmedlockname = $lockname; + $trimmedlockname =~ s/Lock$//; + die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname; + + die "lwlocknames.txt not in order" if $lockidx < $lastlockidx; + die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx; + + while ($lastlockidx < $lockidx - 1) + { + ++$lastlockidx; + printf $c "%s \"\"", $continue, $lastlockidx; + $continue = ",\n"; + } + printf $c "%s \"%s\"", $continue, $trimmedlockname; + $lastlockidx = $lockidx; + $continue = ",\n"; + + print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n"; +} + +printf $c "\n};\n"; +print $h "\n"; +printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1; + +close $h; +close $c; + +rename($htmp, "$output_path/lwlocknames.h") + || die "rename: $htmp to $output_path/lwlocknames.h: $!"; +rename($ctmp, "$output_path/lwlocknames.c") || die "rename: $ctmp: $!"; + +close $lwlocknames; diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c new file mode 100644 index 0000000..ee9b89a --- /dev/null +++ b/src/backend/storage/lmgr/lmgr.c @@ -0,0 +1,1270 @@ +/*------------------------------------------------------------------------- + * + * lmgr.c + * POSTGRES lock manager code + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/lmgr.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/xact.h" +#include "catalog/catalog.h" +#include "commands/progress.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/lmgr.h" 
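[Editor's sketch] To illustrate what generate-lwlocknames.pl emits, suppose lwlocknames.txt assigns "ShmemIndexLock 1" with slot 0 left unassigned (the slot numbers here are an assumption for illustration). The generated files would then contain entries like:

    /* lwlocknames.h */
    #define ShmemIndexLock (&MainLWLockArray[1].lock)

    /* lwlocknames.c: slot 0 is filled with "" because no lock claims index 0 */
    const char *const IndividualLWLockNames[] = {
        "",
        "ShmemIndex",       /* trailing "Lock" stripped by the script */
        /* ... further entries ... */
    };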
+#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "utils/inval.h" + + +/* + * Per-backend counter for generating speculative insertion tokens. + * + * This may wrap around, but that's OK as it's only used for the short + * duration between inserting a tuple and checking that there are no (unique) + * constraint violations. It's theoretically possible that a backend sees a + * tuple that was speculatively inserted by another backend, but before it has + * started waiting on the token, the other backend completes its insertion, + * and then performs 2^32 unrelated insertions. And after all that, the + * first backend finally calls SpeculativeInsertionLockAcquire(), with the + * intention of waiting for the first insertion to complete, but ends up + * waiting for the latest unrelated insertion instead. Even then, nothing + * particularly bad happens: in the worst case they deadlock, causing one of + * the transactions to abort. + */ +static uint32 speculativeInsertionToken = 0; + + +/* + * Struct to hold context info for transaction lock waits. + * + * 'oper' is the operation that needs to wait for the other transaction; 'rel' + * and 'ctid' specify the address of the tuple being waited for. + */ +typedef struct XactLockTableWaitInfo +{ + XLTW_Oper oper; + Relation rel; + ItemPointer ctid; +} XactLockTableWaitInfo; + +static void XactLockTableWaitErrorCb(void *arg); + +/* + * RelationInitLockInfo + * Initializes the lock information in a relation descriptor. + * + * relcache.c must call this during creation of any reldesc. + */ +void +RelationInitLockInfo(Relation relation) +{ + Assert(RelationIsValid(relation)); + Assert(OidIsValid(RelationGetRelid(relation))); + + relation->rd_lockInfo.lockRelId.relId = RelationGetRelid(relation); + + if (relation->rd_rel->relisshared) + relation->rd_lockInfo.lockRelId.dbId = InvalidOid; + else + relation->rd_lockInfo.lockRelId.dbId = MyDatabaseId; +} + +/* + * SetLocktagRelationOid + * Set up a locktag for a relation, given only relation OID + */ +static inline void +SetLocktagRelationOid(LOCKTAG *tag, Oid relid) +{ + Oid dbid; + + if (IsSharedRelation(relid)) + dbid = InvalidOid; + else + dbid = MyDatabaseId; + + SET_LOCKTAG_RELATION(*tag, dbid, relid); +} + +/* + * LockRelationOid + * + * Lock a relation given only its OID. This should generally be used + * before attempting to open the relation's relcache entry. + */ +void +LockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SetLocktagRelationOid(&tag, relid); + + res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock); + + /* + * Now that we have the lock, check for invalidation messages, so that we + * will update or flush any stale relcache entry before we try to use it. + * RangeVarGetRelid() specifically relies on us for this. We can skip + * this in the not-uncommon case that we already had the same type of lock + * being requested, since then no one else could have modified the + * relcache entry in an undesirable way. (In the case where our own xact + * modifies the rel, the relcache update happens via + * CommandCounterIncrement, not here.) + * + * However, in corner cases where code acts on tables (usually catalogs) + * recursively, we might get here while still processing invalidation + * messages in some outer execution of this function or a sibling. The + * "cleared" status of the lock tells us whether we really are done + * absorbing relevant inval messages. 
+ */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } +} + +/* + * ConditionalLockRelationOid + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + * + * NOTE: we do not currently need conditional versions of all the + * LockXXX routines in this file, but they could easily be added if needed. + */ +bool +ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SetLocktagRelationOid(&tag, relid); + + res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock); + + if (res == LOCKACQUIRE_NOT_AVAIL) + return false; + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } + + return true; +} + +/* + * LockRelationId + * + * Lock, given a LockRelId. Same as LockRelationOid but take LockRelId as an + * input. + */ +void +LockRelationId(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock); + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } +} + +/* + * UnlockRelationId + * + * Unlock, given a LockRelId. This is preferred over UnlockRelationOid + * for speed reasons. + */ +void +UnlockRelationId(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * UnlockRelationOid + * + * Unlock, given only a relation Oid. Use UnlockRelationId if you can. + */ +void +UnlockRelationOid(Oid relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SetLocktagRelationOid(&tag, relid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockRelation + * + * This is a convenience routine for acquiring an additional lock on an + * already-open relation. Never try to do "relation_open(foo, NoLock)" + * and then lock with this. + */ +void +LockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock); + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } +} + +/* + * ConditionalLockRelation + * + * This is a convenience routine for acquiring an additional lock on an + * already-open relation. Never try to do "relation_open(foo, NoLock)" + * and then lock with this. 
+ */ +bool +ConditionalLockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + LOCALLOCK *locallock; + LockAcquireResult res; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock); + + if (res == LOCKACQUIRE_NOT_AVAIL) + return false; + + /* + * Now that we have the lock, check for invalidation messages; see notes + * in LockRelationOid. + */ + if (res != LOCKACQUIRE_ALREADY_CLEAR) + { + AcceptInvalidationMessages(); + MarkLockClear(locallock); + } + + return true; +} + +/* + * UnlockRelation + * + * This is a convenience routine for unlocking a relation without also + * closing it. + */ +void +UnlockRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * CheckRelationLockedByMe + * + * Returns true if current transaction holds a lock on 'relation' of mode + * 'lockmode'. If 'orstronger' is true, a stronger lockmode is also OK. + * ("Stronger" is defined as "numerically higher", which is a bit + * semantically dubious but is OK for the purposes we use this for.) + */ +bool +CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + if (LockHeldByMe(&tag, lockmode)) + return true; + + if (orstronger) + { + LOCKMODE slockmode; + + for (slockmode = lockmode + 1; + slockmode <= MaxLockMode; + slockmode++) + { + if (LockHeldByMe(&tag, slockmode)) + { +#ifdef NOT_USED + /* Sometimes this might be useful for debugging purposes */ + elog(WARNING, "lock mode %s substituted for %s on relation %s", + GetLockmodeName(tag.locktag_lockmethodid, slockmode), + GetLockmodeName(tag.locktag_lockmethodid, lockmode), + RelationGetRelationName(relation)); +#endif + return true; + } + } + } + + return false; +} + +/* + * LockHasWaitersRelation + * + * This is a function to check whether someone else is waiting for a + * lock which we are currently holding. + */ +bool +LockHasWaitersRelation(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return LockHasWaiters(&tag, lockmode, false); +} + +/* + * LockRelationIdForSession + * + * This routine grabs a session-level lock on the target relation. The + * session lock persists across transaction boundaries. It will be removed + * when UnlockRelationIdForSession() is called, or if an ereport(ERROR) occurs, + * or if the backend exits. + * + * Note that one should also grab a transaction-level lock on the rel + * in any transaction that actually uses the rel, to ensure that the + * relcache entry is up to date. + */ +void +LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + (void) LockAcquire(&tag, lockmode, true, false); +} + +/* + * UnlockRelationIdForSession + */ +void +UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId); + + LockRelease(&tag, lockmode, true); +} + +/* + * LockRelationForExtension + * + * This lock tag is used to interlock addition of pages to relations. 
+ * We need such locking because bufmgr/smgr definition of P_NEW is not + * race-condition-proof. + * + * We assume the caller is already holding some type of regular lock on + * the relation, so no AcceptInvalidationMessages call is needed here. + */ +void +LockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockRelationForExtension + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * RelationExtensionLockWaiterCount + * + * Count the number of processes waiting for the given relation extension lock. + */ +int +RelationExtensionLockWaiterCount(Relation relation) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + return LockWaiterCount(&tag); +} + +/* + * UnlockRelationForExtension + */ +void +UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_RELATION_EXTEND(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockDatabaseFrozenIds + * + * This allows one backend per database to execute vac_update_datfrozenxid(). + */ +void +LockDatabaseFrozenIds(LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * LockPage + * + * Obtain a page-level lock. This is currently used by some index access + * methods to lock individual index pages. + */ +void +LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockPage + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * UnlockPage + */ +void +UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_PAGE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + blkno); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockTuple + * + * Obtain a tuple-level lock. This is used in a less-than-intuitive fashion + * because we can't afford to keep a separate lock in shared memory for every + * tuple. See heap_lock_tuple before using this! 
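[Editor's sketch] A hedged sketch of the traditional extension-lock idiom these wrappers support; newer code often goes through ExtendBufferedRel() instead, so treat this purely as an illustration of the locking rule, namely that the lock is held only across the P_NEW allocation and not while doing real work:

    Buffer      buf;

    LockRelationForExtension(rel, ExclusiveLock);
    buf = ReadBuffer(rel, P_NEW);                   /* allocate and pin a new block */
    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
    UnlockRelationForExtension(rel, ExclusiveLock); /* release before initializing the page */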
+ */ +void +LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + (void) LockAcquire(&tag, lockmode, false, false); +} + +/* + * ConditionalLockTuple + * + * As above, but only lock if we can get the lock without blocking. + * Returns true iff the lock was acquired. + */ +bool +ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); +} + +/* + * UnlockTuple + */ +void +UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_TUPLE(tag, + relation->rd_lockInfo.lockRelId.dbId, + relation->rd_lockInfo.lockRelId.relId, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + + LockRelease(&tag, lockmode, false); +} + +/* + * XactLockTableInsert + * + * Insert a lock showing that the given transaction ID is running --- + * this is done when an XID is acquired by a transaction or subtransaction. + * The lock can then be used to wait for the transaction to finish. + */ +void +XactLockTableInsert(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_TRANSACTION(tag, xid); + + (void) LockAcquire(&tag, ExclusiveLock, false, false); +} + +/* + * XactLockTableDelete + * + * Delete the lock showing that the given transaction ID is running. + * (This is never used for main transaction IDs; those locks are only + * released implicitly at transaction end. But we do use it for subtrans IDs.) + */ +void +XactLockTableDelete(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_TRANSACTION(tag, xid); + + LockRelease(&tag, ExclusiveLock, false); +} + +/* + * XactLockTableWait + * + * Wait for the specified transaction to commit or abort. If an operation + * is specified, an error context callback is set up. If 'oper' is passed as + * None, no error context callback is set up. + * + * Note that this does the right thing for subtransactions: if we wait on a + * subtransaction, we will exit as soon as it aborts or its top parent commits. + * It takes some extra work to ensure this, because to save on shared memory + * the XID lock of a subtransaction is released when it ends, whether + * successfully or unsuccessfully. So we have to check if it's "still running" + * and if so wait for its parent. + */ +void +XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid, + XLTW_Oper oper) +{ + LOCKTAG tag; + XactLockTableWaitInfo info; + ErrorContextCallback callback; + bool first = true; + + /* + * If an operation is specified, set up our verbose error context + * callback. 
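[Editor's sketch] A hedged sketch of the common tuple-wait idiom built on XactLockTableWait(); heapam's update and delete paths do roughly this when they find a tuple locked by a concurrent transaction. Here xwait, relation and tuple stand in for the caller's state:

    if (!TransactionIdIsCurrentTransactionId(xwait))
    {
        /* sleep until the other transaction commits or aborts */
        XactLockTableWait(xwait, relation, &tuple->t_self, XLTW_Update);
        /* on return, re-fetch the tuple and recheck its visibility */
    }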
+ */ + if (oper != XLTW_None) + { + Assert(RelationIsValid(rel)); + Assert(ItemPointerIsValid(ctid)); + + info.rel = rel; + info.ctid = ctid; + info.oper = oper; + + callback.callback = XactLockTableWaitErrorCb; + callback.arg = &info; + callback.previous = error_context_stack; + error_context_stack = &callback; + } + + for (;;) + { + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + SET_LOCKTAG_TRANSACTION(tag, xid); + + (void) LockAcquire(&tag, ShareLock, false, false); + + LockRelease(&tag, ShareLock, false); + + if (!TransactionIdIsInProgress(xid)) + break; + + /* + * If the Xid belonged to a subtransaction, then the lock would have + * gone away as soon as it was finished; for correct tuple visibility, + * the right action is to wait on its parent transaction to go away. + * But instead of going levels up one by one, we can just wait for the + * topmost transaction to finish with the same end result, which also + * incurs less locktable traffic. + * + * Some uses of this function don't involve tuple visibility -- such + * as when building snapshots for logical decoding. It is possible to + * see a transaction in ProcArray before it registers itself in the + * locktable. The topmost transaction in that case is the same xid, + * so we try again after a short sleep. (Don't sleep the first time + * through, to avoid slowing down the normal case.) + */ + if (!first) + pg_usleep(1000L); + first = false; + xid = SubTransGetTopmostTransaction(xid); + } + + if (oper != XLTW_None) + error_context_stack = callback.previous; +} + +/* + * ConditionalXactLockTableWait + * + * As above, but only lock if we can get the lock without blocking. + * Returns true if the lock was acquired. + */ +bool +ConditionalXactLockTableWait(TransactionId xid) +{ + LOCKTAG tag; + bool first = true; + + for (;;) + { + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + SET_LOCKTAG_TRANSACTION(tag, xid); + + if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL) + return false; + + LockRelease(&tag, ShareLock, false); + + if (!TransactionIdIsInProgress(xid)) + break; + + /* See XactLockTableWait about this case */ + if (!first) + pg_usleep(1000L); + first = false; + xid = SubTransGetTopmostTransaction(xid); + } + + return true; +} + +/* + * SpeculativeInsertionLockAcquire + * + * Insert a lock showing that the given transaction ID is inserting a tuple, + * but hasn't yet decided whether it's going to keep it. The lock can then be + * used to wait for the decision to go ahead with the insertion, or aborting + * it. + * + * The token is used to distinguish multiple insertions by the same + * transaction. It is returned to caller. + */ +uint32 +SpeculativeInsertionLockAcquire(TransactionId xid) +{ + LOCKTAG tag; + + speculativeInsertionToken++; + + /* + * Check for wrap-around. Zero means no token is held, so don't use that. + */ + if (speculativeInsertionToken == 0) + speculativeInsertionToken = 1; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + (void) LockAcquire(&tag, ExclusiveLock, false, false); + + return speculativeInsertionToken; +} + +/* + * SpeculativeInsertionLockRelease + * + * Delete the lock showing that the given transaction is speculatively + * inserting a tuple. 
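XactLockTableWait above is typically paired with the tuple-level lock calls earlier in this file: a backend that finds its target tuple being modified queues on the tuple lock first, then sleeps on the updater's XID. A hedged sketch of that pattern follows; wait_for_updater_example is hypothetical, and real callers (such as the heap access method) also recheck tuple visibility afterwards.

/*
 * Hypothetical sketch of the usual "wait for a concurrent updater" pattern.
 */
static void
wait_for_updater_example(Relation rel, ItemPointer tid, TransactionId xwait)
{
    /* Queue behind other would-be lockers of this tuple. */
    LockTuple(rel, tid, ExclusiveLock);

    /* Sleep until the updating transaction commits or aborts. */
    XactLockTableWait(xwait, rel, tid, XLTW_Update);

    UnlockTuple(rel, tid, ExclusiveLock);

    /* The caller must now re-fetch the tuple and re-test its visibility. */
}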
+ */ +void +SpeculativeInsertionLockRelease(TransactionId xid) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken); + + LockRelease(&tag, ExclusiveLock, false); +} + +/* + * SpeculativeInsertionWait + * + * Wait for the specified transaction to finish or abort the insertion of a + * tuple. + */ +void +SpeculativeInsertionWait(TransactionId xid, uint32 token) +{ + LOCKTAG tag; + + SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token); + + Assert(TransactionIdIsValid(xid)); + Assert(token != 0); + + (void) LockAcquire(&tag, ShareLock, false, false); + LockRelease(&tag, ShareLock, false); +} + +/* + * XactLockTableWaitErrorCb + * Error context callback for transaction lock waits. + */ +static void +XactLockTableWaitErrorCb(void *arg) +{ + XactLockTableWaitInfo *info = (XactLockTableWaitInfo *) arg; + + /* + * We would like to print schema name too, but that would require a + * syscache lookup. + */ + if (info->oper != XLTW_None && + ItemPointerIsValid(info->ctid) && RelationIsValid(info->rel)) + { + const char *cxt; + + switch (info->oper) + { + case XLTW_Update: + cxt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_Delete: + cxt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_Lock: + cxt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_LockUpdated: + cxt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\""); + break; + case XLTW_InsertIndex: + cxt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_InsertIndexUnique: + cxt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_FetchUpdated: + cxt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\""); + break; + case XLTW_RecheckExclusionConstr: + cxt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\""); + break; + + default: + return; + } + + errcontext(cxt, + ItemPointerGetBlockNumber(info->ctid), + ItemPointerGetOffsetNumber(info->ctid), + RelationGetRelationName(info->rel)); + } +} + +/* + * WaitForLockersMultiple + * Wait until no transaction holds locks that conflict with the given + * locktags at the given lockmode. + * + * To do this, obtain the current list of lockers, and wait on their VXIDs + * until they are finished. + * + * Note we don't try to acquire the locks on the given locktags, only the + * VXIDs and XIDs of their lock holders; if somebody grabs a conflicting lock + * on the objects after we obtained our initial list of lockers, we will not + * wait for them. + */ +void +WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress) +{ + List *holders = NIL; + ListCell *lc; + int total = 0; + int done = 0; + + /* Done if no locks to wait for */ + if (locktags == NIL) + return; + + /* Collect the transactions we need to wait on */ + foreach(lc, locktags) + { + LOCKTAG *locktag = lfirst(lc); + int count; + + holders = lappend(holders, + GetLockConflicts(locktag, lockmode, + progress ? &count : NULL)); + if (progress) + total += count; + } + + if (progress) + pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, total); + + /* + * Note: GetLockConflicts() never reports our own xid, hence we need not + * check for that. Also, prepared xacts are reported and awaited. 
+ */ + + /* Finally wait for each such transaction to complete */ + foreach(lc, holders) + { + VirtualTransactionId *lockholders = lfirst(lc); + + while (VirtualTransactionIdIsValid(*lockholders)) + { + /* If requested, publish who we're going to wait for. */ + if (progress) + { + PGPROC *holder = BackendIdGetProc(lockholders->backendId); + + if (holder) + pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID, + holder->pid); + } + VirtualXactLock(*lockholders, true); + lockholders++; + + if (progress) + pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++done); + } + } + if (progress) + { + const int index[] = { + PROGRESS_WAITFOR_TOTAL, + PROGRESS_WAITFOR_DONE, + PROGRESS_WAITFOR_CURRENT_PID + }; + const int64 values[] = { + 0, 0, 0 + }; + + pgstat_progress_update_multi_param(3, index, values); + } + + list_free_deep(holders); +} + +/* + * WaitForLockers + * + * Same as WaitForLockersMultiple, for a single lock tag. + */ +void +WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress) +{ + List *l; + + l = list_make1(&heaplocktag); + WaitForLockersMultiple(l, lockmode, progress); + list_free(l); +} + + +/* + * LockDatabaseObject + * + * Obtain a lock on a general object of the current database. Don't use + * this for shared objects (such as tablespaces). It's unwise to apply it + * to relations, also, since a lock taken this way will NOT conflict with + * locks taken via LockRelation and friends. + */ +void +LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + MyDatabaseId, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, false, false); + + /* Make sure syscaches are up-to-date with any changes we waited for */ + AcceptInvalidationMessages(); +} + +/* + * UnlockDatabaseObject + */ +void +UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + MyDatabaseId, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockSharedObject + * + * Obtain a lock on a shared-across-databases object. + */ +void +LockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, false, false); + + /* Make sure syscaches are up-to-date with any changes we waited for */ + AcceptInvalidationMessages(); +} + +/* + * UnlockSharedObject + */ +void +UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, false); +} + +/* + * LockSharedObjectForSession + * + * Obtain a session-level lock on a shared-across-databases object. + * See LockRelationIdForSession for notes about session-level locks. 
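WaitForLockers and WaitForLockersMultiple above are usually called with a relation lock tag, for example by operations that must wait out every transaction still holding a conflicting relation lock. A minimal sketch, assuming the caller already knows the heap's database and relation OIDs (dbid and relid are hypothetical parameters; SET_LOCKTAG_RELATION is the standard locktag macro):

/*
 * Hypothetical sketch: wait until no transaction holds a lock on the given
 * heap that conflicts with ShareLock, reporting progress while waiting.
 */
static void
wait_out_heap_lockers_example(Oid dbid, Oid relid)
{
    LOCKTAG     heaplocktag;

    SET_LOCKTAG_RELATION(heaplocktag, dbid, relid);
    WaitForLockers(heaplocktag, ShareLock, true);
}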
+ */ +void +LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + (void) LockAcquire(&tag, lockmode, true, false); +} + +/* + * UnlockSharedObjectForSession + */ +void +UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_OBJECT(tag, + InvalidOid, + classid, + objid, + objsubid); + + LockRelease(&tag, lockmode, true); +} + +/* + * LockApplyTransactionForSession + * + * Obtain a session-level lock on a transaction being applied on a logical + * replication subscriber. See LockRelationIdForSession for notes about + * session-level locks. + */ +void +LockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_APPLY_TRANSACTION(tag, + MyDatabaseId, + suboid, + xid, + objid); + + (void) LockAcquire(&tag, lockmode, true, false); +} + +/* + * UnlockApplyTransactionForSession + */ +void +UnlockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid, + LOCKMODE lockmode) +{ + LOCKTAG tag; + + SET_LOCKTAG_APPLY_TRANSACTION(tag, + MyDatabaseId, + suboid, + xid, + objid); + + LockRelease(&tag, lockmode, true); +} + +/* + * Append a description of a lockable object to buf. + * + * Ideally we would print names for the numeric values, but that requires + * getting locks on system tables, which might cause problems since this is + * typically used to report deadlock situations. + */ +void +DescribeLockTag(StringInfo buf, const LOCKTAG *tag) +{ + switch ((LockTagType) tag->locktag_type) + { + case LOCKTAG_RELATION: + appendStringInfo(buf, + _("relation %u of database %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_RELATION_EXTEND: + appendStringInfo(buf, + _("extension of relation %u of database %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_DATABASE_FROZEN_IDS: + appendStringInfo(buf, + _("pg_database.datfrozenxid of database %u"), + tag->locktag_field1); + break; + case LOCKTAG_PAGE: + appendStringInfo(buf, + _("page %u of relation %u of database %u"), + tag->locktag_field3, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_TUPLE: + appendStringInfo(buf, + _("tuple (%u,%u) of relation %u of database %u"), + tag->locktag_field3, + tag->locktag_field4, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_TRANSACTION: + appendStringInfo(buf, + _("transaction %u"), + tag->locktag_field1); + break; + case LOCKTAG_VIRTUALTRANSACTION: + appendStringInfo(buf, + _("virtual transaction %d/%u"), + tag->locktag_field1, + tag->locktag_field2); + break; + case LOCKTAG_SPECULATIVE_TOKEN: + appendStringInfo(buf, + _("speculative token %u of transaction %u"), + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_OBJECT: + appendStringInfo(buf, + _("object %u of class %u of database %u"), + tag->locktag_field3, + tag->locktag_field2, + tag->locktag_field1); + break; + case LOCKTAG_USERLOCK: + /* reserved for old contrib code, now on pgfoundry */ + appendStringInfo(buf, + _("user lock [%u,%u,%u]"), + tag->locktag_field1, + tag->locktag_field2, + tag->locktag_field3); + break; + case LOCKTAG_ADVISORY: + appendStringInfo(buf, + _("advisory lock [%u,%u,%u,%u]"), + tag->locktag_field1, + tag->locktag_field2, + tag->locktag_field3, + tag->locktag_field4); + break; + case LOCKTAG_APPLY_TRANSACTION: + appendStringInfo(buf, + 
_("remote transaction %u of subscription %u of database %u"), + tag->locktag_field3, + tag->locktag_field2, + tag->locktag_field1); + break; + default: + appendStringInfo(buf, + _("unrecognized locktag type %d"), + (int) tag->locktag_type); + break; + } +} + +/* + * GetLockNameFromTagType + * + * Given locktag type, return the corresponding lock name. + */ +const char * +GetLockNameFromTagType(uint16 locktag_type) +{ + if (locktag_type > LOCKTAG_LAST_TYPE) + return "???"; + return LockTagTypeNames[locktag_type]; +} diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c new file mode 100644 index 0000000..ec6240f --- /dev/null +++ b/src/backend/storage/lmgr/lock.c @@ -0,0 +1,4651 @@ +/*------------------------------------------------------------------------- + * + * lock.c + * POSTGRES primary lock mechanism + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/lock.c + * + * NOTES + * A lock table is a shared memory hash table. When + * a process tries to acquire a lock of a type that conflicts + * with existing locks, it is put to sleep using the routines + * in storage/lmgr/proc.c. + * + * For the most part, this code should be invoked via lmgr.c + * or another lock-management module, not directly. + * + * Interface: + * + * InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(), + * LockAcquire(), LockRelease(), LockReleaseAll(), + * LockCheckConflicts(), GrantLock() + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/resowner_private.h" + + +/* This configuration variable is used to set the lock table size */ +int max_locks_per_xact; /* set by guc.c */ + +#define NLOCKENTS() \ + mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + + +/* + * Data structures defining the semantics of the standard lock methods. + * + * The conflict table defines the semantics of the various lock modes. 
+ */ +static const LOCKMASK LockConflicts[] = { + 0, + + /* AccessShareLock */ + LOCKBIT_ON(AccessExclusiveLock), + + /* RowShareLock */ + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* RowExclusiveLock */ + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareUpdateExclusiveLock */ + LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareLock */ + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ShareRowExclusiveLock */ + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* ExclusiveLock */ + LOCKBIT_ON(RowShareLock) | + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock), + + /* AccessExclusiveLock */ + LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) | + LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) | + LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) | + LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock) + +}; + +/* Names of lock modes, for debug printouts */ +static const char *const lock_mode_names[] = +{ + "INVALID", + "AccessShareLock", + "RowShareLock", + "RowExclusiveLock", + "ShareUpdateExclusiveLock", + "ShareLock", + "ShareRowExclusiveLock", + "ExclusiveLock", + "AccessExclusiveLock" +}; + +#ifndef LOCK_DEBUG +static bool Dummy_trace = false; +#endif + +static const LockMethodData default_lockmethod = { + MaxLockMode, + LockConflicts, + lock_mode_names, +#ifdef LOCK_DEBUG + &Trace_locks +#else + &Dummy_trace +#endif +}; + +static const LockMethodData user_lockmethod = { + MaxLockMode, + LockConflicts, + lock_mode_names, +#ifdef LOCK_DEBUG + &Trace_userlocks +#else + &Dummy_trace +#endif +}; + +/* + * map from lock method id to the lock table data structures + */ +static const LockMethod LockMethods[] = { + NULL, + &default_lockmethod, + &user_lockmethod +}; + + +/* Record that's written to 2PC state file when a lock is persisted */ +typedef struct TwoPhaseLockRecord +{ + LOCKTAG locktag; + LOCKMODE lockmode; +} TwoPhaseLockRecord; + + +/* + * Count of the number of fast path lock slots we believe to be used. This + * might be higher than the real number if another backend has transferred + * our locks to the primary lock table, but it can never be lower than the + * real value, since only we can acquire locks on our own behalf. + */ +static int FastPathLocalUseCount = 0; + +/* + * Flag to indicate if the relation extension lock is held by this backend. + * This flag is used to ensure that while holding the relation extension lock + * we don't try to acquire a heavyweight lock on any other object. This + * restriction implies that the relation extension lock won't ever participate + * in the deadlock cycle because we can never wait for any other heavyweight + * lock after acquiring this lock. + * + * Such a restriction is okay for relation extension locks as unlike other + * heavyweight locks these are not held till the transaction end. 
These are + * taken for a short duration to extend a particular relation and then + * released. + */ +static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false; + +/* Macros for manipulating proc->fpLockBits */ +#define FAST_PATH_BITS_PER_SLOT 3 +#define FAST_PATH_LOCKNUMBER_OFFSET 1 +#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1) +#define FAST_PATH_GET_BITS(proc, n) \ + (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK) +#define FAST_PATH_BIT_POSITION(n, l) \ + (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \ + AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \ + ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n))) +#define FAST_PATH_SET_LOCKMODE(proc, n, l) \ + (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l) +#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \ + (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)) +#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \ + ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))) + +/* + * The fast-path lock mechanism is concerned only with relation locks on + * unshared relations by backends bound to a database. The fast-path + * mechanism exists mostly to accelerate acquisition and release of locks + * that rarely conflict. Because ShareUpdateExclusiveLock is + * self-conflicting, it can't use the fast-path mechanism; but it also does + * not conflict with any of the locks that do, so we can ignore it completely. + */ +#define EligibleForRelationFastPath(locktag, mode) \ + ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 == MyDatabaseId && \ + MyDatabaseId != InvalidOid && \ + (mode) < ShareUpdateExclusiveLock) +#define ConflictsWithRelationFastPath(locktag, mode) \ + ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \ + (locktag)->locktag_type == LOCKTAG_RELATION && \ + (locktag)->locktag_field1 != InvalidOid && \ + (mode) > ShareUpdateExclusiveLock) + +static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode); +static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode); +static bool FastPathTransferRelationLocks(LockMethod lockMethodTable, + const LOCKTAG *locktag, uint32 hashcode); +static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock); + +/* + * To make the fast-path lock mechanism work, we must have some way of + * preventing the use of the fast-path when a conflicting lock might be present. + * We partition* the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS, + * and maintain an integer count of the number of "strong" lockers + * in each partition. When any "strong" lockers are present (which is + * hopefully not very often), the fast-path mechanism can't be used, and we + * must fall back to the slower method of pushing matching locks directly + * into the main lock tables. + * + * The deadlock detector does not know anything about the fast path mechanism, + * so any locks that might be involved in a deadlock must be transferred from + * the fast-path queues to the main lock table. 
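The FAST_PATH_* macros above pack one bit per eligible lock mode into three bits per fast-path slot of a single 64-bit word. The program below is a self-contained re-implementation of just that encoding, written only to illustrate the bit arithmetic; it is not backend code, and its constants merely mirror the macros above.

/* Self-contained illustration of the 3-bits-per-slot fast-path encoding. */
#include <stdint.h>
#include <stdio.h>

#define BITS_PER_SLOT      3
#define LOCKNUMBER_OFFSET  1

static int
bit_position(int slot, int lockmode)
{
    return (lockmode - LOCKNUMBER_OFFSET) + BITS_PER_SLOT * slot;
}

int
main(void)
{
    uint64_t    fpLockBits = 0;

    /* "Grant" mode 1 in slot 0 and mode 3 in slot 5. */
    fpLockBits |= UINT64_C(1) << bit_position(0, 1);
    fpLockBits |= UINT64_C(1) << bit_position(5, 3);

    printf("slot 5 holds mode 3: %d\n",
           (fpLockBits & (UINT64_C(1) << bit_position(5, 3))) != 0);
    printf("slot 0 holds mode 2: %d\n",
           (fpLockBits & (UINT64_C(1) << bit_position(0, 2))) != 0);
    return 0;
}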
+ */ + +#define FAST_PATH_STRONG_LOCK_HASH_BITS 10 +#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \ + (1 << FAST_PATH_STRONG_LOCK_HASH_BITS) +#define FastPathStrongLockHashPartition(hashcode) \ + ((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS) + +typedef struct +{ + slock_t mutex; + uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS]; +} FastPathStrongRelationLockData; + +static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks; + + +/* + * Pointers to hash tables containing lock state + * + * The LockMethodLockHash and LockMethodProcLockHash hash tables are in + * shared memory; LockMethodLocalHash is local to each backend. + */ +static HTAB *LockMethodLockHash; +static HTAB *LockMethodProcLockHash; +static HTAB *LockMethodLocalHash; + + +/* private state for error cleanup */ +static LOCALLOCK *StrongLockInProgress; +static LOCALLOCK *awaitedLock; +static ResourceOwner awaitedOwner; + + +#ifdef LOCK_DEBUG + +/*------ + * The following configuration options are available for lock debugging: + * + * TRACE_LOCKS -- give a bunch of output what's going on in this file + * TRACE_USERLOCKS -- same but for user locks + * TRACE_LOCK_OIDMIN-- do not trace locks for tables below this oid + * (use to avoid output on system tables) + * TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally + * DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;) + * + * Furthermore, but in storage/lmgr/lwlock.c: + * TRACE_LWLOCKS -- trace lightweight locks (pretty useless) + * + * Define LOCK_DEBUG at compile time to get all these enabled. + * -------- + */ + +int Trace_lock_oidmin = FirstNormalObjectId; +bool Trace_locks = false; +bool Trace_userlocks = false; +int Trace_lock_table = 0; +bool Debug_deadlocks = false; + + +inline static bool +LOCK_DEBUG_ENABLED(const LOCKTAG *tag) +{ + return + (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) && + ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin)) + || (Trace_lock_table && + (tag->locktag_field2 == Trace_lock_table)); +} + + +inline static void +LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type) +{ + if (LOCK_DEBUG_ENABLED(&lock->tag)) + elog(LOG, + "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) " + "req(%d,%d,%d,%d,%d,%d,%d)=%d " + "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)", + where, lock, + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3, lock->tag.locktag_field4, + lock->tag.locktag_type, lock->tag.locktag_lockmethodid, + lock->grantMask, + lock->requested[1], lock->requested[2], lock->requested[3], + lock->requested[4], lock->requested[5], lock->requested[6], + lock->requested[7], lock->nRequested, + lock->granted[1], lock->granted[2], lock->granted[3], + lock->granted[4], lock->granted[5], lock->granted[6], + lock->granted[7], lock->nGranted, + dclist_count(&lock->waitProcs), + LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]); +} + + +inline static void +PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP) +{ + if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag)) + elog(LOG, + "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)", + where, proclockP, proclockP->tag.myLock, + PROCLOCK_LOCKMETHOD(*(proclockP)), + proclockP->tag.myProc, (int) proclockP->holdMask); +} +#else /* not LOCK_DEBUG */ + +#define LOCK_PRINT(where, lock, type) ((void) 0) +#define PROCLOCK_PRINT(where, proclockP) ((void) 0) +#endif /* not LOCK_DEBUG */ + + +static uint32 proclock_hash(const void *key, Size keysize); +static void RemoveLocalLock(LOCALLOCK 
*locallock); +static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, + const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode); +static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner); +static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode); +static void FinishStrongLockAcquire(void); +static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner); +static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock); +static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent); +static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode, + PROCLOCK *proclock, LockMethod lockMethodTable); +static void CleanUpLock(LOCK *lock, PROCLOCK *proclock, + LockMethod lockMethodTable, uint32 hashcode, + bool wakeupNeeded); +static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, + LOCKTAG *locktag, LOCKMODE lockmode, + bool decrement_strong_lock_count); +static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc, + BlockedProcsData *data); + + +/* + * InitLocks -- Initialize the lock manager's data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here, as well as a locallock hash table that will remain + * unused and empty in the postmaster itself. Backends inherit the pointers + * to the shared tables via fork(), and also inherit an image of the locallock + * hash table, which they proceed to use. In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables and to create its locallock hash table. + */ +void +InitLocks(void) +{ + HASHCTL info; + long init_table_size, + max_table_size; + bool found; + + /* + * Compute init/max size to request for lock hashtables. Note these + * calculations must agree with LockShmemSize! + */ + max_table_size = NLOCKENTS(); + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for LOCK structs. This stores per-locked-object + * information. + */ + info.keysize = sizeof(LOCKTAG); + info.entrysize = sizeof(LOCK); + info.num_partitions = NUM_LOCK_PARTITIONS; + + LockMethodLockHash = ShmemInitHash("LOCK hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); + + /* Assume an average of 2 holders per lock */ + max_table_size *= 2; + init_table_size *= 2; + + /* + * Allocate hash table for PROCLOCK structs. This stores + * per-lock-per-holder information. + */ + info.keysize = sizeof(PROCLOCKTAG); + info.entrysize = sizeof(PROCLOCK); + info.hash = proclock_hash; + info.num_partitions = NUM_LOCK_PARTITIONS; + + LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + /* + * Allocate fast-path structures. + */ + FastPathStrongRelationLocks = + ShmemInitStruct("Fast Path Strong Relation Lock Data", + sizeof(FastPathStrongRelationLockData), &found); + if (!found) + SpinLockInit(&FastPathStrongRelationLocks->mutex); + + /* + * Allocate non-shared hash table for LOCALLOCK structs. This stores lock + * counts and resource owner information. + * + * The non-shared table could already exist in this process (this occurs + * when the postmaster is recreating shared memory after a backend crash). + * If so, delete and recreate it. (We could simply leave it, since it + * ought to be empty in the postmaster, but for safety let's zap it.) 
+ */ + if (LockMethodLocalHash) + hash_destroy(LockMethodLocalHash); + + info.keysize = sizeof(LOCALLOCKTAG); + info.entrysize = sizeof(LOCALLOCK); + + LockMethodLocalHash = hash_create("LOCALLOCK hash", + 16, + &info, + HASH_ELEM | HASH_BLOBS); +} + + +/* + * Fetch the lock method table associated with a given lock + */ +LockMethod +GetLocksMethodTable(const LOCK *lock) +{ + LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*lock); + + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + return LockMethods[lockmethodid]; +} + +/* + * Fetch the lock method table associated with a given locktag + */ +LockMethod +GetLockTagsMethodTable(const LOCKTAG *locktag) +{ + LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid; + + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + return LockMethods[lockmethodid]; +} + + +/* + * Compute the hash code associated with a LOCKTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ +uint32 +LockTagHashCode(const LOCKTAG *locktag) +{ + return get_hash_value(LockMethodLockHash, (const void *) locktag); +} + +/* + * Compute the hash code associated with a PROCLOCKTAG. + * + * Because we want to use just one set of partition locks for both the + * LOCK and PROCLOCK hash tables, we have to make sure that PROCLOCKs + * fall into the same partition number as their associated LOCKs. + * dynahash.c expects the partition number to be the low-order bits of + * the hash code, and therefore a PROCLOCKTAG's hash code must have the + * same low-order bits as the associated LOCKTAG's hash code. We achieve + * this with this specialized hash function. + */ +static uint32 +proclock_hash(const void *key, Size keysize) +{ + const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key; + uint32 lockhash; + Datum procptr; + + Assert(keysize == sizeof(PROCLOCKTAG)); + + /* Look into the associated LOCK object, and compute its hash code */ + lockhash = LockTagHashCode(&proclocktag->myLock->tag); + + /* + * To make the hash code also depend on the PGPROC, we xor the proc + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ + procptr = PointerGetDatum(proclocktag->myProc); + lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + + return lockhash; +} + +/* + * Compute the hash code associated with a PROCLOCKTAG, given the hashcode + * for its underlying LOCK. + * + * We use this just to avoid redundant calls of LockTagHashCode(). + */ +static inline uint32 +ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode) +{ + uint32 lockhash = hashcode; + Datum procptr; + + /* + * This must match proclock_hash()! + */ + procptr = PointerGetDatum(proclocktag->myProc); + lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS; + + return lockhash; +} + +/* + * Given two lock modes, return whether they would conflict. 
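DoLockModesConflict below is a single bitwise AND against the conflict table defined earlier in this file. The following self-contained sketch shows the same technique with a three-mode toy table; it is illustrative only, and its mode names are invented rather than PostgreSQL's.

/* Toy re-implementation of the conflict-bitmask check, for illustration. */
#include <stdbool.h>
#include <stdio.h>

#define LOCKBIT_ON(m)  (1 << (m))

enum toy_mode
{
    TOY_ACCESS_SHARE = 1,
    TOY_EXCLUSIVE = 2,
    TOY_ACCESS_EXCLUSIVE = 3
};

/* toy_conflicts[m] has bit n set iff mode m conflicts with mode n. */
static const int toy_conflicts[] = {
    0,
    LOCKBIT_ON(TOY_ACCESS_EXCLUSIVE),                             /* AccessShare */
    LOCKBIT_ON(TOY_EXCLUSIVE) | LOCKBIT_ON(TOY_ACCESS_EXCLUSIVE), /* Exclusive */
    LOCKBIT_ON(TOY_ACCESS_SHARE) | LOCKBIT_ON(TOY_EXCLUSIVE) |
    LOCKBIT_ON(TOY_ACCESS_EXCLUSIVE)                              /* AccessExclusive */
};

static bool
toy_modes_conflict(int mode1, int mode2)
{
    return (toy_conflicts[mode1] & LOCKBIT_ON(mode2)) != 0;
}

int
main(void)
{
    printf("AccessShare vs Exclusive:       %d\n",
           toy_modes_conflict(TOY_ACCESS_SHARE, TOY_EXCLUSIVE));        /* 0 */
    printf("AccessShare vs AccessExclusive: %d\n",
           toy_modes_conflict(TOY_ACCESS_SHARE, TOY_ACCESS_EXCLUSIVE)); /* 1 */
    return 0;
}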
+ */ +bool +DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2) +{ + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + + if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2)) + return true; + + return false; +} + +/* + * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode' + * by the current transaction + */ +bool +LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode) +{ + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + + /* + * See if there is a LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + &localtag, + HASH_FIND, NULL); + + return (locallock && locallock->nLocks > 0); +} + +#ifdef USE_ASSERT_CHECKING +/* + * GetLockMethodLocalHash -- return the hash of local locks, for modules that + * evaluate assertions based on all locks held. + */ +HTAB * +GetLockMethodLocalHash(void) +{ + return LockMethodLocalHash; +} +#endif + +/* + * LockHasWaiters -- look up 'locktag' and check if releasing this + * lock would wake up other processes waiting for it. + */ +bool +LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool hasWaiters = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockHasWaiters: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* + * Find the LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + &localtag, + HASH_FIND, NULL); + + /* + * let the caller print its own error message, too. Do not ereport(ERROR). + */ + if (!locallock || locallock->nLocks <= 0) + { + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + + /* + * Check the shared lock table. + */ + partitionLock = LockHashPartitionLock(locallock->hashcode); + + LWLockAcquire(partitionLock, LW_SHARED); + + /* + * We don't need to re-find the lock or proclock, since we kept their + * addresses in the locallock table, and they couldn't have been removed + * while we were holding a lock on them. + */ + lock = locallock->lock; + LOCK_PRINT("LockHasWaiters: found", lock, lockmode); + proclock = locallock->proclock; + PROCLOCK_PRINT("LockHasWaiters: found", proclock); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + RemoveLocalLock(locallock); + return false; + } + + /* + * Do the checking. 
+ */ + if ((lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0) + hasWaiters = true; + + LWLockRelease(partitionLock); + + return hasWaiters; +} + +/* + * LockAcquire -- Check for lock conflicts, sleep if conflict found, + * set lock if/when no conflicts. + * + * Inputs: + * locktag: unique identifier for the lockable object + * lockmode: lock mode to acquire + * sessionLock: if true, acquire lock for session not current transaction + * dontWait: if true, don't wait to acquire lock + * + * Returns one of: + * LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true + * LOCKACQUIRE_OK lock successfully acquired + * LOCKACQUIRE_ALREADY_HELD incremented count for lock already held + * LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear + * + * In the normal case where dontWait=false and the caller doesn't need to + * distinguish a freshly acquired lock from one already taken earlier in + * this same transaction, there is no need to examine the return value. + * + * Side Effects: The lock is acquired and recorded in lock tables. + * + * NOTE: if we wait for the lock, there is no way to abort the wait + * short of aborting the transaction. + */ +LockAcquireResult +LockAcquire(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait) +{ + return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, + true, NULL); +} + +/* + * LockAcquireExtended - allows us to specify additional options + * + * reportMemoryError specifies whether a lock request that fills the lock + * table should generate an ERROR or not. Passing "false" allows the caller + * to attempt to recover from lock-table-full situations, perhaps by forcibly + * canceling other lock holders and then retrying. Note, however, that the + * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use + * in combination with dontWait = true, as the cause of failure couldn't be + * distinguished. + * + * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK + * table entry if a lock is successfully acquired, or NULL if not. 
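A common calling convention for LockAcquire, visible in the ConditionalLock* wrappers in lmgr.c, is to pass dontWait = true and treat LOCKACQUIRE_NOT_AVAIL as "someone else holds a conflicting lock". A hedged caller-side sketch follows; try_lock_object_example is hypothetical, while the LockAcquire/LockRelease interface is as documented above.

/*
 * Hypothetical sketch: take a lock without blocking, do some work, release.
 */
static bool
try_lock_object_example(const LOCKTAG *tag)
{
    if (LockAcquire(tag, ExclusiveLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
        return false;           /* a conflicting lock is held elsewhere */

    /* ... work protected by the lock goes here ... */

    LockRelease(tag, ExclusiveLock, false);
    return true;
}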
+ */ +LockAcquireResult +LockAcquireExtended(const LOCKTAG *locktag, + LOCKMODE lockmode, + bool sessionLock, + bool dontWait, + bool reportMemoryError, + LOCALLOCK **locallockp) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + bool found; + ResourceOwner owner; + uint32 hashcode; + LWLock *partitionLock; + bool found_conflict; + bool log_lock = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + + if (RecoveryInProgress() && !InRecovery && + (locktag->locktag_type == LOCKTAG_OBJECT || + locktag->locktag_type == LOCKTAG_RELATION) && + lockmode > RowExclusiveLock) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("cannot acquire lock mode %s on database objects while recovery is in progress", + lockMethodTable->lockModeNames[lockmode]), + errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery."))); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockAcquire: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* Identify owner for lock */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + /* + * Find or create a LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + &localtag, + HASH_ENTER, &found); + + /* + * if it's a new locallock object, initialize it + */ + if (!found) + { + locallock->lock = NULL; + locallock->proclock = NULL; + locallock->hashcode = LockTagHashCode(&(localtag.lock)); + locallock->nLocks = 0; + locallock->holdsStrongLockCount = false; + locallock->lockCleared = false; + locallock->numLockOwners = 0; + locallock->maxLockOwners = 8; + locallock->lockOwners = NULL; /* in case next line fails */ + locallock->lockOwners = (LOCALLOCKOWNER *) + MemoryContextAlloc(TopMemoryContext, + locallock->maxLockOwners * sizeof(LOCALLOCKOWNER)); + } + else + { + /* Make sure there will be room to remember the lock */ + if (locallock->numLockOwners >= locallock->maxLockOwners) + { + int newsize = locallock->maxLockOwners * 2; + + locallock->lockOwners = (LOCALLOCKOWNER *) + repalloc(locallock->lockOwners, + newsize * sizeof(LOCALLOCKOWNER)); + locallock->maxLockOwners = newsize; + } + } + hashcode = locallock->hashcode; + + if (locallockp) + *locallockp = locallock; + + /* + * If we already hold the lock, we can just increase the count locally. + * + * If lockCleared is already set, caller need not worry about absorbing + * sinval messages related to the lock's object. + */ + if (locallock->nLocks > 0) + { + GrantLockLocal(locallock, owner); + if (locallock->lockCleared) + return LOCKACQUIRE_ALREADY_CLEAR; + else + return LOCKACQUIRE_ALREADY_HELD; + } + + /* + * We don't acquire any other heavyweight lock while holding the relation + * extension lock. We do allow to acquire the same relation extension + * lock more than once but that case won't reach here. 
+ */ + Assert(!IsRelationExtensionLockHeld); + + /* + * Prepare to emit a WAL record if acquisition of this lock needs to be + * replayed in a standby server. + * + * Here we prepare to log; after lock is acquired we'll issue log record. + * This arrangement simplifies error recovery in case the preparation step + * fails. + * + * Only AccessExclusiveLocks can conflict with lock types that read-only + * transactions can acquire in a standby server. Make sure this definition + * matches the one in GetRunningTransactionLocks(). + */ + if (lockmode >= AccessExclusiveLock && + locktag->locktag_type == LOCKTAG_RELATION && + !RecoveryInProgress() && + XLogStandbyInfoActive()) + { + LogAccessExclusiveLockPrepare(); + log_lock = true; + } + + /* + * Attempt to take lock via fast path, if eligible. But if we remember + * having filled up the fast path array, we don't attempt to make any + * further use of it until we release some locks. It's possible that some + * other backend has transferred some of those locks to the shared hash + * table, leaving space free, but it's not worth acquiring the LWLock just + * to check. It's also possible that we're acquiring a second or third + * lock type on a relation we have already locked using the fast-path, but + * for now we don't worry about that case either. + */ + if (EligibleForRelationFastPath(locktag, lockmode) && + FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + bool acquired; + + /* + * LWLockAcquire acts as a memory sequencing point, so it's safe to + * assume that any strong locker whose increment to + * FastPathStrongRelationLocks->counts becomes visible after we test + * it has yet to begin to transfer fast-path locks. + */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + if (FastPathStrongRelationLocks->count[fasthashcode] != 0) + acquired = false; + else + acquired = FastPathGrantRelationLock(locktag->locktag_field2, + lockmode); + LWLockRelease(&MyProc->fpInfoLock); + if (acquired) + { + /* + * The locallock might contain stale pointers to some old shared + * objects; we MUST reset these to null before considering the + * lock to be acquired via fast-path. + */ + locallock->lock = NULL; + locallock->proclock = NULL; + GrantLockLocal(locallock, owner); + return LOCKACQUIRE_OK; + } + } + + /* + * If this lock could potentially have been taken via the fast-path by + * some other backend, we must (temporarily) disable further use of the + * fast-path for this lock tag, and migrate any locks already taken via + * this method to the main lock table. + */ + if (ConflictsWithRelationFastPath(locktag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + BeginStrongLockAcquire(locallock, fasthashcode); + if (!FastPathTransferRelationLocks(lockMethodTable, locktag, + hashcode)) + { + AbortStrongLockAcquire(); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + if (reportMemoryError) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + else + return LOCKACQUIRE_NOT_AVAIL; + } + } + + /* + * We didn't find the lock in our LOCALLOCK table, and we didn't manage to + * take it via the fast-path, either, so we've got to mess with the shared + * lock table. 
+ */ + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Find or create lock and proclock entries with this tag + * + * Note: if the locallock object already existed, it might have a pointer + * to the lock already ... but we should not assume that that pointer is + * valid, since a lock object with zero hold and request counts can go + * away anytime. So we have to use SetupLockInTable() to recompute the + * lock and proclock pointers, even if they're already set. + */ + proclock = SetupLockInTable(lockMethodTable, MyProc, locktag, + hashcode, lockmode); + if (!proclock) + { + AbortStrongLockAcquire(); + LWLockRelease(partitionLock); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + if (reportMemoryError) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + else + return LOCKACQUIRE_NOT_AVAIL; + } + locallock->proclock = proclock; + lock = proclock->tag.myLock; + locallock->lock = lock; + + /* + * If lock requested conflicts with locks requested by waiters, must join + * wait queue. Otherwise, check for conflict with already-held locks. + * (That's last because most complex check.) + */ + if (lockMethodTable->conflictTab[lockmode] & lock->waitMask) + found_conflict = true; + else + found_conflict = LockCheckConflicts(lockMethodTable, lockmode, + lock, proclock); + + if (!found_conflict) + { + /* No conflict with held or previously requested locks */ + GrantLock(lock, proclock, lockmode); + GrantLockLocal(locallock, owner); + } + else + { + /* + * We can't acquire the lock immediately. If caller specified no + * blocking, remove useless table entries and return + * LOCKACQUIRE_NOT_AVAIL without waiting. + */ + if (dontWait) + { + AbortStrongLockAcquire(); + if (proclock->holdMask == 0) + { + uint32 proclock_hashcode; + + proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); + dlist_delete(&proclock->lockLink); + dlist_delete(&proclock->procLink); + if (!hash_search_with_hash_value(LockMethodProcLockHash, + &(proclock->tag), + proclock_hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "proclock table corrupted"); + } + else + PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock); + lock->nRequested--; + lock->requested[lockmode]--; + LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode); + Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + LWLockRelease(partitionLock); + if (locallock->nLocks == 0) + RemoveLocalLock(locallock); + if (locallockp) + *locallockp = NULL; + return LOCKACQUIRE_NOT_AVAIL; + } + + /* + * Set bitmask of locks this process already holds on this object. + */ + MyProc->heldLocks = proclock->holdMask; + + /* + * Sleep till someone wakes me up. + */ + + TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1, + locktag->locktag_field2, + locktag->locktag_field3, + locktag->locktag_field4, + locktag->locktag_type, + lockmode); + + WaitOnLock(locallock, owner); + + TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1, + locktag->locktag_field2, + locktag->locktag_field3, + locktag->locktag_field4, + locktag->locktag_type, + lockmode); + + /* + * NOTE: do not do any material change of state between here and + * return. All required changes in locktable state must have been + * done when the lock was granted to us --- see notes in WaitOnLock. 
+ */ + + /* + * Check the proclock entry status, in case something in the ipc + * communication doesn't work correctly. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + AbortStrongLockAcquire(); + PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock); + LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode); + /* Should we retry ? */ + LWLockRelease(partitionLock); + elog(ERROR, "LockAcquire failed"); + } + PROCLOCK_PRINT("LockAcquire: granted", proclock); + LOCK_PRINT("LockAcquire: granted", lock, lockmode); + } + + /* + * Lock state is fully up-to-date now; if we error out after this, no + * special error cleanup is required. + */ + FinishStrongLockAcquire(); + + LWLockRelease(partitionLock); + + /* + * Emit a WAL record if acquisition of this lock needs to be replayed in a + * standby server. + */ + if (log_lock) + { + /* + * Decode the locktag back to the original values, to avoid sending + * lots of empty bytes with every message. See lock.h to check how a + * locktag is defined for LOCKTAG_RELATION + */ + LogAccessExclusiveLock(locktag->locktag_field1, + locktag->locktag_field2); + } + + return LOCKACQUIRE_OK; +} + +/* + * Find or create LOCK and PROCLOCK objects as needed for a new lock + * request. + * + * Returns the PROCLOCK object, or NULL if we failed to create the objects + * for lack of shared memory. + * + * The appropriate partition lock must be held at entry, and will be + * held at exit. + */ +static PROCLOCK * +SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc, + const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode) +{ + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + uint32 proclock_hashcode; + bool found; + + /* + * Find or create a lock with this tag. + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + hashcode, + HASH_ENTER_NULL, + &found); + if (!lock) + return NULL; + + /* + * if it's a new lock object, initialize it + */ + if (!found) + { + lock->grantMask = 0; + lock->waitMask = 0; + dlist_init(&lock->procLocks); + dclist_init(&lock->waitProcs); + lock->nRequested = 0; + lock->nGranted = 0; + MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES); + MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES); + LOCK_PRINT("LockAcquire: new", lock, lockmode); + } + else + { + LOCK_PRINT("LockAcquire: found", lock, lockmode); + Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0)); + Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + } + + /* + * Create the hash key for the proclock table. + */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + /* + * Find or create a proclock entry with this tag + */ + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + &proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); + if (!proclock) + { + /* Oops, not enough shmem for the proclock */ + if (lock->nRequested == 0) + { + /* + * There are no other requestors of this lock, so garbage-collect + * the lock object. We *must* do this to avoid a permanent leak + * of shared memory, because there won't be anything to cause + * anyone to release the lock object later. 
+ */ + Assert(dlist_is_empty(&(lock->procLocks))); + if (!hash_search_with_hash_value(LockMethodLockHash, + &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + return NULL; + } + + /* + * If new, initialize the new entry + */ + if (!found) + { + uint32 partition = LockHashPartition(hashcode); + + /* + * It might seem unsafe to access proclock->groupLeader without a + * lock, but it's not really. Either we are initializing a proclock + * on our own behalf, in which case our group leader isn't changing + * because the group leader for a process can only ever be changed by + * the process itself; or else we are transferring a fast-path lock to + * the main lock table, in which case that process can't change it's + * lock group leader without first releasing all of its locks (and in + * particular the one we are currently transferring). + */ + proclock->groupLeader = proc->lockGroupLeader != NULL ? + proc->lockGroupLeader : proc; + proclock->holdMask = 0; + proclock->releaseMask = 0; + /* Add proclock to appropriate lists */ + dlist_push_tail(&lock->procLocks, &proclock->lockLink); + dlist_push_tail(&proc->myProcLocks[partition], &proclock->procLink); + PROCLOCK_PRINT("LockAcquire: new", proclock); + } + else + { + PROCLOCK_PRINT("LockAcquire: found", proclock); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + +#ifdef CHECK_DEADLOCK_RISK + + /* + * Issue warning if we already hold a lower-level lock on this object + * and do not hold a lock of the requested level or higher. This + * indicates a deadlock-prone coding practice (eg, we'd have a + * deadlock if another backend were following the same code path at + * about the same time). + * + * This is not enabled by default, because it may generate log entries + * about user-level coding practices that are in fact safe in context. + * It can be enabled to help find system-level problems. + * + * XXX Doing numeric comparison on the lockmodes is a hack; it'd be + * better to use a table. For now, though, this works. + */ + { + int i; + + for (i = lockMethodTable->numLockModes; i > 0; i--) + { + if (proclock->holdMask & LOCKBIT_ON(i)) + { + if (i >= (int) lockmode) + break; /* safe: we have a lock >= req level */ + elog(LOG, "deadlock risk: raising lock level" + " from %s to %s on object %u/%u/%u", + lockMethodTable->lockModeNames[i], + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + break; + } + } + } +#endif /* CHECK_DEADLOCK_RISK */ + } + + /* + * lock->nRequested and lock->requested[] count the total number of + * requests, whether granted or waiting, so increment those immediately. + * The other counts don't increment till we get the lock. + */ + lock->nRequested++; + lock->requested[lockmode]++; + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + + /* + * We shouldn't already hold the desired lock; else locallock table is + * broken. + */ + if (proclock->holdMask & LOCKBIT_ON(lockmode)) + elog(ERROR, "lock %s on object %u/%u/%u is already held", + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + + return proclock; +} + +/* + * Check and set/reset the flag that we hold the relation extension lock. + * + * It is callers responsibility that this function is called after + * acquiring/releasing the relation extension lock. + * + * Pass acquired as true if lock is acquired, false otherwise. 
+ */ +static inline void +CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired) +{ +#ifdef USE_ASSERT_CHECKING + if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_RELATION_EXTEND) + IsRelationExtensionLockHeld = acquired; +#endif +} + +/* + * Subroutine to free a locallock entry + */ +static void +RemoveLocalLock(LOCALLOCK *locallock) +{ + int i; + + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (locallock->lockOwners[i].owner != NULL) + ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock); + } + locallock->numLockOwners = 0; + if (locallock->lockOwners != NULL) + pfree(locallock->lockOwners); + locallock->lockOwners = NULL; + + if (locallock->holdsStrongLockCount) + { + uint32 fasthashcode; + + fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + locallock->holdsStrongLockCount = false; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } + + if (!hash_search(LockMethodLocalHash, + &(locallock->tag), + HASH_REMOVE, NULL)) + elog(WARNING, "locallock table corrupted"); + + /* + * Indicate that the lock is released for certain types of locks + */ + CheckAndSetLockHeld(locallock, false); +} + +/* + * LockCheckConflicts -- test whether requested lock conflicts + * with those already granted + * + * Returns true if conflict, false if no conflict. + * + * NOTES: + * Here's what makes this complicated: one process's locks don't + * conflict with one another, no matter what purpose they are held for + * (eg, session and transaction locks do not conflict). Nor do the locks + * of one process in a lock group conflict with those of another process in + * the same group. So, we must subtract off these locks when determining + * whether the requested new lock conflicts with those already held. + */ +bool +LockCheckConflicts(LockMethod lockMethodTable, + LOCKMODE lockmode, + LOCK *lock, + PROCLOCK *proclock) +{ + int numLockModes = lockMethodTable->numLockModes; + LOCKMASK myLocks; + int conflictMask = lockMethodTable->conflictTab[lockmode]; + int conflictsRemaining[MAX_LOCKMODES]; + int totalConflictsRemaining = 0; + dlist_iter proclock_iter; + int i; + + /* + * first check for global conflicts: If no locks conflict with my request, + * then I get the lock. + * + * Checking for conflict: lock->grantMask represents the types of + * currently held locks. conflictTable[lockmode] has a bit set for each + * type of lock that conflicts with request. Bitwise compare tells if + * there is a conflict. + */ + if (!(conflictMask & lock->grantMask)) + { + PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock); + return false; + } + + /* + * Rats. Something conflicts. But it could still be my own lock, or a + * lock held by another member of my locking group. First, figure out how + * many conflicts remain after subtracting out any locks I hold myself. + */ + myLocks = proclock->holdMask; + for (i = 1; i <= numLockModes; i++) + { + if ((conflictMask & LOCKBIT_ON(i)) == 0) + { + conflictsRemaining[i] = 0; + continue; + } + conflictsRemaining[i] = lock->granted[i]; + if (myLocks & LOCKBIT_ON(i)) + --conflictsRemaining[i]; + totalConflictsRemaining += conflictsRemaining[i]; + } + + /* If no conflicts remain, we get the lock. 
*/ + if (totalConflictsRemaining == 0) + { + PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock); + return false; + } + + /* If no group locking, it's definitely a conflict. */ + if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL) + { + Assert(proclock->tag.myProc == MyProc); + PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)", + proclock); + return true; + } + + /* + * The relation extension lock conflict even between the group members. + */ + if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND) + { + PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", + proclock); + return true; + } + + /* + * Locks held in conflicting modes by members of our own lock group are + * not real conflicts; we can subtract those out and see if we still have + * a conflict. This is O(N) in the number of processes holding or + * awaiting locks on this object. We could improve that by making the + * shared memory state more complex (and larger) but it doesn't seem worth + * it. + */ + dlist_foreach(proclock_iter, &lock->procLocks) + { + PROCLOCK *otherproclock = + dlist_container(PROCLOCK, lockLink, proclock_iter.cur); + + if (proclock != otherproclock && + proclock->groupLeader == otherproclock->groupLeader && + (otherproclock->holdMask & conflictMask) != 0) + { + int intersectMask = otherproclock->holdMask & conflictMask; + + for (i = 1; i <= numLockModes; i++) + { + if ((intersectMask & LOCKBIT_ON(i)) != 0) + { + if (conflictsRemaining[i] <= 0) + elog(PANIC, "proclocks held do not match lock"); + conflictsRemaining[i]--; + totalConflictsRemaining--; + } + } + + if (totalConflictsRemaining == 0) + { + PROCLOCK_PRINT("LockCheckConflicts: resolved (group)", + proclock); + return false; + } + } + } + + /* Nope, it's a real conflict. */ + PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock); + return true; +} + +/* + * GrantLock -- update the lock and proclock data structures to show + * the lock request has been granted. + * + * NOTE: if proc was blocked, it also needs to be removed from the wait list + * and have its waitLock/waitProcLock fields cleared. That's not done here. + * + * NOTE: the lock grant also has to be recorded in the associated LOCALLOCK + * table entry; but since we may be awaking some other process, we can't do + * that here; it's done by GrantLockLocal, instead. + */ +void +GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode) +{ + lock->nGranted++; + lock->granted[lockmode]++; + lock->grantMask |= LOCKBIT_ON(lockmode); + if (lock->granted[lockmode] == lock->requested[lockmode]) + lock->waitMask &= LOCKBIT_OFF(lockmode); + proclock->holdMask |= LOCKBIT_ON(lockmode); + LOCK_PRINT("GrantLock", lock, lockmode); + Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0)); + Assert(lock->nGranted <= lock->nRequested); +} + +/* + * UnGrantLock -- opposite of GrantLock. + * + * Updates the lock and proclock data structures to show that the lock + * is no longer held nor requested by the current holder. + * + * Returns true if there were any waiters waiting on the lock that + * should now be woken up with ProcLockWakeup. 
+ */ +static bool +UnGrantLock(LOCK *lock, LOCKMODE lockmode, + PROCLOCK *proclock, LockMethod lockMethodTable) +{ + bool wakeupNeeded = false; + + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0)); + Assert(lock->nGranted <= lock->nRequested); + + /* + * fix the general lock stats + */ + lock->nRequested--; + lock->requested[lockmode]--; + lock->nGranted--; + lock->granted[lockmode]--; + + if (lock->granted[lockmode] == 0) + { + /* change the conflict mask. No more of this lock type. */ + lock->grantMask &= LOCKBIT_OFF(lockmode); + } + + LOCK_PRINT("UnGrantLock: updated", lock, lockmode); + + /* + * We need only run ProcLockWakeup if the released lock conflicts with at + * least one of the lock types requested by waiter(s). Otherwise whatever + * conflict made them wait must still exist. NOTE: before MVCC, we could + * skip wakeup if lock->granted[lockmode] was still positive. But that's + * not true anymore, because the remaining granted locks might belong to + * some waiter, who could now be awakened because he doesn't conflict with + * his own locks. + */ + if (lockMethodTable->conflictTab[lockmode] & lock->waitMask) + wakeupNeeded = true; + + /* + * Now fix the per-proclock state. + */ + proclock->holdMask &= LOCKBIT_OFF(lockmode); + PROCLOCK_PRINT("UnGrantLock: updated", proclock); + + return wakeupNeeded; +} + +/* + * CleanUpLock -- clean up after releasing a lock. We garbage-collect the + * proclock and lock objects if possible, and call ProcLockWakeup if there + * are remaining requests and the caller says it's OK. (Normally, this + * should be called after UnGrantLock, and wakeupNeeded is the result from + * UnGrantLock.) + * + * The appropriate partition lock must be held at entry, and will be + * held at exit. + */ +static void +CleanUpLock(LOCK *lock, PROCLOCK *proclock, + LockMethod lockMethodTable, uint32 hashcode, + bool wakeupNeeded) +{ + /* + * If this was my last hold on this lock, delete my entry in the proclock + * table. + */ + if (proclock->holdMask == 0) + { + uint32 proclock_hashcode; + + PROCLOCK_PRINT("CleanUpLock: deleting", proclock); + dlist_delete(&proclock->lockLink); + dlist_delete(&proclock->procLink); + proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode); + if (!hash_search_with_hash_value(LockMethodProcLockHash, + &(proclock->tag), + proclock_hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "proclock table corrupted"); + } + + if (lock->nRequested == 0) + { + /* + * The caller just released the last lock, so garbage-collect the lock + * object. + */ + LOCK_PRINT("CleanUpLock: deleting", lock, 0); + Assert(dlist_is_empty(&lock->procLocks)); + if (!hash_search_with_hash_value(LockMethodLockHash, + &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + else if (wakeupNeeded) + { + /* There are waiters on this lock, so wake them up. */ + ProcLockWakeup(lockMethodTable, lock); + } +} + +/* + * GrantLockLocal -- update the locallock data structures to show + * the lock request has been granted. + * + * We expect that LockAcquire made sure there is room to add a new + * ResourceOwner entry. 
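A toy model of the grant/ungrant bookkeeping above. The field and function names only echo the real LOCK structure; the point is the counter arithmetic and the wakeup test against waitMask, which mirrors the value UnGrantLock returns.

#include <stdbool.h>
#include <stdint.h>

#define TOY_MAX_MODES   10
#define LOCKBIT_ON(m)   (1u << (m))
#define LOCKBIT_OFF(m)  (~(1u << (m)))

typedef struct ToyLock
{
    int      nRequested;               /* total requests, granted or waiting */
    int      nGranted;                 /* granted requests only */
    int      requested[TOY_MAX_MODES];
    int      granted[TOY_MAX_MODES];
    uint32_t grantMask;                /* one bit per mode with granted > 0 */
    uint32_t waitMask;                 /* one bit per mode someone waits for */
} ToyLock;

/* Mirror of GrantLock: the request-side counters were already bumped when the
 * request was made, so only the grant-side counters move here. */
void
toy_grant(ToyLock *lock, int mode)
{
    lock->nGranted++;
    lock->granted[mode]++;
    lock->grantMask |= LOCKBIT_ON(mode);
    if (lock->granted[mode] == lock->requested[mode])
        lock->waitMask &= LOCKBIT_OFF(mode);   /* nobody left waiting for it */
}

/* Mirror of UnGrantLock: returns true when waiters might now be wakable,
 * i.e. when the released mode conflicts with some mode being waited for. */
bool
toy_ungrant(ToyLock *lock, int mode, const uint32_t *conflictTab)
{
    lock->nRequested--;
    lock->requested[mode]--;
    lock->nGranted--;
    lock->granted[mode]--;
    if (lock->granted[mode] == 0)
        lock->grantMask &= LOCKBIT_OFF(mode);
    return (conflictTab[mode] & lock->waitMask) != 0;
}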
+ */ +static void +GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner) +{ + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + int i; + + Assert(locallock->numLockOwners < locallock->maxLockOwners); + /* Count the total */ + locallock->nLocks++; + /* Count the per-owner lock */ + for (i = 0; i < locallock->numLockOwners; i++) + { + if (lockOwners[i].owner == owner) + { + lockOwners[i].nLocks++; + return; + } + } + lockOwners[i].owner = owner; + lockOwners[i].nLocks = 1; + locallock->numLockOwners++; + if (owner != NULL) + ResourceOwnerRememberLock(owner, locallock); + + /* Indicate that the lock is acquired for certain types of locks. */ + CheckAndSetLockHeld(locallock, true); +} + +/* + * BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK, + * and arrange for error cleanup if it fails + */ +static void +BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode) +{ + Assert(StrongLockInProgress == NULL); + Assert(locallock->holdsStrongLockCount == false); + + /* + * Adding to a memory location is not atomic, so we take a spinlock to + * ensure we don't collide with someone else trying to bump the count at + * the same time. + * + * XXX: It might be worth considering using an atomic fetch-and-add + * instruction here, on architectures where that is supported. + */ + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + FastPathStrongRelationLocks->count[fasthashcode]++; + locallock->holdsStrongLockCount = true; + StrongLockInProgress = locallock; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); +} + +/* + * FinishStrongLockAcquire - cancel pending cleanup for a strong lock + * acquisition once it's no longer needed + */ +static void +FinishStrongLockAcquire(void) +{ + StrongLockInProgress = NULL; +} + +/* + * AbortStrongLockAcquire - undo strong lock state changes performed by + * BeginStrongLockAcquire. + */ +void +AbortStrongLockAcquire(void) +{ + uint32 fasthashcode; + LOCALLOCK *locallock = StrongLockInProgress; + + if (locallock == NULL) + return; + + fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode); + Assert(locallock->holdsStrongLockCount == true); + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + locallock->holdsStrongLockCount = false; + StrongLockInProgress = NULL; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); +} + +/* + * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing + * WaitOnLock on. + * + * proc.c needs this for the case where we are booted off the lock by + * timeout, but discover that someone granted us the lock anyway. + * + * We could just export GrantLockLocal, but that would require including + * resowner.h in lock.h, which creates circularity. + */ +void +GrantAwaitedLock(void) +{ + GrantLockLocal(awaitedLock, awaitedOwner); +} + +/* + * MarkLockClear -- mark an acquired lock as "clear" + * + * This means that we know we have absorbed all sinval messages that other + * sessions generated before we acquired this lock, and so we can confidently + * assume we know about any catalog changes protected by this lock. + */ +void +MarkLockClear(LOCALLOCK *locallock) +{ + Assert(locallock->nLocks > 0); + locallock->lockCleared = true; +} + +/* + * WaitOnLock -- wait to acquire a lock + * + * Caller must have set MyProc->heldLocks to reflect locks already held + * on the lockable object by this process. 
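A sketch of the partitioned strong-lock counter idea, written with C11 atomic fetch-and-add as the XXX comment above contemplates; the backend itself protects the counters with a spinlock, and the partition count used here is an assumption chosen only for illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define STRONG_LOCK_PARTITIONS 1024          /* assumed power of two */

static atomic_uint strong_lock_count[STRONG_LOCK_PARTITIONS];

static inline unsigned
strong_lock_partition(uint32_t hashcode)
{
    return hashcode & (STRONG_LOCK_PARTITIONS - 1);
}

/* Counterpart of BeginStrongLockAcquire: advertise a strong lock so other
 * backends stop taking the fast path for locks in the same partition. */
static inline void
begin_strong_lock(uint32_t hashcode)
{
    atomic_fetch_add(&strong_lock_count[strong_lock_partition(hashcode)], 1);
}

/* Counterpart of AbortStrongLockAcquire / the decrement in RemoveLocalLock. */
static inline void
end_strong_lock(uint32_t hashcode)
{
    atomic_fetch_sub(&strong_lock_count[strong_lock_partition(hashcode)], 1);
}

/* A would-be fast-path acquirer only needs "is the count nonzero?". */
static inline bool
strong_lock_present(uint32_t hashcode)
{
    return atomic_load(&strong_lock_count[strong_lock_partition(hashcode)]) != 0;
}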
+ * + * The appropriate partition lock must be held at entry. + */ +static void +WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner) +{ + LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock); + LockMethod lockMethodTable = LockMethods[lockmethodid]; + + LOCK_PRINT("WaitOnLock: sleeping on lock", + locallock->lock, locallock->tag.mode); + + /* adjust the process title to indicate that it's waiting */ + set_ps_display_suffix("waiting"); + + awaitedLock = locallock; + awaitedOwner = owner; + + /* + * NOTE: Think not to put any shared-state cleanup after the call to + * ProcSleep, in either the normal or failure path. The lock state must + * be fully set by the lock grantor, or by CheckDeadLock if we give up + * waiting for the lock. This is necessary because of the possibility + * that a cancel/die interrupt will interrupt ProcSleep after someone else + * grants us the lock, but before we've noticed it. Hence, after granting, + * the locktable state must fully reflect the fact that we own the lock; + * we can't do additional work on return. + * + * We can and do use a PG_TRY block to try to clean up after failure, but + * this still has a major limitation: elog(FATAL) can occur while waiting + * (eg, a "die" interrupt), and then control won't come back here. So all + * cleanup of essential state should happen in LockErrorCleanup, not here. + * We can use PG_TRY to clear the "waiting" status flags, since doing that + * is unimportant if the process exits. + */ + PG_TRY(); + { + if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK) + { + /* + * We failed as a result of a deadlock, see CheckDeadLock(). Quit + * now. + */ + awaitedLock = NULL; + LOCK_PRINT("WaitOnLock: aborting on lock", + locallock->lock, locallock->tag.mode); + LWLockRelease(LockHashPartitionLock(locallock->hashcode)); + + /* + * Now that we aren't holding the partition lock, we can give an + * error report including details about the detected deadlock. + */ + DeadLockReport(); + /* not reached */ + } + } + PG_CATCH(); + { + /* In this path, awaitedLock remains set until LockErrorCleanup */ + + /* reset ps display to remove the suffix */ + set_ps_display_remove_suffix(); + + /* and propagate the error */ + PG_RE_THROW(); + } + PG_END_TRY(); + + awaitedLock = NULL; + + /* reset ps display to remove the suffix */ + set_ps_display_remove_suffix(); + + LOCK_PRINT("WaitOnLock: wakeup on lock", + locallock->lock, locallock->tag.mode); +} + +/* + * Remove a proc from the wait-queue it is on (caller must know it is on one). + * This is only used when the proc has failed to get the lock, so we set its + * waitStatus to PROC_WAIT_STATUS_ERROR. + * + * Appropriate partition lock must be held by caller. Also, caller is + * responsible for signaling the proc if needed. + * + * NB: this does not clean up any locallock object that may exist for the lock. 
+ */ +void +RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode) +{ + LOCK *waitLock = proc->waitLock; + PROCLOCK *proclock = proc->waitProcLock; + LOCKMODE lockmode = proc->waitLockMode; + LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock); + + /* Make sure proc is waiting */ + Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING); + Assert(proc->links.next != NULL); + Assert(waitLock); + Assert(!dclist_is_empty(&waitLock->waitProcs)); + Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods)); + + /* Remove proc from lock's wait queue */ + dclist_delete_from_thoroughly(&waitLock->waitProcs, &proc->links); + + /* Undo increments of request counts by waiting process */ + Assert(waitLock->nRequested > 0); + Assert(waitLock->nRequested > proc->waitLock->nGranted); + waitLock->nRequested--; + Assert(waitLock->requested[lockmode] > 0); + waitLock->requested[lockmode]--; + /* don't forget to clear waitMask bit if appropriate */ + if (waitLock->granted[lockmode] == waitLock->requested[lockmode]) + waitLock->waitMask &= LOCKBIT_OFF(lockmode); + + /* Clean up the proc's own state, and pass it the ok/fail signal */ + proc->waitLock = NULL; + proc->waitProcLock = NULL; + proc->waitStatus = PROC_WAIT_STATUS_ERROR; + + /* + * Delete the proclock immediately if it represents no already-held locks. + * (This must happen now because if the owner of the lock decides to + * release it, and the requested/granted counts then go to zero, + * LockRelease expects there to be no remaining proclocks.) Then see if + * any other waiters for the lock can be woken up now. + */ + CleanUpLock(waitLock, proclock, + LockMethods[lockmethodid], hashcode, + true); +} + +/* + * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it. + * Release a session lock if 'sessionLock' is true, else release a + * regular transaction lock. + * + * Side Effects: find any waiting processes that are now wakable, + * grant them their requested locks and awaken them. + * (We have to grant the lock here to avoid a race between + * the waking process and any new process to + * come along and request the lock.) + */ +bool +LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCALLOCKTAG localtag; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + LWLock *partitionLock; + bool wakeupNeeded; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + +#ifdef LOCK_DEBUG + if (LOCK_DEBUG_ENABLED(locktag)) + elog(LOG, "LockRelease: lock [%u,%u] %s", + locktag->locktag_field1, locktag->locktag_field2, + lockMethodTable->lockModeNames[lockmode]); +#endif + + /* + * Find the LOCALLOCK entry for this lock and lockmode + */ + MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */ + localtag.lock = *locktag; + localtag.mode = lockmode; + + locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash, + &localtag, + HASH_FIND, NULL); + + /* + * let the caller print its own error message, too. Do not ereport(ERROR). + */ + if (!locallock || locallock->nLocks <= 0) + { + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + + /* + * Decrease the count for the resource owner. 
+ */ + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + ResourceOwner owner; + int i; + + /* Identify owner for lock */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == owner) + { + Assert(lockOwners[i].nLocks > 0); + if (--lockOwners[i].nLocks == 0) + { + if (owner != NULL) + ResourceOwnerForgetLock(owner, locallock); + /* compact out unused slot */ + locallock->numLockOwners--; + if (i < locallock->numLockOwners) + lockOwners[i] = lockOwners[locallock->numLockOwners]; + } + break; + } + } + if (i < 0) + { + /* don't release a lock belonging to another owner */ + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return false; + } + } + + /* + * Decrease the total local count. If we're still holding the lock, we're + * done. + */ + locallock->nLocks--; + + if (locallock->nLocks > 0) + return true; + + /* + * At this point we can no longer suppose we are clear of invalidation + * messages related to this lock. Although we'll delete the LOCALLOCK + * object before any intentional return from this routine, it seems worth + * the trouble to explicitly reset lockCleared right now, just in case + * some error prevents us from deleting the LOCALLOCK. + */ + locallock->lockCleared = false; + + /* Attempt fast release of any lock eligible for the fast path. */ + if (EligibleForRelationFastPath(locktag, lockmode) && + FastPathLocalUseCount > 0) + { + bool released; + + /* + * We might not find the lock here, even if we originally entered it + * here. Another backend may have moved it to the main table. + */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + released = FastPathUnGrantRelationLock(locktag->locktag_field2, + lockmode); + LWLockRelease(&MyProc->fpInfoLock); + if (released) + { + RemoveLocalLock(locallock); + return true; + } + } + + /* + * Otherwise we've got to mess with the shared lock table. + */ + partitionLock = LockHashPartitionLock(locallock->hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Normally, we don't need to re-find the lock or proclock, since we kept + * their addresses in the locallock table, and they couldn't have been + * removed while we were holding a lock on them. But it's possible that + * the lock was taken fast-path and has since been moved to the main hash + * table by another backend, in which case we will need to look up the + * objects here. We assume the lock field is NULL if so. + */ + lock = locallock->lock; + if (!lock) + { + PROCLOCKTAG proclocktag; + + Assert(EligibleForRelationFastPath(locktag, lockmode)); + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + locallock->hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(ERROR, "failed to re-find shared lock object"); + locallock->lock = lock; + + proclocktag.myLock = lock; + proclocktag.myProc = MyProc; + locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash, + &proclocktag, + HASH_FIND, + NULL); + if (!locallock->proclock) + elog(ERROR, "failed to re-find shared proclock object"); + } + LOCK_PRINT("LockRelease: found", lock, lockmode); + proclock = locallock->proclock; + PROCLOCK_PRINT("LockRelease: found", proclock); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. 
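The "compact out unused slot" step above is the usual unordered-array deletion idiom, also used in ReleaseLockIfHeld and LockReassignOwner. A generic sketch, with a stand-in element type:

#include <stddef.h>

typedef struct OwnerCount
{
    void *owner;    /* stand-in for ResourceOwner */
    int   nLocks;
} OwnerCount;

/* Delete entry i from an unordered array of *count entries: move the former
 * last entry into the hole (unless i was the last one) and shrink the count. */
static void
compact_out(OwnerCount *arr, int *count, int i)
{
    (*count)--;
    if (i < *count)
        arr[i] = arr[*count];
}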
+ */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + RemoveLocalLock(locallock); + return false; + } + + /* + * Do the releasing. CleanUpLock will waken any now-wakable waiters. + */ + wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable); + + CleanUpLock(lock, proclock, + lockMethodTable, locallock->hashcode, + wakeupNeeded); + + LWLockRelease(partitionLock); + + RemoveLocalLock(locallock); + return true; +} + +/* + * LockReleaseAll -- Release all locks of the specified lock method that + * are held by the current process. + * + * Well, not necessarily *all* locks. The available behaviors are: + * allLocks == true: release all locks including session locks. + * allLocks == false: release all non-session locks. + */ +void +LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks) +{ + HASH_SEQ_STATUS status; + LockMethod lockMethodTable; + int i, + numLockModes; + LOCALLOCK *locallock; + LOCK *lock; + int partition; + bool have_fast_path_lwlock = false; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + +#ifdef LOCK_DEBUG + if (*(lockMethodTable->trace_flag)) + elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid); +#endif + + /* + * Get rid of our fast-path VXID lock, if appropriate. Note that this is + * the only way that the lock we hold on our own VXID can ever get + * released: it is always and only released when a toplevel transaction + * ends. + */ + if (lockmethodid == DEFAULT_LOCKMETHOD) + VirtualXactLockTableCleanup(); + + numLockModes = lockMethodTable->numLockModes; + + /* + * First we run through the locallock table and get rid of unwanted + * entries, then we scan the process's proclocks and get rid of those. We + * do this separately because we may have multiple locallock entries + * pointing to the same proclock, and we daren't end up with any dangling + * pointers. Fast-path locks are cleaned up during the locallock table + * scan, though. + */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + /* + * If the LOCALLOCK entry is unused, we must've run out of shared + * memory while trying to set up this lock. Just forget the local + * entry. + */ + if (locallock->nLocks == 0) + { + RemoveLocalLock(locallock); + continue; + } + + /* Ignore items that are not of the lockmethod to be removed */ + if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid) + continue; + + /* + * If we are asked to release all locks, we can just zap the entry. + * Otherwise, must scan to see if there are session locks. We assume + * there is at most one lockOwners entry for session locks. 
+ */ + if (!allLocks) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + + /* If session lock is above array position 0, move it down to 0 */ + for (i = 0; i < locallock->numLockOwners; i++) + { + if (lockOwners[i].owner == NULL) + lockOwners[0] = lockOwners[i]; + else + ResourceOwnerForgetLock(lockOwners[i].owner, locallock); + } + + if (locallock->numLockOwners > 0 && + lockOwners[0].owner == NULL && + lockOwners[0].nLocks > 0) + { + /* Fix the locallock to show just the session locks */ + locallock->nLocks = lockOwners[0].nLocks; + locallock->numLockOwners = 1; + /* We aren't deleting this locallock, so done */ + continue; + } + else + locallock->numLockOwners = 0; + } + + /* + * If the lock or proclock pointers are NULL, this lock was taken via + * the relation fast-path (and is not known to have been transferred). + */ + if (locallock->proclock == NULL || locallock->lock == NULL) + { + LOCKMODE lockmode = locallock->tag.mode; + Oid relid; + + /* Verify that a fast-path lock is what we've got. */ + if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode)) + elog(PANIC, "locallock table corrupted"); + + /* + * If we don't currently hold the LWLock that protects our + * fast-path data structures, we must acquire it before attempting + * to release the lock via the fast-path. We will continue to + * hold the LWLock until we're done scanning the locallock table, + * unless we hit a transferred fast-path lock. (XXX is this + * really such a good idea? There could be a lot of entries ...) + */ + if (!have_fast_path_lwlock) + { + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + have_fast_path_lwlock = true; + } + + /* Attempt fast-path release. */ + relid = locallock->tag.lock.locktag_field2; + if (FastPathUnGrantRelationLock(relid, lockmode)) + { + RemoveLocalLock(locallock); + continue; + } + + /* + * Our lock, originally taken via the fast path, has been + * transferred to the main lock table. That's going to require + * some extra work, so release our fast-path lock before starting. + */ + LWLockRelease(&MyProc->fpInfoLock); + have_fast_path_lwlock = false; + + /* + * Now dump the lock. We haven't got a pointer to the LOCK or + * PROCLOCK in this case, so we have to handle this a bit + * differently than a normal lock release. Unfortunately, this + * requires an extra LWLock acquire-and-release cycle on the + * partitionLock, but hopefully it shouldn't happen often. + */ + LockRefindAndRelease(lockMethodTable, MyProc, + &locallock->tag.lock, lockmode, false); + RemoveLocalLock(locallock); + continue; + } + + /* Mark the proclock to show we need to release this lockmode */ + if (locallock->nLocks > 0) + locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode); + + /* And remove the locallock hashtable entry */ + RemoveLocalLock(locallock); + } + + /* Done with the fast-path data structures */ + if (have_fast_path_lwlock) + LWLockRelease(&MyProc->fpInfoLock); + + /* + * Now, scan each lock partition separately. + */ + for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) + { + LWLock *partitionLock; + dlist_head *procLocks = &MyProc->myProcLocks[partition]; + dlist_mutable_iter proclock_iter; + + partitionLock = LockHashPartitionLockByIndex(partition); + + /* + * If the proclock list for this partition is empty, we can skip + * acquiring the partition lock. This optimization is trickier than + * it looks, because another backend could be in process of adding + * something to our proclock list due to promoting one of our + * fast-path locks. 
However, any such lock must be one that we + * decided not to delete above, so it's okay to skip it again now; + * we'd just decide not to delete it again. We must, however, be + * careful to re-fetch the list header once we've acquired the + * partition lock, to be sure we have a valid, up-to-date pointer. + * (There is probably no significant risk if pointer fetch/store is + * atomic, but we don't wish to assume that.) + * + * XXX This argument assumes that the locallock table correctly + * represents all of our fast-path locks. While allLocks mode + * guarantees to clean up all of our normal locks regardless of the + * locallock situation, we lose that guarantee for fast-path locks. + * This is not ideal. + */ + if (dlist_is_empty(procLocks)) + continue; /* needn't examine this partition */ + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + dlist_foreach_modify(proclock_iter, procLocks) + { + PROCLOCK *proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur); + bool wakeupNeeded = false; + + Assert(proclock->tag.myProc == MyProc); + + lock = proclock->tag.myLock; + + /* Ignore items that are not of the lockmethod to be removed */ + if (LOCK_LOCKMETHOD(*lock) != lockmethodid) + continue; + + /* + * In allLocks mode, force release of all locks even if locallock + * table had problems + */ + if (allLocks) + proclock->releaseMask = proclock->holdMask; + else + Assert((proclock->releaseMask & ~proclock->holdMask) == 0); + + /* + * Ignore items that have nothing to be released, unless they have + * holdMask == 0 and are therefore recyclable + */ + if (proclock->releaseMask == 0 && proclock->holdMask != 0) + continue; + + PROCLOCK_PRINT("LockReleaseAll", proclock); + LOCK_PRINT("LockReleaseAll", lock, 0); + Assert(lock->nRequested >= 0); + Assert(lock->nGranted >= 0); + Assert(lock->nGranted <= lock->nRequested); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + + /* + * Release the previously-marked lock modes + */ + for (i = 1; i <= numLockModes; i++) + { + if (proclock->releaseMask & LOCKBIT_ON(i)) + wakeupNeeded |= UnGrantLock(lock, i, proclock, + lockMethodTable); + } + Assert((lock->nRequested >= 0) && (lock->nGranted >= 0)); + Assert(lock->nGranted <= lock->nRequested); + LOCK_PRINT("LockReleaseAll: updated", lock, 0); + + proclock->releaseMask = 0; + + /* CleanUpLock will wake up waiters if needed. */ + CleanUpLock(lock, proclock, + lockMethodTable, + LockTagHashCode(&lock->tag), + wakeupNeeded); + } /* loop over PROCLOCKs within this partition */ + + LWLockRelease(partitionLock); + } /* loop over partitions */ + +#ifdef LOCK_DEBUG + if (*(lockMethodTable->trace_flag)) + elog(LOG, "LockReleaseAll done"); +#endif +} + +/* + * LockReleaseSession -- Release all session locks of the specified lock method + * that are held by the current process. + */ +void +LockReleaseSession(LOCKMETHODID lockmethodid) +{ + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + /* Ignore items that are not of the specified lock method */ + if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid) + continue; + + ReleaseLockIfHeld(locallock, true); + } +} + +/* + * LockReleaseCurrentOwner + * Release all locks belonging to CurrentResourceOwner + * + * If the caller knows what those locks are, it can pass them as an array. 
+ * That speeds up the call significantly, when a lot of locks are held. + * Otherwise, pass NULL for locallocks, and we'll traverse through our hash + * table to find them. + */ +void +LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks) +{ + if (locallocks == NULL) + { + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + ReleaseLockIfHeld(locallock, false); + } + else + { + int i; + + for (i = nlocks - 1; i >= 0; i--) + ReleaseLockIfHeld(locallocks[i], false); + } +} + +/* + * ReleaseLockIfHeld + * Release any session-level locks on this lockable object if sessionLock + * is true; else, release any locks held by CurrentResourceOwner. + * + * It is tempting to pass this a ResourceOwner pointer (or NULL for session + * locks), but without refactoring LockRelease() we cannot support releasing + * locks belonging to resource owners other than CurrentResourceOwner. + * If we were to refactor, it'd be a good idea to fix it so we don't have to + * do a hashtable lookup of the locallock, too. However, currently this + * function isn't used heavily enough to justify refactoring for its + * convenience. + */ +static void +ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock) +{ + ResourceOwner owner; + LOCALLOCKOWNER *lockOwners; + int i; + + /* Identify owner for lock (must match LockRelease!) */ + if (sessionLock) + owner = NULL; + else + owner = CurrentResourceOwner; + + /* Scan to see if there are any locks belonging to the target owner */ + lockOwners = locallock->lockOwners; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == owner) + { + Assert(lockOwners[i].nLocks > 0); + if (lockOwners[i].nLocks < locallock->nLocks) + { + /* + * We will still hold this lock after forgetting this + * ResourceOwner. + */ + locallock->nLocks -= lockOwners[i].nLocks; + /* compact out unused slot */ + locallock->numLockOwners--; + if (owner != NULL) + ResourceOwnerForgetLock(owner, locallock); + if (i < locallock->numLockOwners) + lockOwners[i] = lockOwners[locallock->numLockOwners]; + } + else + { + Assert(lockOwners[i].nLocks == locallock->nLocks); + /* We want to call LockRelease just once */ + lockOwners[i].nLocks = 1; + locallock->nLocks = 1; + if (!LockRelease(&locallock->tag.lock, + locallock->tag.mode, + sessionLock)) + elog(WARNING, "ReleaseLockIfHeld: failed??"); + } + break; + } + } +} + +/* + * LockReassignCurrentOwner + * Reassign all locks belonging to CurrentResourceOwner to belong + * to its parent resource owner. + * + * If the caller knows what those locks are, it can pass them as an array. + * That speeds up the call significantly, when a lot of locks are held + * (e.g pg_dump with a large schema). Otherwise, pass NULL for locallocks, + * and we'll traverse through our hash table to find them. + */ +void +LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks) +{ + ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner); + + Assert(parent != NULL); + + if (locallocks == NULL) + { + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + LockReassignOwner(locallock, parent); + } + else + { + int i; + + for (i = nlocks - 1; i >= 0; i--) + LockReassignOwner(locallocks[i], parent); + } +} + +/* + * Subroutine of LockReassignCurrentOwner. 
Reassigns a given lock belonging to + * CurrentResourceOwner to its parent. + */ +static void +LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent) +{ + LOCALLOCKOWNER *lockOwners; + int i; + int ic = -1; + int ip = -1; + + /* + * Scan to see if there are any locks belonging to current owner or its + * parent + */ + lockOwners = locallock->lockOwners; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == CurrentResourceOwner) + ic = i; + else if (lockOwners[i].owner == parent) + ip = i; + } + + if (ic < 0) + return; /* no current locks */ + + if (ip < 0) + { + /* Parent has no slot, so just give it the child's slot */ + lockOwners[ic].owner = parent; + ResourceOwnerRememberLock(parent, locallock); + } + else + { + /* Merge child's count with parent's */ + lockOwners[ip].nLocks += lockOwners[ic].nLocks; + /* compact out unused slot */ + locallock->numLockOwners--; + if (ic < locallock->numLockOwners) + lockOwners[ic] = lockOwners[locallock->numLockOwners]; + } + ResourceOwnerForgetLock(CurrentResourceOwner, locallock); +} + +/* + * FastPathGrantRelationLock + * Grant lock using per-backend fast-path array, if there is space. + */ +static bool +FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode) +{ + uint32 f; + uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND; + + /* Scan for existing entry for this relid, remembering empty slot. */ + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + if (FAST_PATH_GET_BITS(MyProc, f) == 0) + unused_slot = f; + else if (MyProc->fpRelId[f] == relid) + { + Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)); + FAST_PATH_SET_LOCKMODE(MyProc, f, lockmode); + return true; + } + } + + /* If no existing entry, use any empty slot. */ + if (unused_slot < FP_LOCK_SLOTS_PER_BACKEND) + { + MyProc->fpRelId[unused_slot] = relid; + FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode); + ++FastPathLocalUseCount; + return true; + } + + /* No existing entry, and no empty slot. */ + return false; +} + +/* + * FastPathUnGrantRelationLock + * Release fast-path lock, if present. Update backend-private local + * use count, while we're at it. + */ +static bool +FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode) +{ + uint32 f; + bool result = false; + + FastPathLocalUseCount = 0; + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + if (MyProc->fpRelId[f] == relid + && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) + { + Assert(!result); + FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode); + result = true; + /* we continue iterating so as to update FastPathLocalUseCount */ + } + if (FAST_PATH_GET_BITS(MyProc, f) != 0) + ++FastPathLocalUseCount; + } + return result; +} + +/* + * FastPathTransferRelationLocks + * Transfer locks matching the given lock tag from per-backend fast-path + * arrays to the shared hash table. + * + * Returns true if successful, false if ran out of shared memory. + */ +static bool +FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag, + uint32 hashcode) +{ + LWLock *partitionLock = LockHashPartitionLock(hashcode); + Oid relid = locktag->locktag_field2; + uint32 i; + + /* + * Every PGPROC that can potentially hold a fast-path lock is present in + * ProcGlobal->allProcs. Prepared transactions are not, but any + * outstanding fast-path locks held by prepared transactions are + * transferred to the main lock table. 
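A simplified model of the per-backend fast-path array manipulated by FastPathGrantRelationLock and FastPathUnGrantRelationLock: a fixed number of (relid, mode-bits) slots packed into one bitmap word. The slot count and bit width below are assumptions for illustration; the authoritative layout is given by the FAST_PATH_* macros in the backend sources.

#include <stdbool.h>
#include <stdint.h>

#define FP_SLOTS          16      /* assumed slot count */
#define FP_BITS_PER_SLOT  3       /* weak lock modes tracked per relation */
#define FP_MASK           ((1u << FP_BITS_PER_SLOT) - 1)

typedef struct FastPathArray
{
    uint64_t lockBits;            /* FP_BITS_PER_SLOT bits per slot */
    uint32_t relId[FP_SLOTS];
} FastPathArray;

static inline uint32_t
fp_get_bits(const FastPathArray *fp, int slot)
{
    return (uint32_t) (fp->lockBits >> (FP_BITS_PER_SLOT * slot)) & FP_MASK;
}

static inline void
fp_set_mode(FastPathArray *fp, int slot, int modebit)  /* modebit in 0..2 */
{
    fp->lockBits |= (uint64_t) 1 << (FP_BITS_PER_SLOT * slot + modebit);
}

static inline void
fp_clear_mode(FastPathArray *fp, int slot, int modebit)
{
    fp->lockBits &= ~((uint64_t) 1 << (FP_BITS_PER_SLOT * slot + modebit));
}

/* A grant scans for an existing slot for the relid, else remembers any empty
 * slot (one whose bits are all zero), as FastPathGrantRelationLock does. */
static bool
fp_grant(FastPathArray *fp, uint32_t relid, int modebit)
{
    int unused = FP_SLOTS;

    for (int f = 0; f < FP_SLOTS; f++)
    {
        if (fp_get_bits(fp, f) == 0)
            unused = f;
        else if (fp->relId[f] == relid)
        {
            fp_set_mode(fp, f, modebit);
            return true;
        }
    }
    if (unused < FP_SLOTS)
    {
        fp->relId[unused] = relid;
        fp_set_mode(fp, unused, modebit);
        return true;
    }
    return false;                 /* no room: fall back to the main table */
}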
+ */ + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); + + /* + * If the target backend isn't referencing the same database as the + * lock, then we needn't examine the individual relation IDs at all; + * none of them can be relevant. + * + * proc->databaseId is set at backend startup time and never changes + * thereafter, so it might be safe to perform this test before + * acquiring &proc->fpInfoLock. In particular, it's certainly safe to + * assume that if the target backend holds any fast-path locks, it + * must have performed a memory-fencing operation (in particular, an + * LWLock acquisition) since setting proc->databaseId. However, it's + * less clear that our backend is certain to have performed a memory + * fencing operation since the other backend set proc->databaseId. So + * for now, we test it after acquiring the LWLock just to be safe. + */ + if (proc->databaseId != locktag->locktag_field1) + { + LWLockRelease(&proc->fpInfoLock); + continue; + } + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmode; + + /* Look for an allocated slot matching the given relid. */ + if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0) + continue; + + /* Find or create lock object. */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET; + lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT; + ++lockmode) + { + PROCLOCK *proclock; + + if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode)) + continue; + proclock = SetupLockInTable(lockMethodTable, proc, locktag, + hashcode, lockmode); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&proc->fpInfoLock); + return false; + } + GrantLock(proclock->tag.myLock, proclock, lockmode); + FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode); + } + LWLockRelease(partitionLock); + + /* No need to examine remaining slots. */ + break; + } + LWLockRelease(&proc->fpInfoLock); + } + return true; +} + +/* + * FastPathGetRelationLockEntry + * Return the PROCLOCK for a lock originally taken via the fast-path, + * transferring it to the primary lock table if necessary. + * + * Note: caller takes care of updating the locallock object. + */ +static PROCLOCK * +FastPathGetRelationLockEntry(LOCALLOCK *locallock) +{ + LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD]; + LOCKTAG *locktag = &locallock->tag.lock; + PROCLOCK *proclock = NULL; + LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode); + Oid relid = locktag->locktag_field2; + uint32 f; + + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmode; + + /* Look for an allocated slot matching the given relid. */ + if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0) + continue; + + /* If we don't have a lock of the given mode, forget it! */ + lockmode = locallock->tag.mode; + if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode)) + break; + + /* Find or create lock object. 
*/ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + proclock = SetupLockInTable(lockMethodTable, MyProc, locktag, + locallock->hashcode, lockmode); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&MyProc->fpInfoLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + } + GrantLock(proclock->tag.myLock, proclock, lockmode); + FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode); + + LWLockRelease(partitionLock); + + /* No need to examine remaining slots. */ + break; + } + + LWLockRelease(&MyProc->fpInfoLock); + + /* Lock may have already been transferred by some other backend. */ + if (proclock == NULL) + { + LOCK *lock; + PROCLOCKTAG proclocktag; + uint32 proclock_hashcode; + + LWLockAcquire(partitionLock, LW_SHARED); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + locallock->hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(ERROR, "failed to re-find shared lock object"); + + proclocktag.myLock = lock; + proclocktag.myProc = MyProc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode); + proclock = (PROCLOCK *) + hash_search_with_hash_value(LockMethodProcLockHash, + &proclocktag, + proclock_hashcode, + HASH_FIND, + NULL); + if (!proclock) + elog(ERROR, "failed to re-find shared proclock object"); + LWLockRelease(partitionLock); + } + + return proclock; +} + +/* + * GetLockConflicts + * Get an array of VirtualTransactionIds of xacts currently holding locks + * that would conflict with the specified lock/lockmode. + * xacts merely awaiting such a lock are NOT reported. + * + * The result array is palloc'd and is terminated with an invalid VXID. + * *countp, if not null, is updated to the number of items set. + * + * Of course, the result could be out of date by the time it's returned, so + * use of this function has to be thought about carefully. Similarly, a + * PGPROC with no "lxid" will be considered non-conflicting regardless of any + * lock it holds. Existing callers don't care about a locker after that + * locker's pg_xact updates complete. CommitTransaction() clears "lxid" after + * pg_xact updates and before releasing locks. + * + * Note we never include the current xact's vxid in the result array, + * since an xact never blocks itself. + */ +VirtualTransactionId * +GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp) +{ + static VirtualTransactionId *vxids; + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LockMethod lockMethodTable; + LOCK *lock; + LOCKMASK conflictMask; + dlist_iter proclock_iter; + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; + int count = 0; + int fast_count = 0; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes) + elog(ERROR, "unrecognized lock mode: %d", lockmode); + + /* + * Allocate memory to store results, and fill with InvalidVXID. We only + * need enough space for MaxBackends + max_prepared_xacts + a terminator. + * InHotStandby allocate once in TopMemoryContext. 
+ */ + if (InHotStandby) + { + if (vxids == NULL) + vxids = (VirtualTransactionId *) + MemoryContextAlloc(TopMemoryContext, + sizeof(VirtualTransactionId) * + (MaxBackends + max_prepared_xacts + 1)); + } + else + vxids = (VirtualTransactionId *) + palloc0(sizeof(VirtualTransactionId) * + (MaxBackends + max_prepared_xacts + 1)); + + /* Compute hash code and partition lock, and look up conflicting modes. */ + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + conflictMask = lockMethodTable->conflictTab[lockmode]; + + /* + * Fast path locks might not have been entered in the primary lock table. + * If the lock we're dealing with could conflict with such a lock, we must + * examine each backend's fast-path array for conflicts. + */ + if (ConflictsWithRelationFastPath(locktag, lockmode)) + { + int i; + Oid relid = locktag->locktag_field2; + VirtualTransactionId vxid; + + /* + * Iterate over relevant PGPROCs. Anything held by a prepared + * transaction will have been transferred to the primary lock table, + * so we need not worry about those. This is all a bit fuzzy, because + * new locks could be taken after we've visited a particular + * partition, but the callers had better be prepared to deal with that + * anyway, since the locks could equally well be taken between the + * time we return the value and the time the caller does something + * with it. + */ + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + /* A backend never blocks itself */ + if (proc == MyProc) + continue; + + LWLockAcquire(&proc->fpInfoLock, LW_SHARED); + + /* + * If the target backend isn't referencing the same database as + * the lock, then we needn't examine the individual relation IDs + * at all; none of them can be relevant. + * + * See FastPathTransferRelationLocks() for discussion of why we do + * this test after acquiring the lock. + */ + if (proc->databaseId != locktag->locktag_field1) + { + LWLockRelease(&proc->fpInfoLock); + continue; + } + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++) + { + uint32 lockmask; + + /* Look for an allocated slot matching the given relid. */ + if (relid != proc->fpRelId[f]) + continue; + lockmask = FAST_PATH_GET_BITS(proc, f); + if (!lockmask) + continue; + lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET; + + /* + * There can only be one entry per relation, so if we found it + * and it doesn't conflict, we can skip the rest of the slots. + */ + if ((lockmask & conflictMask) == 0) + break; + + /* Conflict! */ + GET_VXID_FROM_PGPROC(vxid, *proc); + + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + /* else, xact already committed or aborted */ + + /* No need to examine remaining slots. */ + break; + } + + LWLockRelease(&proc->fpInfoLock); + } + } + + /* Remember how many fast-path conflicts we found. */ + fast_count = count; + + /* + * Look up the lock object matching the tag. + */ + LWLockAcquire(partitionLock, LW_SHARED); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + hashcode, + HASH_FIND, + NULL); + if (!lock) + { + /* + * If the lock object doesn't exist, there is nothing holding a lock + * on this lockable object. + */ + LWLockRelease(partitionLock); + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + if (countp) + *countp = count; + return vxids; + } + + /* + * Examine each existing holder (or awaiter) of the lock. 
+ */ + dlist_foreach(proclock_iter, &lock->procLocks) + { + proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur); + + if (conflictMask & proclock->holdMask) + { + PGPROC *proc = proclock->tag.myProc; + + /* A backend never blocks itself */ + if (proc != MyProc) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + + if (VirtualTransactionIdIsValid(vxid)) + { + int i; + + /* Avoid duplicate entries. */ + for (i = 0; i < fast_count; ++i) + if (VirtualTransactionIdEquals(vxids[i], vxid)) + break; + if (i >= fast_count) + vxids[count++] = vxid; + } + /* else, xact already committed or aborted */ + } + } + } + + LWLockRelease(partitionLock); + + if (count > MaxBackends + max_prepared_xacts) /* should never happen */ + elog(PANIC, "too many conflicting locks found"); + + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + if (countp) + *countp = count; + return vxids; +} + +/* + * Find a lock in the shared lock table and release it. It is the caller's + * responsibility to verify that this is a sane thing to do. (For example, it + * would be bad to release a lock here if there might still be a LOCALLOCK + * object with pointers to it.) + * + * We currently use this in two situations: first, to release locks held by + * prepared transactions on commit (see lock_twophase_postcommit); and second, + * to release locks taken via the fast-path, transferred to the main hash + * table, and then released (see LockReleaseAll). + */ +static void +LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc, + LOCKTAG *locktag, LOCKMODE lockmode, + bool decrement_strong_lock_count) +{ + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + uint32 hashcode; + uint32 proclock_hashcode; + LWLock *partitionLock; + bool wakeupNeeded; + + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Re-find the lock object (it had better be there). + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + hashcode, + HASH_FIND, + NULL); + if (!lock) + elog(PANIC, "failed to re-find shared lock object"); + + /* + * Re-find the proclock object (ditto). + */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + &proclocktag, + proclock_hashcode, + HASH_FIND, + NULL); + if (!proclock) + elog(PANIC, "failed to re-find shared proclock object"); + + /* + * Double-check that we are actually holding a lock of the type we want to + * release. + */ + if (!(proclock->holdMask & LOCKBIT_ON(lockmode))) + { + PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock); + LWLockRelease(partitionLock); + elog(WARNING, "you don't own a lock of type %s", + lockMethodTable->lockModeNames[lockmode]); + return; + } + + /* + * Do the releasing. CleanUpLock will waken any now-wakable waiters. + */ + wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable); + + CleanUpLock(lock, proclock, + lockMethodTable, hashcode, + wakeupNeeded); + + LWLockRelease(partitionLock); + + /* + * Decrement strong lock count. This logic is needed only for 2PC. 
+ */ + if (decrement_strong_lock_count + && ConflictsWithRelationFastPath(locktag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0); + FastPathStrongRelationLocks->count[fasthashcode]--; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } +} + +/* + * CheckForSessionAndXactLocks + * Check to see if transaction holds both session-level and xact-level + * locks on the same object; if so, throw an error. + * + * If we have both session- and transaction-level locks on the same object, + * PREPARE TRANSACTION must fail. This should never happen with regular + * locks, since we only take those at session level in some special operations + * like VACUUM. It's possible to hit this with advisory locks, though. + * + * It would be nice if we could keep the session hold and give away the + * transactional hold to the prepared xact. However, that would require two + * PROCLOCK objects, and we cannot be sure that another PROCLOCK will be + * available when it comes time for PostPrepare_Locks to do the deed. + * So for now, we error out while we can still do so safely. + * + * Since the LOCALLOCK table stores a separate entry for each lockmode, + * we can't implement this check by examining LOCALLOCK entries in isolation. + * We must build a transient hashtable that is indexed by locktag only. + */ +static void +CheckForSessionAndXactLocks(void) +{ + typedef struct + { + LOCKTAG lock; /* identifies the lockable object */ + bool sessLock; /* is any lockmode held at session level? */ + bool xactLock; /* is any lockmode held at xact level? */ + } PerLockTagEntry; + + HASHCTL hash_ctl; + HTAB *lockhtab; + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + /* Create a local hash table keyed by LOCKTAG only */ + hash_ctl.keysize = sizeof(LOCKTAG); + hash_ctl.entrysize = sizeof(PerLockTagEntry); + hash_ctl.hcxt = CurrentMemoryContext; + + lockhtab = hash_create("CheckForSessionAndXactLocks table", + 256, /* arbitrary initial size */ + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + + /* Scan local lock table to find entries for each LOCKTAG */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + PerLockTagEntry *hentry; + bool found; + int i; + + /* + * Ignore VXID locks. We don't want those to be held by prepared + * transactions, since they aren't meaningful after a restart. + */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Ignore it if we don't actually hold the lock */ + if (locallock->nLocks <= 0) + continue; + + /* Otherwise, find or make an entry in lockhtab */ + hentry = (PerLockTagEntry *) hash_search(lockhtab, + &locallock->tag.lock, + HASH_ENTER, &found); + if (!found) /* initialize, if newly created */ + hentry->sessLock = hentry->xactLock = false; + + /* Scan to see if we hold lock at session or xact level or both */ + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + hentry->sessLock = true; + else + hentry->xactLock = true; + } + + /* + * We can throw error immediately when we see both types of locks; no + * need to wait around to see if there are more violations. 
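A minimal model of the session-versus-transaction check performed by CheckForSessionAndXactLocks, using a caller-supplied array and a linear scan instead of a dynahash table; the tag representation is a placeholder and the whole sketch is illustrative only.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

typedef struct TagFlags
{
    char tag[32];      /* stand-in for LOCKTAG */
    bool sessLock;     /* held at session level? */
    bool xactLock;     /* held at transaction level? */
} TagFlags;

/* Record one held lock in a caller-supplied table (assumed big enough).
 * Returns false as soon as some tag turns out to be held at both levels,
 * which is the condition that makes PREPARE TRANSACTION fail. */
static bool
note_lock(TagFlags *tab, int *ntags, const char *tag, bool session_level)
{
    int i;

    for (i = 0; i < *ntags; i++)
        if (strcmp(tab[i].tag, tag) == 0)
            break;
    if (i == *ntags)
    {
        snprintf(tab[i].tag, sizeof(tab[i].tag), "%s", tag);
        tab[i].sessLock = tab[i].xactLock = false;
        (*ntags)++;
    }
    if (session_level)
        tab[i].sessLock = true;
    else
        tab[i].xactLock = true;
    return !(tab[i].sessLock && tab[i].xactLock);
}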
+ */ + if (hentry->sessLock && hentry->xactLock) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + } + + /* Success, so clean up */ + hash_destroy(lockhtab); +} + +/* + * AtPrepare_Locks + * Do the preparatory work for a PREPARE: make 2PC state file records + * for all locks currently held. + * + * Session-level locks are ignored, as are VXID locks. + * + * For the most part, we don't need to touch shared memory for this --- + * all the necessary state information is in the locallock table. + * Fast-path locks are an exception, however: we move any such locks to + * the main table before allowing PREPARE TRANSACTION to succeed. + */ +void +AtPrepare_Locks(void) +{ + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + + /* First, verify there aren't locks of both xact and session level */ + CheckForSessionAndXactLocks(); + + /* Now do the per-locallock cleanup work */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + TwoPhaseLockRecord record; + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + bool haveSessionLock; + bool haveXactLock; + int i; + + /* + * Ignore VXID locks. We don't want those to be held by prepared + * transactions, since they aren't meaningful after a restart. + */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Ignore it if we don't actually hold the lock */ + if (locallock->nLocks <= 0) + continue; + + /* Scan to see whether we hold it at session or transaction level */ + haveSessionLock = haveXactLock = false; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + haveSessionLock = true; + else + haveXactLock = true; + } + + /* Ignore it if we have only session lock */ + if (!haveXactLock) + continue; + + /* This can't happen, because we already checked it */ + if (haveSessionLock) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + + /* + * If the local lock was taken via the fast-path, we need to move it + * to the primary lock table, or just get a pointer to the existing + * primary lock table entry if by chance it's already been + * transferred. + */ + if (locallock->proclock == NULL) + { + locallock->proclock = FastPathGetRelationLockEntry(locallock); + locallock->lock = locallock->proclock->tag.myLock; + } + + /* + * Arrange to not release any strong lock count held by this lock + * entry. We must retain the count until the prepared transaction is + * committed or rolled back. + */ + locallock->holdsStrongLockCount = false; + + /* + * Create a 2PC record. + */ + memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG)); + record.lockmode = locallock->tag.mode; + + RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0, + &record, sizeof(TwoPhaseLockRecord)); + } +} + +/* + * PostPrepare_Locks + * Clean up after successful PREPARE + * + * Here, we want to transfer ownership of our locks to a dummy PGPROC + * that's now associated with the prepared transaction, and we want to + * clean out the corresponding entries in the LOCALLOCK table. + * + * Note: by removing the LOCALLOCK entries, we are leaving dangling + * pointers in the transaction's resource owner. 
This is OK at the + * moment since resowner.c doesn't try to free locks retail at a toplevel + * transaction commit or abort. We could alternatively zero out nLocks + * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll, + * but that probably costs more cycles. + */ +void +PostPrepare_Locks(TransactionId xid) +{ + PGPROC *newproc = TwoPhaseGetDummyProc(xid, false); + HASH_SEQ_STATUS status; + LOCALLOCK *locallock; + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + int partition; + + /* Can't prepare a lock group follower. */ + Assert(MyProc->lockGroupLeader == NULL || + MyProc->lockGroupLeader == MyProc); + + /* This is a critical section: any error means big trouble */ + START_CRIT_SECTION(); + + /* + * First we run through the locallock table and get rid of unwanted + * entries, then we scan the process's proclocks and transfer them to the + * target proc. + * + * We do this separately because we may have multiple locallock entries + * pointing to the same proclock, and we daren't end up with any dangling + * pointers. + */ + hash_seq_init(&status, LockMethodLocalHash); + + while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL) + { + LOCALLOCKOWNER *lockOwners = locallock->lockOwners; + bool haveSessionLock; + bool haveXactLock; + int i; + + if (locallock->proclock == NULL || locallock->lock == NULL) + { + /* + * We must've run out of shared memory while trying to set up this + * lock. Just forget the local entry. + */ + Assert(locallock->nLocks == 0); + RemoveLocalLock(locallock); + continue; + } + + /* Ignore VXID locks */ + if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + /* Scan to see whether we hold it at session or transaction level */ + haveSessionLock = haveXactLock = false; + for (i = locallock->numLockOwners - 1; i >= 0; i--) + { + if (lockOwners[i].owner == NULL) + haveSessionLock = true; + else + haveXactLock = true; + } + + /* Ignore it if we have only session lock */ + if (!haveXactLock) + continue; + + /* This can't happen, because we already checked it */ + if (haveSessionLock) + ereport(PANIC, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object"))); + + /* Mark the proclock to show we need to release this lockmode */ + if (locallock->nLocks > 0) + locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode); + + /* And remove the locallock hashtable entry */ + RemoveLocalLock(locallock); + } + + /* + * Now, scan each lock partition separately. + */ + for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++) + { + LWLock *partitionLock; + dlist_head *procLocks = &(MyProc->myProcLocks[partition]); + dlist_mutable_iter proclock_iter; + + partitionLock = LockHashPartitionLockByIndex(partition); + + /* + * If the proclock list for this partition is empty, we can skip + * acquiring the partition lock. This optimization is safer than the + * situation in LockReleaseAll, because we got rid of any fast-path + * locks during AtPrepare_Locks, so there cannot be any case where + * another backend is adding something to our lists now. For safety, + * though, we code this the same way as in LockReleaseAll. 
+ */ + if (dlist_is_empty(procLocks)) + continue; /* needn't examine this partition */ + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + dlist_foreach_modify(proclock_iter, procLocks) + { + proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur); + + Assert(proclock->tag.myProc == MyProc); + + lock = proclock->tag.myLock; + + /* Ignore VXID locks */ + if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION) + continue; + + PROCLOCK_PRINT("PostPrepare_Locks", proclock); + LOCK_PRINT("PostPrepare_Locks", lock, 0); + Assert(lock->nRequested >= 0); + Assert(lock->nGranted >= 0); + Assert(lock->nGranted <= lock->nRequested); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + + /* Ignore it if nothing to release (must be a session lock) */ + if (proclock->releaseMask == 0) + continue; + + /* Else we should be releasing all locks */ + if (proclock->releaseMask != proclock->holdMask) + elog(PANIC, "we seem to have dropped a bit somewhere"); + + /* + * We cannot simply modify proclock->tag.myProc to reassign + * ownership of the lock, because that's part of the hash key and + * the proclock would then be in the wrong hash chain. Instead + * use hash_update_hash_key. (We used to create a new hash entry, + * but that risks out-of-memory failure if other processes are + * busy making proclocks too.) We must unlink the proclock from + * our procLink chain and put it into the new proc's chain, too. + * + * Note: the updated proclock hash key will still belong to the + * same hash partition, cf proclock_hash(). So the partition lock + * we already hold is sufficient for this. + */ + dlist_delete(&proclock->procLink); + + /* + * Create the new hash key for the proclock. + */ + proclocktag.myLock = lock; + proclocktag.myProc = newproc; + + /* + * Update groupLeader pointer to point to the new proc. (We'd + * better not be a member of somebody else's lock group!) + */ + Assert(proclock->groupLeader == proclock->tag.myProc); + proclock->groupLeader = newproc; + + /* + * Update the proclock. We should not find any existing entry for + * the same hash key, since there can be only one entry for any + * given lock with my own proc. + */ + if (!hash_update_hash_key(LockMethodProcLockHash, + proclock, + &proclocktag)) + elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks"); + + /* Re-link into the new proc's proclock list */ + dlist_push_tail(&newproc->myProcLocks[partition], &proclock->procLink); + + PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock); + } /* loop over PROCLOCKs within this partition */ + + LWLockRelease(partitionLock); + } /* loop over partitions */ + + END_CRIT_SECTION(); +} + + +/* + * Estimate shared-memory space used for lock tables + */ +Size +LockShmemSize(void) +{ + Size size = 0; + long max_table_size; + + /* lock hash table */ + max_table_size = NLOCKENTS(); + size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK))); + + /* proclock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK))); + + /* + * Since NLOCKENTS is only an estimate, add 10% safety margin. + */ + size = add_size(size, size / 10); + + return size; +} + +/* + * GetLockStatusData - Return a summary of the lock manager's internal + * status, for use in a user-level reporting function. + * + * The return data consists of an array of LockInstanceData objects, + * which are a lightly abstracted version of the PROCLOCK data structures, + * i.e. 
there is one entry for each unique lock and interested PGPROC. + * It is the caller's responsibility to match up related items (such as + * references to the same lockable object or PGPROC) if wanted. + * + * The design goal is to hold the LWLocks for as short a time as possible; + * thus, this function simply makes a copy of the necessary data and releases + * the locks, allowing the caller to contemplate and format the data for as + * long as it pleases. + */ +LockData * +GetLockStatusData(void) +{ + LockData *data; + PROCLOCK *proclock; + HASH_SEQ_STATUS seqstat; + int els; + int el; + int i; + + data = (LockData *) palloc(sizeof(LockData)); + + /* Guess how much space we'll need. */ + els = MaxBackends; + el = 0; + data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els); + + /* + * First, we iterate through the per-backend fast-path arrays, locking + * them one at a time. This might produce an inconsistent picture of the + * system state, but taking all of those LWLocks at the same time seems + * impractical (in particular, note MAX_SIMUL_LWLOCKS). It shouldn't + * matter too much, because none of these locks can be involved in lock + * conflicts anyway - anything that might must be present in the main lock + * table. (For the same reason, we don't sweat about making leaderPid + * completely valid. We cannot safely dereference another backend's + * lockGroupLeader field without holding all lock partition locks, and + * it's not worth that.) + */ + for (i = 0; i < ProcGlobal->allProcCount; ++i) + { + PGPROC *proc = &ProcGlobal->allProcs[i]; + uint32 f; + + LWLockAcquire(&proc->fpInfoLock, LW_SHARED); + + for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f) + { + LockInstanceData *instance; + uint32 lockbits = FAST_PATH_GET_BITS(proc, f); + + /* Skip unallocated slots. */ + if (!lockbits) + continue; + + if (el >= els) + { + els += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + instance = &data->locks[el]; + SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId, + proc->fpRelId[f]); + instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET; + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proc->pid; + instance->fastpath = true; + + /* + * Successfully taking fast path lock means there were no + * conflicting locks. + */ + instance->waitStart = 0; + + el++; + } + + if (proc->fpVXIDLock) + { + VirtualTransactionId vxid; + LockInstanceData *instance; + + if (el >= els) + { + els += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + vxid.backendId = proc->backendId; + vxid.localTransactionId = proc->fpLocalTransactionId; + + instance = &data->locks[el]; + SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid); + instance->holdMask = LOCKBIT_ON(ExclusiveLock); + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proc->pid; + instance->fastpath = true; + instance->waitStart = 0; + + el++; + } + + LWLockRelease(&proc->fpInfoLock); + } + + /* + * Next, acquire lock on the entire shared lock data structure. We do + * this so that, at least for locks in the primary lock table, the state + * will be self-consistent. + * + * Since this is a read-only operation, we take shared instead of + * exclusive lock. 
There's not a whole lot of point to this, because all + * the normal operations require exclusive lock, but it doesn't hurt + * anything either. It will at least allow two backends to do + * GetLockStatusData in parallel. + * + * Must grab LWLocks in partition-number order to avoid LWLock deadlock. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* Now we can safely count the number of proclocks */ + data->nelements = el + hash_get_num_entries(LockMethodProcLockHash); + if (data->nelements > els) + { + els = data->nelements; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * els); + } + + /* Now scan the tables to copy the data */ + hash_seq_init(&seqstat, LockMethodProcLockHash); + + while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + { + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + LockInstanceData *instance = &data->locks[el]; + + memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG)); + instance->holdMask = proclock->holdMask; + if (proc->waitLock == proclock->tag.myLock) + instance->waitLockMode = proc->waitLockMode; + else + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proclock->groupLeader->pid; + instance->fastpath = false; + instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart); + + el++; + } + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + Assert(el == data->nelements); + + return data; +} + +/* + * GetBlockerStatusData - Return a summary of the lock manager's state + * concerning locks that are blocking the specified PID or any member of + * the PID's lock group, for use in a user-level reporting function. + * + * For each PID within the lock group that is awaiting some heavyweight lock, + * the return data includes an array of LockInstanceData objects, which are + * the same data structure used by GetLockStatusData; but unlike that function, + * this one reports only the PROCLOCKs associated with the lock that that PID + * is blocked on. (Hence, all the locktags should be the same for any one + * blocked PID.) In addition, we return an array of the PIDs of those backends + * that are ahead of the blocked PID in the lock's wait queue. These can be + * compared with the PIDs in the LockInstanceData objects to determine which + * waiters are ahead of or behind the blocked PID in the queue. + * + * If blocked_pid isn't a valid backend PID or nothing in its lock group is + * waiting on any heavyweight lock, return empty arrays. + * + * The design goal is to hold the LWLocks for as short a time as possible; + * thus, this function simply makes a copy of the necessary data and releases + * the locks, allowing the caller to contemplate and format the data for as + * long as it pleases. + */ +BlockedProcsData * +GetBlockerStatusData(int blocked_pid) +{ + BlockedProcsData *data; + PGPROC *proc; + int i; + + data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData)); + + /* + * Guess how much space we'll need, and preallocate. 
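As a usage illustration for GetLockStatusData() above (a hedged sketch, not part of the patch; log_lock_summary is a hypothetical helper name, while the struct fields are exactly the ones populated by the code above): a caller simply walks the returned array and does any cross-referencing itself.

	static void
	log_lock_summary(void)
	{
		LockData   *lockData = GetLockStatusData();

		for (int i = 0; i < lockData->nelements; i++)
		{
			LockInstanceData *instance = &lockData->locks[i];

			elog(DEBUG1, "pid %d: holdMask 0x%x waitLockMode %d fastpath %d",
				 instance->pid,
				 (unsigned int) instance->holdMask,
				 (int) instance->waitLockMode,
				 (int) instance->fastpath);
		}
	}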
Most of the time + * this will avoid needing to do repalloc while holding the LWLocks. (We + * assume, but check with an Assert, that MaxBackends is enough entries + * for the procs[] array; the other two could need enlargement, though.) + */ + data->nprocs = data->nlocks = data->npids = 0; + data->maxprocs = data->maxlocks = data->maxpids = MaxBackends; + data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs); + data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks); + data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids); + + /* + * In order to search the ProcArray for blocked_pid and assume that that + * entry won't immediately disappear under us, we must hold ProcArrayLock. + * In addition, to examine the lock grouping fields of any other backend, + * we must hold all the hash partition locks. (Only one of those locks is + * actually relevant for any one lock group, but we can't know which one + * ahead of time.) It's fairly annoying to hold all those locks + * throughout this, but it's no worse than GetLockStatusData(), and it + * does have the advantage that we're guaranteed to return a + * self-consistent instantaneous state. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + proc = BackendPidGetProcWithLock(blocked_pid); + + /* Nothing to do if it's gone */ + if (proc != NULL) + { + /* + * Acquire lock on the entire shared lock data structure. See notes + * in GetLockStatusData(). + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + if (proc->lockGroupLeader == NULL) + { + /* Easy case, proc is not a lock group member */ + GetSingleProcBlockerStatusData(proc, data); + } + else + { + /* Examine all procs in proc's lock group */ + dlist_iter iter; + + dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers) + { + PGPROC *memberProc; + + memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur); + GetSingleProcBlockerStatusData(memberProc, data); + } + } + + /* + * And release locks. See notes in GetLockStatusData(). + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + Assert(data->nprocs <= data->maxprocs); + } + + LWLockRelease(ProcArrayLock); + + return data; +} + +/* Accumulate data about one possibly-blocked proc for GetBlockerStatusData */ +static void +GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data) +{ + LOCK *theLock = blocked_proc->waitLock; + BlockedProcData *bproc; + dlist_iter proclock_iter; + dlist_iter proc_iter; + dclist_head *waitQueue; + int queue_size; + + /* Nothing to do if this proc is not blocked */ + if (theLock == NULL) + return; + + /* Set up a procs[] element */ + bproc = &data->procs[data->nprocs++]; + bproc->pid = blocked_proc->pid; + bproc->first_lock = data->nlocks; + bproc->first_waiter = data->npids; + + /* + * We may ignore the proc's fast-path arrays, since nothing in those could + * be related to a contended lock. 
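Before the PROCLOCK-collection loop below, here is a hedged sketch of how a caller (in the spirit of the code behind pg_blocking_pids()) might walk the arrays that GetBlockerStatusData() and this helper fill in; log_blockers_of is a hypothetical name, and the struct fields are the ones maintained here:

	static void
	log_blockers_of(int blocked_pid)
	{
		BlockedProcsData *data = GetBlockerStatusData(blocked_pid);

		for (int p = 0; p < data->nprocs; p++)
		{
			BlockedProcData *bproc = &data->procs[p];

			/* PIDs queued ahead of this process on the lock it waits for */
			for (int w = 0; w < bproc->num_waiters; w++)
				elog(DEBUG1, "pid %d: ahead in queue: pid %d",
					 bproc->pid,
					 data->waiter_pids[bproc->first_waiter + w]);

			/* every PROCLOCK attached to that lock (holders and waiters) */
			for (int l = 0; l < bproc->num_locks; l++)
				elog(DEBUG1, "pid %d: related proclock belongs to pid %d",
					 bproc->pid,
					 data->locks[bproc->first_lock + l].pid);
		}
	}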
+ */ + + /* Collect all PROCLOCKs associated with theLock */ + dlist_foreach(proclock_iter, &theLock->procLocks) + { + PROCLOCK *proclock = + dlist_container(PROCLOCK, lockLink, proclock_iter.cur); + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + LockInstanceData *instance; + + if (data->nlocks >= data->maxlocks) + { + data->maxlocks += MaxBackends; + data->locks = (LockInstanceData *) + repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks); + } + + instance = &data->locks[data->nlocks]; + memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG)); + instance->holdMask = proclock->holdMask; + if (proc->waitLock == lock) + instance->waitLockMode = proc->waitLockMode; + else + instance->waitLockMode = NoLock; + instance->backend = proc->backendId; + instance->lxid = proc->lxid; + instance->pid = proc->pid; + instance->leaderPid = proclock->groupLeader->pid; + instance->fastpath = false; + data->nlocks++; + } + + /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */ + waitQueue = &(theLock->waitProcs); + queue_size = dclist_count(waitQueue); + + if (queue_size > data->maxpids - data->npids) + { + data->maxpids = Max(data->maxpids + MaxBackends, + data->npids + queue_size); + data->waiter_pids = (int *) repalloc(data->waiter_pids, + sizeof(int) * data->maxpids); + } + + /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */ + dclist_foreach(proc_iter, waitQueue) + { + PGPROC *queued_proc = dlist_container(PGPROC, links, proc_iter.cur); + + if (queued_proc == blocked_proc) + break; + data->waiter_pids[data->npids++] = queued_proc->pid; + queued_proc = (PGPROC *) queued_proc->links.next; + } + + bproc->num_locks = data->nlocks - bproc->first_lock; + bproc->num_waiters = data->npids - bproc->first_waiter; +} + +/* + * Returns a list of currently held AccessExclusiveLocks, for use by + * LogStandbySnapshot(). The result is a palloc'd array, + * with the number of elements returned into *nlocks. + * + * XXX This currently takes a lock on all partitions of the lock table, + * but it's possible to do better. By reference counting locks and storing + * the value in the ProcArray entry for each backend we could tell if any + * locks need recording without having to acquire the partition locks and + * scan the lock table. Whether that's worth the additional overhead + * is pretty dubious though. + */ +xl_standby_lock * +GetRunningTransactionLocks(int *nlocks) +{ + xl_standby_lock *accessExclusiveLocks; + PROCLOCK *proclock; + HASH_SEQ_STATUS seqstat; + int i; + int index; + int els; + + /* + * Acquire lock on the entire shared lock data structure. + * + * Must grab LWLocks in partition-number order to avoid LWLock deadlock. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED); + + /* Now we can safely count the number of proclocks */ + els = hash_get_num_entries(LockMethodProcLockHash); + + /* + * Allocating enough space for all locks in the lock table is overkill, + * but it's more convenient and faster than having to enlarge the array. + */ + accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock)); + + /* Now scan the tables to copy the data */ + hash_seq_init(&seqstat, LockMethodProcLockHash); + + /* + * If lock is a currently granted AccessExclusiveLock then it will have + * just one proclock holder, so locks are never accessed twice in this + * particular case. 
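The scan below fills an array whose intended consumer is LogStandbySnapshot(), per the function comment; a minimal, hypothetical consumer of the returned data looks roughly like this (using the xl_standby_lock fields as assigned below):

	int			nlocks;
	xl_standby_lock *locks = GetRunningTransactionLocks(&nlocks);

	for (int i = 0; i < nlocks; i++)
		elog(DEBUG1, "xid %u holds AccessExclusiveLock on %u/%u",
			 locks[i].xid, locks[i].dbOid, locks[i].relOid);

	pfree(locks);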
Don't copy this code for use elsewhere because in the + * general case this will give you duplicate locks when looking at + * non-exclusive lock types. + */ + index = 0; + while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat))) + { + /* make sure this definition matches the one used in LockAcquire */ + if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) && + proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION) + { + PGPROC *proc = proclock->tag.myProc; + LOCK *lock = proclock->tag.myLock; + TransactionId xid = proc->xid; + + /* + * Don't record locks for transactions if we know they have + * already issued their WAL record for commit but not yet released + * lock. It is still possible that we see locks held by already + * complete transactions, if they haven't yet zeroed their xids. + */ + if (!TransactionIdIsValid(xid)) + continue; + + accessExclusiveLocks[index].xid = xid; + accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1; + accessExclusiveLocks[index].relOid = lock->tag.locktag_field2; + + index++; + } + } + + Assert(index <= els); + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); + + *nlocks = index; + return accessExclusiveLocks; +} + +/* Provide the textual name of any lock mode */ +const char * +GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode) +{ + Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods)); + Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes); + return LockMethods[lockmethodid]->lockModeNames[mode]; +} + +#ifdef LOCK_DEBUG +/* + * Dump all locks in the given proc's myProcLocks lists. + * + * Caller is responsible for having acquired appropriate LWLocks. + */ +void +DumpLocks(PGPROC *proc) +{ + int i; + + if (proc == NULL) + return; + + if (proc->waitLock) + LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0); + + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + { + dlist_head *procLocks = &proc->myProcLocks[i]; + dlist_iter iter; + + dlist_foreach(iter, procLocks) + { + PROCLOCK *proclock = dlist_container(PROCLOCK, procLink, iter.cur); + LOCK *lock = proclock->tag.myLock; + + Assert(proclock->tag.myProc == proc); + PROCLOCK_PRINT("DumpLocks", proclock); + LOCK_PRINT("DumpLocks", lock, 0); + } + } +} + +/* + * Dump all lmgr locks. + * + * Caller is responsible for having acquired appropriate LWLocks. + */ +void +DumpAllLocks(void) +{ + PGPROC *proc; + PROCLOCK *proclock; + LOCK *lock; + HASH_SEQ_STATUS status; + + proc = MyProc; + + if (proc && proc->waitLock) + LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0); + + hash_seq_init(&status, LockMethodProcLockHash); + + while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL) + { + PROCLOCK_PRINT("DumpAllLocks", proclock); + + lock = proclock->tag.myLock; + if (lock) + LOCK_PRINT("DumpAllLocks", lock, 0); + else + elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL"); + } +} +#endif /* LOCK_DEBUG */ + +/* + * LOCK 2PC resource manager's routines + */ + +/* + * Re-acquire a lock belonging to a transaction that was prepared. 
+ * + * Because this function is run at db startup, re-acquiring the locks should + * never conflict with running transactions because there are none. We + * assume that the lock state represented by the stored 2PC files is legal. + * + * When switching from Hot Standby mode to normal operation, the locks will + * be already held by the startup process. The locks are acquired for the new + * procs without checking for conflicts, so we don't get a conflict between the + * startup process and the dummy procs, even though we will momentarily have + * a situation where two procs are holding the same AccessExclusiveLock, + * which isn't normally possible because the conflict. If we're in standby + * mode, but a recovery snapshot hasn't been established yet, it's possible + * that some but not all of the locks are already held by the startup process. + * + * This approach is simple, but also a bit dangerous, because if there isn't + * enough shared memory to acquire the locks, an error will be thrown, which + * is promoted to FATAL and recovery will abort, bringing down postmaster. + * A safer approach would be to transfer the locks like we do in + * AtPrepare_Locks, but then again, in hot standby mode it's possible for + * read-only backends to use up all the shared lock memory anyway, so that + * replaying the WAL record that needs to acquire a lock will throw an error + * and PANIC anyway. + */ +void +lock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, false); + LOCKTAG *locktag; + LOCKMODE lockmode; + LOCKMETHODID lockmethodid; + LOCK *lock; + PROCLOCK *proclock; + PROCLOCKTAG proclocktag; + bool found; + uint32 hashcode; + uint32 proclock_hashcode; + int partition; + LWLock *partitionLock; + LockMethod lockMethodTable; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmode = rec->lockmode; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + + hashcode = LockTagHashCode(locktag); + partition = LockHashPartition(hashcode); + partitionLock = LockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * Find or create a lock with this tag. + */ + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + hashcode, + HASH_ENTER_NULL, + &found); + if (!lock) + { + LWLockRelease(partitionLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + } + + /* + * if it's a new lock object, initialize it + */ + if (!found) + { + lock->grantMask = 0; + lock->waitMask = 0; + dlist_init(&lock->procLocks); + dclist_init(&lock->waitProcs); + lock->nRequested = 0; + lock->nGranted = 0; + MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES); + MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES); + LOCK_PRINT("lock_twophase_recover: new", lock, lockmode); + } + else + { + LOCK_PRINT("lock_twophase_recover: found", lock, lockmode); + Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0)); + Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0)); + Assert(lock->nGranted <= lock->nRequested); + } + + /* + * Create the hash key for the proclock table. 
+ */ + proclocktag.myLock = lock; + proclocktag.myProc = proc; + + proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode); + + /* + * Find or create a proclock entry with this tag + */ + proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash, + &proclocktag, + proclock_hashcode, + HASH_ENTER_NULL, + &found); + if (!proclock) + { + /* Oops, not enough shmem for the proclock */ + if (lock->nRequested == 0) + { + /* + * There are no other requestors of this lock, so garbage-collect + * the lock object. We *must* do this to avoid a permanent leak + * of shared memory, because there won't be anything to cause + * anyone to release the lock object later. + */ + Assert(dlist_is_empty(&lock->procLocks)); + if (!hash_search_with_hash_value(LockMethodLockHash, + &(lock->tag), + hashcode, + HASH_REMOVE, + NULL)) + elog(PANIC, "lock table corrupted"); + } + LWLockRelease(partitionLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + } + + /* + * If new, initialize the new entry + */ + if (!found) + { + Assert(proc->lockGroupLeader == NULL); + proclock->groupLeader = proc; + proclock->holdMask = 0; + proclock->releaseMask = 0; + /* Add proclock to appropriate lists */ + dlist_push_tail(&lock->procLocks, &proclock->lockLink); + dlist_push_tail(&proc->myProcLocks[partition], + &proclock->procLink); + PROCLOCK_PRINT("lock_twophase_recover: new", proclock); + } + else + { + PROCLOCK_PRINT("lock_twophase_recover: found", proclock); + Assert((proclock->holdMask & ~lock->grantMask) == 0); + } + + /* + * lock->nRequested and lock->requested[] count the total number of + * requests, whether granted or waiting, so increment those immediately. + */ + lock->nRequested++; + lock->requested[lockmode]++; + Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0)); + + /* + * We shouldn't already hold the desired lock. + */ + if (proclock->holdMask & LOCKBIT_ON(lockmode)) + elog(ERROR, "lock %s on object %u/%u/%u is already held", + lockMethodTable->lockModeNames[lockmode], + lock->tag.locktag_field1, lock->tag.locktag_field2, + lock->tag.locktag_field3); + + /* + * We ignore any possible conflicts and just grant ourselves the lock. Not + * only because we don't bother, but also to avoid deadlocks when + * switching from standby to normal mode. See function comment. + */ + GrantLock(lock, proclock, lockmode); + + /* + * Bump strong lock count, to make sure any fast-path lock requests won't + * be granted without consulting the primary lock table. + */ + if (ConflictsWithRelationFastPath(&lock->tag, lockmode)) + { + uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode); + + SpinLockAcquire(&FastPathStrongRelationLocks->mutex); + FastPathStrongRelationLocks->count[fasthashcode]++; + SpinLockRelease(&FastPathStrongRelationLocks->mutex); + } + + LWLockRelease(partitionLock); +} + +/* + * Re-acquire a lock belonging to a transaction that was prepared, when + * starting up into hot standby mode. 
+ */ +void +lock_twophase_standby_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + LOCKTAG *locktag; + LOCKMODE lockmode; + LOCKMETHODID lockmethodid; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmode = rec->lockmode; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + if (lockmode == AccessExclusiveLock && + locktag->locktag_type == LOCKTAG_RELATION) + { + StandbyAcquireAccessExclusiveLock(xid, + locktag->locktag_field1 /* dboid */ , + locktag->locktag_field2 /* reloid */ ); + } +} + + +/* + * 2PC processing routine for COMMIT PREPARED case. + * + * Find and release the lock indicated by the 2PC record. + */ +void +lock_twophase_postcommit(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata; + PGPROC *proc = TwoPhaseGetDummyProc(xid, true); + LOCKTAG *locktag; + LOCKMETHODID lockmethodid; + LockMethod lockMethodTable; + + Assert(len == sizeof(TwoPhaseLockRecord)); + locktag = &rec->locktag; + lockmethodid = locktag->locktag_lockmethodid; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + lockMethodTable = LockMethods[lockmethodid]; + + LockRefindAndRelease(lockMethodTable, proc, locktag, rec->lockmode, true); +} + +/* + * 2PC processing routine for ROLLBACK PREPARED case. + * + * This is actually just the same as the COMMIT case. + */ +void +lock_twophase_postabort(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + lock_twophase_postcommit(xid, info, recdata, len); +} + +/* + * VirtualXactLockTableInsert + * + * Take vxid lock via the fast-path. There can't be any pre-existing + * lockers, as we haven't advertised this vxid via the ProcArray yet. + * + * Since MyProc->fpLocalTransactionId will normally contain the same data + * as MyProc->lxid, you might wonder if we really need both. The + * difference is that MyProc->lxid is set and cleared unlocked, and + * examined by procarray.c, while fpLocalTransactionId is protected by + * fpInfoLock and is used only by the locking subsystem. Doing it this + * way makes it easier to verify that there are no funny race conditions. + * + * We don't bother recording this lock in the local lock table, since it's + * only ever released at the end of a transaction. Instead, + * LockReleaseAll() calls VirtualXactLockTableCleanup(). + */ +void +VirtualXactLockTableInsert(VirtualTransactionId vxid) +{ + Assert(VirtualTransactionIdIsValid(vxid)); + + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + Assert(MyProc->backendId == vxid.backendId); + Assert(MyProc->fpLocalTransactionId == InvalidLocalTransactionId); + Assert(MyProc->fpVXIDLock == false); + + MyProc->fpVXIDLock = true; + MyProc->fpLocalTransactionId = vxid.localTransactionId; + + LWLockRelease(&MyProc->fpInfoLock); +} + +/* + * VirtualXactLockTableCleanup + * + * Check whether a VXID lock has been materialized; if so, release it, + * unblocking waiters. + */ +void +VirtualXactLockTableCleanup(void) +{ + bool fastpath; + LocalTransactionId lxid; + + Assert(MyProc->backendId != InvalidBackendId); + + /* + * Clean up shared memory state. 
+ */ + LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE); + + fastpath = MyProc->fpVXIDLock; + lxid = MyProc->fpLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + + LWLockRelease(&MyProc->fpInfoLock); + + /* + * If fpVXIDLock has been cleared without touching fpLocalTransactionId, + * that means someone transferred the lock to the main lock table. + */ + if (!fastpath && LocalTransactionIdIsValid(lxid)) + { + VirtualTransactionId vxid; + LOCKTAG locktag; + + vxid.backendId = MyBackendId; + vxid.localTransactionId = lxid; + SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid); + + LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc, + &locktag, ExclusiveLock, false); + } +} + +/* + * XactLockForVirtualXact + * + * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid, + * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid). Unlike those + * functions, it assumes "xid" is never a subtransaction and that "xid" is + * prepared, committed, or aborted. + * + * If !TransactionIdIsValid(xid), this locks every prepared XID having been + * known as "vxid" before its PREPARE TRANSACTION. + */ +static bool +XactLockForVirtualXact(VirtualTransactionId vxid, + TransactionId xid, bool wait) +{ + bool more = false; + + /* There is no point to wait for 2PCs if you have no 2PCs. */ + if (max_prepared_xacts == 0) + return true; + + do + { + LockAcquireResult lar; + LOCKTAG tag; + + /* Clear state from previous iterations. */ + if (more) + { + xid = InvalidTransactionId; + more = false; + } + + /* If we have no xid, try to find one. */ + if (!TransactionIdIsValid(xid)) + xid = TwoPhaseGetXidByVirtualXID(vxid, &more); + if (!TransactionIdIsValid(xid)) + { + Assert(!more); + return true; + } + + /* Check or wait for XID completion. */ + SET_LOCKTAG_TRANSACTION(tag, xid); + lar = LockAcquire(&tag, ShareLock, false, !wait); + if (lar == LOCKACQUIRE_NOT_AVAIL) + return false; + LockRelease(&tag, ShareLock, false); + } while (more); + + return true; +} + +/* + * VirtualXactLock + * + * If wait = true, wait as long as the given VXID or any XID acquired by the + * same transaction is still running. Then, return true. + * + * If wait = false, just check whether that VXID or one of those XIDs is still + * running, and return true or false. + */ +bool +VirtualXactLock(VirtualTransactionId vxid, bool wait) +{ + LOCKTAG tag; + PGPROC *proc; + TransactionId xid = InvalidTransactionId; + + Assert(VirtualTransactionIdIsValid(vxid)); + + if (VirtualTransactionIdIsRecoveredPreparedXact(vxid)) + /* no vxid lock; localTransactionId is a normal, locked XID */ + return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait); + + SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid); + + /* + * If a lock table entry must be made, this is the PGPROC on whose behalf + * it must be done. Note that the transaction might end or the PGPROC + * might be reassigned to a new backend before we get around to examining + * it, but it doesn't matter. If we find upon examination that the + * relevant lxid is no longer running here, that's enough to prove that + * it's no longer running anywhere. + */ + proc = BackendIdGetProc(vxid.backendId); + if (proc == NULL) + return XactLockForVirtualXact(vxid, InvalidTransactionId, wait); + + /* + * We must acquire this lock before checking the backendId and lxid + * against the ones we're waiting for. The target backend will only set + * or clear lxid while holding this lock. 
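To tie VirtualXactLockTableInsert(), VirtualXactLockTableCleanup() and VirtualXactLock() together before the function body continues, here is a hedged life-cycle sketch; the control flow is illustrative only, with the real call sites being transaction start/end and lock waiters such as WaitForLockers():

	VirtualTransactionId vxid;

	/* at transaction start, advertise our VXID lock via the fast path */
	vxid.backendId = MyBackendId;
	vxid.localTransactionId = MyProc->lxid;
	VirtualXactLockTableInsert(vxid);

	/* another backend that captured this vxid can block until it ends ... */
	VirtualXactLock(vxid, true);

	/* ... or merely poll; per the code below, false means still running */
	if (!VirtualXactLock(vxid, false))
		elog(DEBUG1, "virtual transaction still in progress");

	/* at transaction end, LockReleaseAll() runs VirtualXactLockTableCleanup() */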
+ */ + LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE); + + if (proc->backendId != vxid.backendId + || proc->fpLocalTransactionId != vxid.localTransactionId) + { + /* VXID ended */ + LWLockRelease(&proc->fpInfoLock); + return XactLockForVirtualXact(vxid, InvalidTransactionId, wait); + } + + /* + * If we aren't asked to wait, there's no need to set up a lock table + * entry. The transaction is still in progress, so just return false. + */ + if (!wait) + { + LWLockRelease(&proc->fpInfoLock); + return false; + } + + /* + * OK, we're going to need to sleep on the VXID. But first, we must set + * up the primary lock table entry, if needed (ie, convert the proc's + * fast-path lock on its VXID to a regular lock). + */ + if (proc->fpVXIDLock) + { + PROCLOCK *proclock; + uint32 hashcode; + LWLock *partitionLock; + + hashcode = LockTagHashCode(&tag); + + partitionLock = LockHashPartitionLock(hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc, + &tag, hashcode, ExclusiveLock); + if (!proclock) + { + LWLockRelease(partitionLock); + LWLockRelease(&proc->fpInfoLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_locks_per_transaction"))); + } + GrantLock(proclock->tag.myLock, proclock, ExclusiveLock); + + LWLockRelease(partitionLock); + + proc->fpVXIDLock = false; + } + + /* + * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID() + * search. The proc might have assigned this XID but not yet locked it, + * in which case the proc will lock this XID before releasing the VXID. + * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(), + * so we won't save an XID of a different VXID. It doesn't matter whether + * we save this before or after setting up the primary lock table entry. + */ + xid = proc->xid; + + /* Done with proc->fpLockBits */ + LWLockRelease(&proc->fpInfoLock); + + /* Time to wait. */ + (void) LockAcquire(&tag, ShareLock, false, false); + + LockRelease(&tag, ShareLock, false); + return XactLockForVirtualXact(vxid, xid, wait); +} + +/* + * LockWaiterCount + * + * Find the number of lock requester on this locktag + */ +int +LockWaiterCount(const LOCKTAG *locktag) +{ + LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid; + LOCK *lock; + bool found; + uint32 hashcode; + LWLock *partitionLock; + int waiters = 0; + + if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods)) + elog(ERROR, "unrecognized lock method: %d", lockmethodid); + + hashcode = LockTagHashCode(locktag); + partitionLock = LockHashPartitionLock(hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash, + locktag, + hashcode, + HASH_FIND, + &found); + if (found) + { + Assert(lock != NULL); + waiters = lock->nRequested; + } + LWLockRelease(partitionLock); + + return waiters; +} diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c new file mode 100644 index 0000000..01d738f --- /dev/null +++ b/src/backend/storage/lmgr/lwlock.c @@ -0,0 +1,1973 @@ +/*------------------------------------------------------------------------- + * + * lwlock.c + * Lightweight lock manager + * + * Lightweight locks are intended primarily to provide mutual exclusion of + * access to shared-memory data structures. Therefore, they offer both + * exclusive and shared lock modes (to support read/write and read-only + * access to a shared object). 
There are few other frammishes. User-level + * locking should be done with the full lock manager --- which depends on + * LWLocks to protect its shared state. + * + * In addition to exclusive and shared modes, lightweight locks can be used to + * wait until a variable changes value. The variable is initially not set + * when the lock is acquired with LWLockAcquire, i.e. it remains set to the + * value it was set to when the lock was released last, and can be updated + * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar + * waits for the variable to be updated, or until the lock is free. When + * releasing the lock with LWLockReleaseClearVar() the value can be set to an + * appropriate value for a free lock. The meaning of the variable is up to + * the caller, the lightweight lock code just assigns and compares it. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/lmgr/lwlock.c + * + * NOTES: + * + * This used to be a pretty straight forward reader-writer lock + * implementation, in which the internal state was protected by a + * spinlock. Unfortunately the overhead of taking the spinlock proved to be + * too high for workloads/locks that were taken in shared mode very + * frequently. Often we were spinning in the (obviously exclusive) spinlock, + * while trying to acquire a shared lock that was actually free. + * + * Thus a new implementation was devised that provides wait-free shared lock + * acquisition for locks that aren't exclusively locked. + * + * The basic idea is to have a single atomic variable 'lockcount' instead of + * the formerly separate shared and exclusive counters and to use atomic + * operations to acquire the lock. That's fairly easy to do for plain + * rw-spinlocks, but a lot harder for something like LWLocks that want to wait + * in the OS. + * + * For lock acquisition we use an atomic compare-and-exchange on the lockcount + * variable. For exclusive lock we swap in a sentinel value + * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders. + * + * To release the lock we use an atomic decrement to release the lock. If the + * new value is zero (we get that atomically), we know we can/have to release + * waiters. + * + * Obviously it is important that the sentinel value for exclusive locks + * doesn't conflict with the maximum number of possible share lockers - + * luckily MAX_BACKENDS makes that easily possible. + * + * + * The attentive reader might have noticed that naively doing the above has a + * glaring race condition: We try to lock using the atomic operations and + * notice that we have to wait. Unfortunately by the time we have finished + * queuing, the former locker very well might have already finished it's + * work. That's problematic because we're now stuck waiting inside the OS. + + * To mitigate those races we use a two phased attempt at locking: + * Phase 1: Try to do it atomically, if we succeed, nice + * Phase 2: Add ourselves to the waitqueue of the lock + * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from + * the queue + * Phase 4: Sleep till wake-up, goto Phase 1 + * + * This protects us against the problem from above as nobody can release too + * quick, before we're queued, since after Phase 2 we're already queued. 
+ * ------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "port/pg_bitutils.h" +#include "postmaster/postmaster.h" +#include "replication/slot.h" +#include "storage/ipc.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +#ifdef LWLOCK_STATS +#include "utils/hsearch.h" +#endif + + +/* We use the ShmemLock spinlock to protect LWLockCounter */ +extern slock_t *ShmemLock; + +#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30) +#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29) +#define LW_FLAG_LOCKED ((uint32) 1 << 28) + +#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24) +#define LW_VAL_SHARED 1 + +#define LW_LOCK_MASK ((uint32) ((1 << 25)-1)) +/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */ +#define LW_SHARED_MASK ((uint32) ((1 << 24)-1)) + +StaticAssertDecl(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS, + "MAX_BACKENDS too big for lwlock.c"); + +/* + * There are three sorts of LWLock "tranches": + * + * 1. The individually-named locks defined in lwlocknames.h each have their + * own tranche. The names of these tranches appear in IndividualLWLockNames[] + * in lwlocknames.c. + * + * 2. There are some predefined tranches for built-in groups of locks. + * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names + * appear in BuiltinTrancheNames[] below. + * + * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche + * or LWLockRegisterTranche. The names of these that are known in the current + * process appear in LWLockTrancheNames[]. + * + * All these names are user-visible as wait event names, so choose with care + * ... and do not forget to update the documentation's list of wait events. 
+ */ +extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */ + +static const char *const BuiltinTrancheNames[] = { + /* LWTRANCHE_XACT_BUFFER: */ + "XactBuffer", + /* LWTRANCHE_COMMITTS_BUFFER: */ + "CommitTsBuffer", + /* LWTRANCHE_SUBTRANS_BUFFER: */ + "SubtransBuffer", + /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */ + "MultiXactOffsetBuffer", + /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */ + "MultiXactMemberBuffer", + /* LWTRANCHE_NOTIFY_BUFFER: */ + "NotifyBuffer", + /* LWTRANCHE_SERIAL_BUFFER: */ + "SerialBuffer", + /* LWTRANCHE_WAL_INSERT: */ + "WALInsert", + /* LWTRANCHE_BUFFER_CONTENT: */ + "BufferContent", + /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */ + "ReplicationOriginState", + /* LWTRANCHE_REPLICATION_SLOT_IO: */ + "ReplicationSlotIO", + /* LWTRANCHE_LOCK_FASTPATH: */ + "LockFastPath", + /* LWTRANCHE_BUFFER_MAPPING: */ + "BufferMapping", + /* LWTRANCHE_LOCK_MANAGER: */ + "LockManager", + /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */ + "PredicateLockManager", + /* LWTRANCHE_PARALLEL_HASH_JOIN: */ + "ParallelHashJoin", + /* LWTRANCHE_PARALLEL_QUERY_DSA: */ + "ParallelQueryDSA", + /* LWTRANCHE_PER_SESSION_DSA: */ + "PerSessionDSA", + /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */ + "PerSessionRecordType", + /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */ + "PerSessionRecordTypmod", + /* LWTRANCHE_SHARED_TUPLESTORE: */ + "SharedTupleStore", + /* LWTRANCHE_SHARED_TIDBITMAP: */ + "SharedTidBitmap", + /* LWTRANCHE_PARALLEL_APPEND: */ + "ParallelAppend", + /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ + "PerXactPredicateList", + /* LWTRANCHE_PGSTATS_DSA: */ + "PgStatsDSA", + /* LWTRANCHE_PGSTATS_HASH: */ + "PgStatsHash", + /* LWTRANCHE_PGSTATS_DATA: */ + "PgStatsData", + /* LWTRANCHE_LAUNCHER_DSA: */ + "LogicalRepLauncherDSA", + /* LWTRANCHE_LAUNCHER_HASH: */ + "LogicalRepLauncherHash", +}; + +StaticAssertDecl(lengthof(BuiltinTrancheNames) == + LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS, + "missing entries in BuiltinTrancheNames[]"); + +/* + * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and + * stores the names of all dynamically-created tranches known to the current + * process. Any unused entries in the array will contain NULL. + */ +static const char **LWLockTrancheNames = NULL; +static int LWLockTrancheNamesAllocated = 0; + +/* + * This points to the main array of LWLocks in shared memory. Backends inherit + * the pointer by fork from the postmaster (except in the EXEC_BACKEND case, + * where we have special measures to pass it down). + */ +LWLockPadded *MainLWLockArray = NULL; + +/* + * We use this structure to keep track of locked LWLocks for release + * during error recovery. Normally, only a few will be held at once, but + * occasionally the number can be much higher; for example, the pg_buffercache + * extension locks all buffer partitions simultaneously. 
+ */ +#define MAX_SIMUL_LWLOCKS 200 + +/* struct representing the LWLocks we're holding */ +typedef struct LWLockHandle +{ + LWLock *lock; + LWLockMode mode; +} LWLockHandle; + +static int num_held_lwlocks = 0; +static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS]; + +/* struct representing the LWLock tranche request for named tranche */ +typedef struct NamedLWLockTrancheRequest +{ + char tranche_name[NAMEDATALEN]; + int num_lwlocks; +} NamedLWLockTrancheRequest; + +static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; +static int NamedLWLockTrancheRequestsAllocated = 0; + +/* + * NamedLWLockTrancheRequests is both the valid length of the request array, + * and the length of the shared-memory NamedLWLockTrancheArray later on. + * This variable and NamedLWLockTrancheArray are non-static so that + * postmaster.c can copy them to child processes in EXEC_BACKEND builds. + */ +int NamedLWLockTrancheRequests = 0; + +/* points to data in shared memory: */ +NamedLWLockTranche *NamedLWLockTrancheArray = NULL; + +static void InitializeLWLocks(void); +static inline void LWLockReportWaitStart(LWLock *lock); +static inline void LWLockReportWaitEnd(void); +static const char *GetLWTrancheName(uint16 trancheId); + +#define T_NAME(lock) \ + GetLWTrancheName((lock)->tranche) + +#ifdef LWLOCK_STATS +typedef struct lwlock_stats_key +{ + int tranche; + void *instance; +} lwlock_stats_key; + +typedef struct lwlock_stats +{ + lwlock_stats_key key; + int sh_acquire_count; + int ex_acquire_count; + int block_count; + int dequeue_self_count; + int spin_delay_count; +} lwlock_stats; + +static HTAB *lwlock_stats_htab; +static lwlock_stats lwlock_stats_dummy; +#endif + +#ifdef LOCK_DEBUG +bool Trace_lwlocks = false; + +inline static void +PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + uint32 state = pg_atomic_read_u32(&lock->state); + + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d", + MyProcPid, + where, T_NAME(lock), lock, + (state & LW_VAL_EXCLUSIVE) != 0, + state & LW_SHARED_MASK, + (state & LW_FLAG_HAS_WAITERS) != 0, + pg_atomic_read_u32(&lock->nwaiters), + (state & LW_FLAG_RELEASE_OK) != 0))); + } +} + +inline static void +LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%s(%s %p): %s", where, + T_NAME(lock), lock, msg))); + } +} + +#else /* not LOCK_DEBUG */ +#define PRINT_LWDEBUG(a,b,c) ((void)0) +#define LOG_LWDEBUG(a,b,c) ((void)0) +#endif /* LOCK_DEBUG */ + +#ifdef LWLOCK_STATS + +static void init_lwlock_stats(void); +static void print_lwlock_stats(int code, Datum arg); +static lwlock_stats * get_lwlock_stats_entry(LWLock *lock); + +static void +init_lwlock_stats(void) +{ + HASHCTL ctl; + static MemoryContext lwlock_stats_cxt = NULL; + static bool exit_registered = false; + + if (lwlock_stats_cxt != NULL) + MemoryContextDelete(lwlock_stats_cxt); + + /* + * The LWLock stats will be updated within a critical section, which + * requires allocating new hash entries. 
Allocations within a critical + * section are normally not allowed because running out of memory would + * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally + * turned on in production, so that's an acceptable risk. The hash entries + * are small, so the risk of running out of memory is minimal in practice. + */ + lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, + "LWLock stats", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); + + ctl.keysize = sizeof(lwlock_stats_key); + ctl.entrysize = sizeof(lwlock_stats); + ctl.hcxt = lwlock_stats_cxt; + lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + if (!exit_registered) + { + on_shmem_exit(print_lwlock_stats, 0); + exit_registered = true; + } +} + +static void +print_lwlock_stats(int code, Datum arg) +{ + HASH_SEQ_STATUS scan; + lwlock_stats *lwstats; + + hash_seq_init(&scan, lwlock_stats_htab); + + /* Grab an LWLock to keep different backends from mixing reports */ + LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE); + + while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL) + { + fprintf(stderr, + "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n", + MyProcPid, GetLWTrancheName(lwstats->key.tranche), + lwstats->key.instance, lwstats->sh_acquire_count, + lwstats->ex_acquire_count, lwstats->block_count, + lwstats->spin_delay_count, lwstats->dequeue_self_count); + } + + LWLockRelease(&MainLWLockArray[0].lock); +} + +static lwlock_stats * +get_lwlock_stats_entry(LWLock *lock) +{ + lwlock_stats_key key; + lwlock_stats *lwstats; + bool found; + + /* + * During shared memory initialization, the hash table doesn't exist yet. + * Stats of that phase aren't very interesting, so just collect operations + * on all locks in a single dummy entry. + */ + if (lwlock_stats_htab == NULL) + return &lwlock_stats_dummy; + + /* Fetch or create the entry. */ + MemSet(&key, 0, sizeof(key)); + key.tranche = lock->tranche; + key.instance = lock; + lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found); + if (!found) + { + lwstats->sh_acquire_count = 0; + lwstats->ex_acquire_count = 0; + lwstats->block_count = 0; + lwstats->dequeue_self_count = 0; + lwstats->spin_delay_count = 0; + } + return lwstats; +} +#endif /* LWLOCK_STATS */ + + +/* + * Compute number of LWLocks required by named tranches. These will be + * allocated in the main array. + */ +static int +NumLWLocksForNamedTranches(void) +{ + int numLocks = 0; + int i; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks; + + return numLocks; +} + +/* + * Compute shmem space needed for LWLocks and named tranches. + */ +Size +LWLockShmemSize(void) +{ + Size size; + int i; + int numLocks = NUM_FIXED_LWLOCKS; + + /* Calculate total number of locks needed in the main array. */ + numLocks += NumLWLocksForNamedTranches(); + + /* Space for the LWLock array. */ + size = mul_size(numLocks, sizeof(LWLockPadded)); + + /* Space for dynamic allocation counter, plus room for alignment. */ + size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); + + /* space for named tranches. */ + size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); + + /* space for name of each tranche. 
*/ + for (i = 0; i < NamedLWLockTrancheRequests; i++) + size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); + + return size; +} + +/* + * Allocate shmem space for the main LWLock array and all tranches and + * initialize it. We also register extension LWLock tranches here. + */ +void +CreateLWLocks(void) +{ + if (!IsUnderPostmaster) + { + Size spaceLocks = LWLockShmemSize(); + int *LWLockCounter; + char *ptr; + + /* Allocate space */ + ptr = (char *) ShmemAlloc(spaceLocks); + + /* Leave room for dynamic allocation of tranches */ + ptr += sizeof(int); + + /* Ensure desired alignment of LWLock array */ + ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; + + MainLWLockArray = (LWLockPadded *) ptr; + + /* + * Initialize the dynamic-allocation counter for tranches, which is + * stored just before the first LWLock. + */ + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + + /* Initialize all LWLocks */ + InitializeLWLocks(); + } + + /* Register named extension LWLock tranches in the current process. */ + for (int i = 0; i < NamedLWLockTrancheRequests; i++) + LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, + NamedLWLockTrancheArray[i].trancheName); +} + +/* + * Initialize LWLocks that are fixed and those belonging to named tranches. + */ +static void +InitializeLWLocks(void) +{ + int numNamedLocks = NumLWLocksForNamedTranches(); + int id; + int i; + int j; + LWLockPadded *lock; + + /* Initialize all individual LWLocks in main array */ + for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++) + LWLockInitialize(&lock->lock, id); + + /* Initialize buffer mapping LWLocks in main array */ + lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET; + for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING); + + /* Initialize lmgrs' LWLocks in main array */ + lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER); + + /* Initialize predicate lmgrs' LWLocks in main array */ + lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + + /* + * Copy the info about any named tranches into shared memory (so that + * other processes can see it), and initialize the requested LWLocks. 
+ */ + if (NamedLWLockTrancheRequests > 0) + { + char *trancheNames; + + NamedLWLockTrancheArray = (NamedLWLockTranche *) + &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; + + trancheNames = (char *) NamedLWLockTrancheArray + + (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); + lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + NamedLWLockTrancheRequest *request; + NamedLWLockTranche *tranche; + char *name; + + request = &NamedLWLockTrancheRequestArray[i]; + tranche = &NamedLWLockTrancheArray[i]; + + name = trancheNames; + trancheNames += strlen(request->tranche_name) + 1; + strcpy(name, request->tranche_name); + tranche->trancheId = LWLockNewTrancheId(); + tranche->trancheName = name; + + for (j = 0; j < request->num_lwlocks; j++, lock++) + LWLockInitialize(&lock->lock, tranche->trancheId); + } + } +} + +/* + * InitLWLockAccess - initialize backend-local state needed to hold LWLocks + */ +void +InitLWLockAccess(void) +{ +#ifdef LWLOCK_STATS + init_lwlock_stats(); +#endif +} + +/* + * GetNamedLWLockTranche - returns the base address of LWLock from the + * specified tranche. + * + * Caller needs to retrieve the requested number of LWLocks starting from + * the base lock address returned by this API. This can be used for + * tranches that are requested by using RequestNamedLWLockTranche() API. + */ +LWLockPadded * +GetNamedLWLockTranche(const char *tranche_name) +{ + int lock_pos; + int i; + + /* + * Obtain the position of base address of LWLock belonging to requested + * tranche_name in MainLWLockArray. LWLocks for named tranches are placed + * in MainLWLockArray after fixed locks. + */ + lock_pos = NUM_FIXED_LWLOCKS; + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name, + tranche_name) == 0) + return &MainLWLockArray[lock_pos]; + + lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks; + } + + elog(ERROR, "requested tranche is not registered"); + + /* just to keep compiler quiet */ + return NULL; +} + +/* + * Allocate a new tranche ID. + */ +int +LWLockNewTrancheId(void) +{ + int result; + int *LWLockCounter; + + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + SpinLockAcquire(ShmemLock); + result = (*LWLockCounter)++; + SpinLockRelease(ShmemLock); + + return result; +} + +/* + * Register a dynamic tranche name in the lookup table of the current process. + * + * This routine will save a pointer to the tranche name passed as an argument, + * so the name should be allocated in a backend-lifetime context + * (shared memory, TopMemoryContext, static constant, or similar). + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +LWLockRegisterTranche(int tranche_id, const char *tranche_name) +{ + /* This should only be called for user-defined tranches. */ + if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) + return; + + /* Convert to array index. */ + tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; + + /* If necessary, create or enlarge array. 
*/ + if (tranche_id >= LWLockTrancheNamesAllocated) + { + int newalloc; + + newalloc = pg_nextpower2_32(Max(8, tranche_id + 1)); + + if (LWLockTrancheNames == NULL) + LWLockTrancheNames = (const char **) + MemoryContextAllocZero(TopMemoryContext, + newalloc * sizeof(char *)); + else + LWLockTrancheNames = + repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, newalloc); + LWLockTrancheNamesAllocated = newalloc; + } + + LWLockTrancheNames[tranche_id] = tranche_name; +} + +/* + * RequestNamedLWLockTranche + * Request that extra LWLocks be allocated during postmaster + * startup. + * + * This may only be called via the shmem_request_hook of a library that is + * loaded into the postmaster via shared_preload_libraries. Calls from + * elsewhere will fail. + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) +{ + NamedLWLockTrancheRequest *request; + + if (!process_shmem_requests_in_progress) + elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook"); + + if (NamedLWLockTrancheRequestArray == NULL) + { + NamedLWLockTrancheRequestsAllocated = 16; + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + MemoryContextAlloc(TopMemoryContext, + NamedLWLockTrancheRequestsAllocated + * sizeof(NamedLWLockTrancheRequest)); + } + + if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) + { + int i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1); + + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + repalloc(NamedLWLockTrancheRequestArray, + i * sizeof(NamedLWLockTrancheRequest)); + NamedLWLockTrancheRequestsAllocated = i; + } + + request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; + Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); + strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); + request->num_lwlocks = num_lwlocks; + NamedLWLockTrancheRequests++; +} + +/* + * LWLockInitialize - initialize a new lwlock; it's initially unlocked + */ +void +LWLockInitialize(LWLock *lock, int tranche_id) +{ + pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); +#ifdef LOCK_DEBUG + pg_atomic_init_u32(&lock->nwaiters, 0); +#endif + lock->tranche = tranche_id; + proclist_init(&lock->waiters); +} + +/* + * Report start of wait event for light-weight locks. + * + * This function will be used by all the light-weight lock calls which + * needs to wait to acquire the lock. This function distinguishes wait + * event based on tranche and lock id. + */ +static inline void +LWLockReportWaitStart(LWLock *lock) +{ + pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche); +} + +/* + * Report end of wait event for light-weight locks. + */ +static inline void +LWLockReportWaitEnd(void) +{ + pgstat_report_wait_end(); +} + +/* + * Return the name of an LWLock tranche. + */ +static const char * +GetLWTrancheName(uint16 trancheId) +{ + /* Individual LWLock? */ + if (trancheId < NUM_INDIVIDUAL_LWLOCKS) + return IndividualLWLockNames[trancheId]; + + /* Built-in tranche? */ + if (trancheId < LWTRANCHE_FIRST_USER_DEFINED) + return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS]; + + /* + * It's an extension tranche, so look in LWLockTrancheNames[]. However, + * it's possible that the tranche has never been registered in the current + * process, in which case give up and return "extension". 
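+ * (This is why the wait_event reported in pg_stat_activity for an extension
+ * LWLock can read just "extension": the observing backend may never have
+ * loaded the library that registered the tranche name.)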
+ */ + trancheId -= LWTRANCHE_FIRST_USER_DEFINED; + + if (trancheId >= LWLockTrancheNamesAllocated || + LWLockTrancheNames[trancheId] == NULL) + return "extension"; + + return LWLockTrancheNames[trancheId]; +} + +/* + * Return an identifier for an LWLock based on the wait class and event. + */ +const char * +GetLWLockIdentifier(uint32 classId, uint16 eventId) +{ + Assert(classId == PG_WAIT_LWLOCK); + /* The event IDs are just tranche numbers. */ + return GetLWTrancheName(eventId); +} + +/* + * Internal function that tries to atomically acquire the lwlock in the passed + * in mode. + * + * This function will not block waiting for a lock to become free - that's the + * callers job. + * + * Returns true if the lock isn't free and we need to wait. + */ +static bool +LWLockAttemptLock(LWLock *lock, LWLockMode mode) +{ + uint32 old_state; + + Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED); + + /* + * Read once outside the loop, later iterations will get the newer value + * via compare & exchange. + */ + old_state = pg_atomic_read_u32(&lock->state); + + /* loop until we've determined whether we could acquire the lock or not */ + while (true) + { + uint32 desired_state; + bool lock_free; + + desired_state = old_state; + + if (mode == LW_EXCLUSIVE) + { + lock_free = (old_state & LW_LOCK_MASK) == 0; + if (lock_free) + desired_state += LW_VAL_EXCLUSIVE; + } + else + { + lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0; + if (lock_free) + desired_state += LW_VAL_SHARED; + } + + /* + * Attempt to swap in the state we are expecting. If we didn't see + * lock to be free, that's just the old value. If we saw it as free, + * we'll attempt to mark it acquired. The reason that we always swap + * in the value is that this doubles as a memory barrier. We could try + * to be smarter and only swap in values if we saw the lock as free, + * but benchmark haven't shown it as beneficial so far. + * + * Retry if the value changed since we last looked at it. + */ + if (pg_atomic_compare_exchange_u32(&lock->state, + &old_state, desired_state)) + { + if (lock_free) + { + /* Great! Got the lock. */ +#ifdef LOCK_DEBUG + if (mode == LW_EXCLUSIVE) + lock->owner = MyProc; +#endif + return false; + } + else + return true; /* somebody else has the lock */ + } + } + pg_unreachable(); +} + +/* + * Lock the LWLock's wait list against concurrent activity. + * + * NB: even though the wait list is locked, non-conflicting lock operations + * may still happen concurrently. + * + * Time spent holding mutex should be short! + */ +static void +LWLockWaitListLock(LWLock *lock) +{ + uint32 old_state; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + uint32 delays = 0; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + while (true) + { + /* always try once to acquire lock directly */ + old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); + if (!(old_state & LW_FLAG_LOCKED)) + break; /* got lock */ + + /* and then spin without atomic operations until lock is released */ + { + SpinDelayStatus delayStatus; + + init_local_spin_delay(&delayStatus); + + while (old_state & LW_FLAG_LOCKED) + { + perform_spin_delay(&delayStatus); + old_state = pg_atomic_read_u32(&lock->state); + } +#ifdef LWLOCK_STATS + delays += delayStatus.delays; +#endif + finish_spin_delay(&delayStatus); + } + + /* + * Retry. The lock might obviously already be re-acquired by the time + * we're attempting to get it again. + */ + } + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += delays; +#endif +} + +/* + * Unlock the LWLock's wait list. 
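+ * This simply clears the LW_FLAG_LOCKED bit that LWLockWaitListLock() set.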
+ * + * Note that it can be more efficient to manipulate flags and release the + * locks in a single atomic operation. + */ +static void +LWLockWaitListUnlock(LWLock *lock) +{ + uint32 old_state PG_USED_FOR_ASSERTS_ONLY; + + old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED); + + Assert(old_state & LW_FLAG_LOCKED); +} + +/* + * Wakeup all the lockers that currently have a chance to acquire the lock. + */ +static void +LWLockWakeup(LWLock *lock) +{ + bool new_release_ok; + bool wokeup_somebody = false; + proclist_head wakeup; + proclist_mutable_iter iter; + + proclist_init(&wakeup); + + new_release_ok = true; + + /* lock wait list while collecting backends to wake up */ + LWLockWaitListLock(lock); + + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) + continue; + + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + proclist_push_tail(&wakeup, iter.cur, lwWaitLink); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + { + /* + * Prevent additional wakeups until retryer gets to run. Backends + * that are just waiting for the lock to become free don't retry + * automatically. + */ + new_release_ok = false; + + /* + * Don't wakeup (further) exclusive locks. + */ + wokeup_somebody = true; + } + + /* + * Signal that the process isn't on the wait list anymore. This allows + * LWLockDequeueSelf() to remove itself of the waitlist with a + * proclist_delete(), rather than having to check if it has been + * removed from the list. + */ + Assert(waiter->lwWaiting == LW_WS_WAITING); + waiter->lwWaiting = LW_WS_PENDING_WAKEUP; + + /* + * Once we've woken up an exclusive lock, there's no point in waking + * up anybody else. + */ + if (waiter->lwWaitMode == LW_EXCLUSIVE) + break; + } + + Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS); + + /* unset required flags, and release lock, in one fell swoop */ + { + uint32 old_state; + uint32 desired_state; + + old_state = pg_atomic_read_u32(&lock->state); + while (true) + { + desired_state = old_state; + + /* compute desired flags */ + + if (new_release_ok) + desired_state |= LW_FLAG_RELEASE_OK; + else + desired_state &= ~LW_FLAG_RELEASE_OK; + + if (proclist_is_empty(&wakeup)) + desired_state &= ~LW_FLAG_HAS_WAITERS; + + desired_state &= ~LW_FLAG_LOCKED; /* release lock */ + + if (pg_atomic_compare_exchange_u32(&lock->state, &old_state, + desired_state)) + break; + } + } + + /* Awaken any waiters I removed from the queue. */ + proclist_foreach_modify(iter, &wakeup, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + LOG_LWDEBUG("LWLockRelease", lock, "release waiter"); + proclist_delete(&wakeup, iter.cur, lwWaitLink); + + /* + * Guarantee that lwWaiting being unset only becomes visible once the + * unlink from the link has completed. Otherwise the target backend + * could be woken up for other reason and enqueue for a new lock - if + * that happens before the list unlink happens, the list would end up + * being corrupted. + * + * The barrier pairs with the LWLockWaitListLock() when enqueuing for + * another lock. + */ + pg_write_barrier(); + waiter->lwWaiting = LW_WS_NOT_WAITING; + PGSemaphoreUnlock(waiter->sem); + } +} + +/* + * Add ourselves to the end of the queue. + * + * NB: Mode can be LW_WAIT_UNTIL_FREE here! + */ +static void +LWLockQueueSelf(LWLock *lock, LWLockMode mode) +{ + /* + * If we don't have a PGPROC structure, there's no way to wait. 
This + * should never occur, since MyProc should only be null during shared + * memory initialization. + */ + if (MyProc == NULL) + elog(PANIC, "cannot wait without a PGPROC structure"); + + if (MyProc->lwWaiting != LW_WS_NOT_WAITING) + elog(PANIC, "queueing for lock while waiting on another one"); + + LWLockWaitListLock(lock); + + /* setting the flag is protected by the spinlock */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS); + + MyProc->lwWaiting = LW_WS_WAITING; + MyProc->lwWaitMode = mode; + + /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */ + if (mode == LW_WAIT_UNTIL_FREE) + proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink); + else + proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink); + + /* Can release the mutex now */ + LWLockWaitListUnlock(lock); + +#ifdef LOCK_DEBUG + pg_atomic_fetch_add_u32(&lock->nwaiters, 1); +#endif +} + +/* + * Remove ourselves from the waitlist. + * + * This is used if we queued ourselves because we thought we needed to sleep + * but, after further checking, we discovered that we don't actually need to + * do so. + */ +static void +LWLockDequeueSelf(LWLock *lock) +{ + bool on_waitlist; + +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); + + lwstats->dequeue_self_count++; +#endif + + LWLockWaitListLock(lock); + + /* + * Remove ourselves from the waitlist, unless we've already been removed. + * The removal happens with the wait list lock held, so there's no race in + * this check. + */ + on_waitlist = MyProc->lwWaiting == LW_WS_WAITING; + if (on_waitlist) + proclist_delete(&lock->waiters, MyProc->pgprocno, lwWaitLink); + + if (proclist_is_empty(&lock->waiters) && + (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0) + { + pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS); + } + + /* XXX: combine with fetch_and above? */ + LWLockWaitListUnlock(lock); + + /* clear waiting state again, nice for debugging */ + if (on_waitlist) + MyProc->lwWaiting = LW_WS_NOT_WAITING; + else + { + int extraWaits = 0; + + /* + * Somebody else dequeued us and has or will wake us up. Deal with the + * superfluous absorption of a wakeup. + */ + + /* + * Reset RELEASE_OK flag if somebody woke us before we removed + * ourselves - they'll have set it to false. + */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + + /* + * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would + * get reset at some inconvenient point later. Most of the time this + * will immediately return. + */ + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (MyProc->lwWaiting == LW_WS_NOT_WAITING) + break; + extraWaits++; + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif +} + +/* + * LWLockAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, sleep until it is. Returns true if the lock + * was available immediately, false if we had to sleep. + * + * Side effect: cancel/die interrupts are held off until lock release. 
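+ *
+ * Typical call pattern (illustrative sketch only; "MyExampleLock" and the
+ * shared structure are hypothetical):
+ *
+ *		LWLockAcquire(MyExampleLock, LW_EXCLUSIVE);
+ *		shared_struct->counter++;
+ *		LWLockRelease(MyExampleLock);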
+ */ +bool +LWLockAcquire(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool result = true; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquire", lock, mode); + +#ifdef LWLOCK_STATS + /* Count lock acquisition attempts */ + if (mode == LW_EXCLUSIVE) + lwstats->ex_acquire_count++; + else + lwstats->sh_acquire_count++; +#endif /* LWLOCK_STATS */ + + /* + * We can't wait if we haven't got a PGPROC. This should only occur + * during bootstrap or shared memory initialization. Put an Assert here + * to catch unsafe coding practices. + */ + Assert(!(proc == NULL && IsUnderPostmaster)); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * Loop here to try to acquire lock after each time we are signaled by + * LWLockRelease. + * + * NOTE: it might seem better to have LWLockRelease actually grant us the + * lock, rather than retrying and possibly having to go back to sleep. But + * in practice that is no good because it means a process swap for every + * lock acquisition when two or more processes are contending for the same + * lock. Since LWLocks are normally used to protect not-very-long + * sections of computation, a process needs to be able to acquire and + * release the same lock many times during a single CPU time slice, even + * in the presence of contention. The efficiency of being able to do that + * outweighs the inefficiency of sometimes wasting a process dispatch + * cycle because the lock is not free when a released waiter finally gets + * to run. See pgsql-hackers archives for 29-Dec-01. + */ + for (;;) + { + bool mustwait; + + /* + * Try to grab the lock the first time, we're not in the waitqueue + * yet/anymore. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock"); + break; /* got the lock */ + } + + /* + * Ok, at this point we couldn't grab the lock on the first try. We + * cannot simply queue ourselves to the end of the list and wait to be + * woken up because by now the lock could long have been released. + * Instead add us to the queue and try to grab the lock again. If we + * succeed we need to revert the queuing and be happy, otherwise we + * recheck the lock. If we still couldn't grab it, we know that the + * other locker will see our queue entries when releasing since they + * existed before we checked for the lock. + */ + + /* add to the queue */ + LWLockQueueSelf(lock, mode); + + /* we're now guaranteed to be woken up if necessary */ + mustwait = LWLockAttemptLock(lock, mode); + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. 
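+ * (Those extra signals are what extraWaits counts; the semaphore is
+ * re-incremented by the PGSemaphoreUnlock() loop at the bottom of this
+ * function.)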
+ */ + LOG_LWDEBUG("LWLockAcquire", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (proc->lwWaiting == LW_WS_NOT_WAITING) + break; + extraWaits++; + } + + /* Retrying, allow LWLockRelease to release waiters again. */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquire", lock, "awakened"); + + /* Now loop back and try to acquire lock again. */ + result = false; + } + + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode); + + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + return result; +} + +/* + * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, return false with no side-effects. + * + * If successful, cancel/die interrupts are held off until lock release. + */ +bool +LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) +{ + bool mustwait; + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* Check for the lock */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* Failed to get lock, so release interrupt holdoff */ + RESUME_INTERRUPTS(); + + LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed"); + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode); + } + else + { + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode); + } + return !mustwait; +} + +/* + * LWLockAcquireOrWait - Acquire lock, or wait until it's free + * + * The semantics of this function are a bit funky. If the lock is currently + * free, it is acquired in the given mode, and the function returns true. If + * the lock isn't immediately free, the function waits until it is released + * and returns false, but does not acquire the lock. + * + * This is currently used for WALWriteLock: when a backend flushes the WAL, + * holding WALWriteLock, it can flush the commit records of many other + * backends as a side-effect. Those other backends need to wait until the + * flush finishes, but don't need to acquire the lock anymore. 
They can just + * wake up, observe that their records have already been flushed, and return. + */ +bool +LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool mustwait; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* + * Wait until awakened. Like in LWLockAcquire, be prepared for + * bogus wakeups. + */ + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (proc->lwWaiting == LW_WS_NOT_WAITING) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened"); + } + else + { + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue"); + + /* + * Got lock in the second attempt, undo queueing. We need to treat + * this as having successfully acquired the lock, otherwise we'd + * not necessarily wake up people we've prevented from acquiring + * the lock. + */ + LWLockDequeueSelf(lock); + } + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + if (mustwait) + { + /* Failed to get lock, so release interrupt holdoff */ + RESUME_INTERRUPTS(); + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed"); + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode); + } + else + { + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded"); + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode); + } + + return !mustwait; +} + +/* + * Does the lwlock in its current state need to wait for the variable value to + * change? + * + * If we don't need to wait, and it's because the value of the variable has + * changed, store the current value in newval. + * + * *result is set to true if the lock was free, and false otherwise. 
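+ *
+ * The function's return value is true if the caller still has to wait, and
+ * false if the lock is free or the variable no longer matches oldval.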
+ */ +static bool +LWLockConflictsWithVar(LWLock *lock, + uint64 *valptr, uint64 oldval, uint64 *newval, + bool *result) +{ + bool mustwait; + uint64 value; + + /* + * Test first to see if it the slot is free right now. + * + * XXX: the caller uses a spinlock before this, so we don't need a memory + * barrier here as far as the current usage is concerned. But that might + * not be safe in general. + */ + mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0; + + if (!mustwait) + { + *result = true; + return false; + } + + *result = false; + + /* + * Read value using the lwlock's wait list lock, as we can't generally + * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to + * do atomic 64 bit reads/writes the spinlock should be optimized away. + */ + LWLockWaitListLock(lock); + value = *valptr; + LWLockWaitListUnlock(lock); + + if (value != oldval) + { + mustwait = false; + *newval = value; + } + else + { + mustwait = true; + } + + return mustwait; +} + +/* + * LWLockWaitForVar - Wait until lock is free, or a variable is updated. + * + * If the lock is held and *valptr equals oldval, waits until the lock is + * either freed, or the lock holder updates *valptr by calling + * LWLockUpdateVar. If the lock is free on exit (immediately or after + * waiting), returns true. If the lock is still held, but *valptr no longer + * matches oldval, returns false and sets *newval to the current value in + * *valptr. + * + * Note: this function ignores shared lock holders; if the lock is held + * in shared mode, returns 'true'. + */ +bool +LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval) +{ + PGPROC *proc = MyProc; + int extraWaits = 0; + bool result = false; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE); + + /* + * Lock out cancel/die interrupts while we sleep on the lock. There is no + * cleanup mechanism to remove us from the wait queue if we got + * interrupted. + */ + HOLD_INTERRUPTS(); + + /* + * Loop here to check the lock's status after each time we are signaled. + */ + for (;;) + { + bool mustwait; + + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + if (!mustwait) + break; /* the lock was free or value didn't match */ + + /* + * Add myself to wait queue. Note that this is racy, somebody else + * could wakeup before we're finished queuing. NB: We're using nearly + * the same twice-in-a-row lock acquisition protocol as + * LWLockAcquire(). Check its comments for details. The only + * difference is that we also have to check the variable's values when + * checking the state of the lock. + */ + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + /* + * Set RELEASE_OK flag, to make sure we get woken up as soon as the + * lock is released. + */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + + /* + * We're now guaranteed to be woken up if necessary. Recheck the lock + * and variables state. + */ + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + /* Ok, no conflict after we queued ourselves. Undo queueing. */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. 
Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. + */ + LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (proc->lwWaiting == LW_WS_NOT_WAITING) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened"); + + /* Now loop back and check the status of the lock again. */ + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + /* + * Now okay to allow cancel/die interrupts. + */ + RESUME_INTERRUPTS(); + + return result; +} + + +/* + * LWLockUpdateVar - Update a variable and wake up waiters atomically + * + * Sets *valptr to 'val', and wakes up all processes waiting for us with + * LWLockWaitForVar(). Setting the value and waking up the processes happen + * atomically so that any process calling LWLockWaitForVar() on the same lock + * is guaranteed to see the new value, and act accordingly. + * + * The caller must be holding the lock in exclusive mode. + */ +void +LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) +{ + proclist_head wakeup; + proclist_mutable_iter iter; + + PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE); + + proclist_init(&wakeup); + + LWLockWaitListLock(lock); + + Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE); + + /* Update the lock's value */ + *valptr = val; + + /* + * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken + * up. They are always in the front of the queue. + */ + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + break; + + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + proclist_push_tail(&wakeup, iter.cur, lwWaitLink); + + /* see LWLockWakeup() */ + Assert(waiter->lwWaiting == LW_WS_WAITING); + waiter->lwWaiting = LW_WS_PENDING_WAKEUP; + } + + /* We are done updating shared state of the lock itself. */ + LWLockWaitListUnlock(lock); + + /* + * Awaken any waiters I removed from the queue. + */ + proclist_foreach_modify(iter, &wakeup, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + proclist_delete(&wakeup, iter.cur, lwWaitLink); + /* check comment in LWLockWakeup() about this barrier */ + pg_write_barrier(); + waiter->lwWaiting = LW_WS_NOT_WAITING; + PGSemaphoreUnlock(waiter->sem); + } +} + + +/* + * LWLockRelease - release a previously acquired lock + */ +void +LWLockRelease(LWLock *lock) +{ + LWLockMode mode; + uint32 oldstate; + bool check_waiters; + int i; + + /* + * Remove lock from list of locks held. Usually, but not always, it will + * be the latest-acquired lock; so search array backwards. 
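+ * (Releasing out of order is legal, which is why this is a search rather
+ * than a simple pop of the most recently acquired entry.)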
+ */ + for (i = num_held_lwlocks; --i >= 0;) + if (lock == held_lwlocks[i].lock) + break; + + if (i < 0) + elog(ERROR, "lock %s is not held", T_NAME(lock)); + + mode = held_lwlocks[i].mode; + + num_held_lwlocks--; + for (; i < num_held_lwlocks; i++) + held_lwlocks[i] = held_lwlocks[i + 1]; + + PRINT_LWDEBUG("LWLockRelease", lock, mode); + + /* + * Release my hold on lock, after that it can immediately be acquired by + * others, even if we still have to wakeup other waiters. + */ + if (mode == LW_EXCLUSIVE) + oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE); + else + oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED); + + /* nobody else can have that kind of lock */ + Assert(!(oldstate & LW_VAL_EXCLUSIVE)); + + if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock)); + + /* + * We're still waiting for backends to get scheduled, don't wake them up + * again. + */ + if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) == + (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) && + (oldstate & LW_LOCK_MASK) == 0) + check_waiters = true; + else + check_waiters = false; + + /* + * As waking up waiters requires the spinlock to be acquired, only do so + * if necessary. + */ + if (check_waiters) + { + /* XXX: remove before commit? */ + LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters"); + LWLockWakeup(lock); + } + + /* + * Now okay to allow cancel/die interrupts. + */ + RESUME_INTERRUPTS(); +} + +/* + * LWLockReleaseClearVar - release a previously acquired lock, reset variable + */ +void +LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val) +{ + LWLockWaitListLock(lock); + + /* + * Set the variable's value before releasing the lock, that prevents race + * a race condition wherein a new locker acquires the lock, but hasn't yet + * set the variables value. + */ + *valptr = val; + LWLockWaitListUnlock(lock); + + LWLockRelease(lock); +} + + +/* + * LWLockReleaseAll - release all currently-held locks + * + * Used to clean up after ereport(ERROR). An important difference between this + * function and retail LWLockRelease calls is that InterruptHoldoffCount is + * unchanged by this operation. This is necessary since InterruptHoldoffCount + * has been set to an appropriate level earlier in error recovery. We could + * decrement it below zero if we allow it to drop for each released lock! + */ +void +LWLockReleaseAll(void) +{ + while (num_held_lwlocks > 0) + { + HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */ + + LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock); + } +} + + +/* + * LWLockHeldByMe - test whether my process holds a lock in any mode + * + * This is meant as debug support only. + */ +bool +LWLockHeldByMe(LWLock *lock) +{ + int i; + + for (i = 0; i < num_held_lwlocks; i++) + { + if (held_lwlocks[i].lock == lock) + return true; + } + return false; +} + +/* + * LWLockHeldByMe - test whether my process holds any of an array of locks + * + * This is meant as debug support only. 
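+ *
+ * The candidate locks are assumed to lie at the addresses lock,
+ * lock + stride, ..., lock + (nlocks - 1) * stride.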
+ */ +bool +LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride) +{ + char *held_lock_addr; + char *begin; + char *end; + int i; + + begin = (char *) lock; + end = begin + nlocks * stride; + for (i = 0; i < num_held_lwlocks; i++) + { + held_lock_addr = (char *) held_lwlocks[i].lock; + if (held_lock_addr >= begin && + held_lock_addr < end && + (held_lock_addr - begin) % stride == 0) + return true; + } + return false; +} + +/* + * LWLockHeldByMeInMode - test whether my process holds a lock in given mode + * + * This is meant as debug support only. + */ +bool +LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode) +{ + int i; + + for (i = 0; i < num_held_lwlocks; i++) + { + if (held_lwlocks[i].lock == lock && held_lwlocks[i].mode == mode) + return true; + } + return false; +} diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c new file mode 100644 index 0000000..65f7c5b --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.c @@ -0,0 +1,52 @@ +/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */ + +const char *const IndividualLWLockNames[] = { + "", + "ShmemIndex", + "OidGen", + "XidGen", + "ProcArray", + "SInvalRead", + "SInvalWrite", + "WALBufMapping", + "WALWrite", + "ControlFile", + "", + "XactSLRU", + "SubtransSLRU", + "MultiXactGen", + "MultiXactOffsetSLRU", + "MultiXactMemberSLRU", + "RelCacheInit", + "CheckpointerComm", + "TwoPhaseState", + "TablespaceCreate", + "BtreeVacuum", + "AddinShmemInit", + "Autovacuum", + "AutovacuumSchedule", + "SyncScan", + "RelationMapping", + "NotifySLRU", + "NotifyQueue", + "SerializableXactHash", + "SerializableFinishedList", + "SerializablePredicateList", + "SerialSLRU", + "SyncRep", + "BackgroundWorker", + "DynamicSharedMemoryControl", + "AutoFile", + "ReplicationSlotAllocation", + "ReplicationSlotControl", + "CommitTsSLRU", + "CommitTs", + "ReplicationOrigin", + "MultiXactTruncation", + "OldSnapshotTimeMap", + "LogicalRepWorker", + "XactTruncation", + "", + "WrapLimitsVacuum", + "NotifyQueueTail" +}; diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h new file mode 100644 index 0000000..e279f72 --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.h @@ -0,0 +1,50 @@ +/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */ +/* there is deliberately not an #ifndef LWLOCKNAMES_H here */ + +#define ShmemIndexLock (&MainLWLockArray[1].lock) +#define OidGenLock (&MainLWLockArray[2].lock) +#define XidGenLock (&MainLWLockArray[3].lock) +#define ProcArrayLock (&MainLWLockArray[4].lock) +#define SInvalReadLock (&MainLWLockArray[5].lock) +#define SInvalWriteLock (&MainLWLockArray[6].lock) +#define WALBufMappingLock (&MainLWLockArray[7].lock) +#define WALWriteLock (&MainLWLockArray[8].lock) +#define ControlFileLock (&MainLWLockArray[9].lock) +#define XactSLRULock (&MainLWLockArray[11].lock) +#define SubtransSLRULock (&MainLWLockArray[12].lock) +#define MultiXactGenLock (&MainLWLockArray[13].lock) +#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock) +#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock) +#define RelCacheInitLock (&MainLWLockArray[16].lock) +#define CheckpointerCommLock (&MainLWLockArray[17].lock) +#define TwoPhaseStateLock (&MainLWLockArray[18].lock) +#define TablespaceCreateLock (&MainLWLockArray[19].lock) +#define BtreeVacuumLock (&MainLWLockArray[20].lock) +#define AddinShmemInitLock (&MainLWLockArray[21].lock) +#define AutovacuumLock (&MainLWLockArray[22].lock) +#define AutovacuumScheduleLock 
(&MainLWLockArray[23].lock) +#define SyncScanLock (&MainLWLockArray[24].lock) +#define RelationMappingLock (&MainLWLockArray[25].lock) +#define NotifySLRULock (&MainLWLockArray[26].lock) +#define NotifyQueueLock (&MainLWLockArray[27].lock) +#define SerializableXactHashLock (&MainLWLockArray[28].lock) +#define SerializableFinishedListLock (&MainLWLockArray[29].lock) +#define SerializablePredicateListLock (&MainLWLockArray[30].lock) +#define SerialSLRULock (&MainLWLockArray[31].lock) +#define SyncRepLock (&MainLWLockArray[32].lock) +#define BackgroundWorkerLock (&MainLWLockArray[33].lock) +#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock) +#define AutoFileLock (&MainLWLockArray[35].lock) +#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock) +#define ReplicationSlotControlLock (&MainLWLockArray[37].lock) +#define CommitTsSLRULock (&MainLWLockArray[38].lock) +#define CommitTsLock (&MainLWLockArray[39].lock) +#define ReplicationOriginLock (&MainLWLockArray[40].lock) +#define MultiXactTruncationLock (&MainLWLockArray[41].lock) +#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock) +#define LogicalRepWorkerLock (&MainLWLockArray[43].lock) +#define XactTruncationLock (&MainLWLockArray[44].lock) +#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock) +#define NotifyQueueTailLock (&MainLWLockArray[47].lock) + +#define NUM_INDIVIDUAL_LWLOCKS 48 diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt new file mode 100644 index 0000000..6c7cf6c --- /dev/null +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -0,0 +1,55 @@ +# Some commonly-used locks have predefined positions within MainLWLockArray; +# these are defined here. If you add a lock, add it to the end to avoid +# renumbering the existing locks; if you remove a lock, consider leaving a gap +# in the numbering sequence for the benefit of DTrace and other external +# debugging scripts. Also, do not forget to update the list of wait events +# in the user documentation. 
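+ #
+ # generate-lwlocknames.pl reads this file and produces both the
+ # IndividualLWLockNames[] array in lwlocknames.c and the #defines in
+ # lwlocknames.h.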
+ +# 0 is available; was formerly BufFreelistLock +ShmemIndexLock 1 +OidGenLock 2 +XidGenLock 3 +ProcArrayLock 4 +SInvalReadLock 5 +SInvalWriteLock 6 +WALBufMappingLock 7 +WALWriteLock 8 +ControlFileLock 9 +# 10 was CheckpointLock +XactSLRULock 11 +SubtransSLRULock 12 +MultiXactGenLock 13 +MultiXactOffsetSLRULock 14 +MultiXactMemberSLRULock 15 +RelCacheInitLock 16 +CheckpointerCommLock 17 +TwoPhaseStateLock 18 +TablespaceCreateLock 19 +BtreeVacuumLock 20 +AddinShmemInitLock 21 +AutovacuumLock 22 +AutovacuumScheduleLock 23 +SyncScanLock 24 +RelationMappingLock 25 +NotifySLRULock 26 +NotifyQueueLock 27 +SerializableXactHashLock 28 +SerializableFinishedListLock 29 +SerializablePredicateListLock 30 +SerialSLRULock 31 +SyncRepLock 32 +BackgroundWorkerLock 33 +DynamicSharedMemoryControlLock 34 +AutoFileLock 35 +ReplicationSlotAllocationLock 36 +ReplicationSlotControlLock 37 +CommitTsSLRULock 38 +CommitTsLock 39 +ReplicationOriginLock 40 +MultiXactTruncationLock 41 +OldSnapshotTimeMapLock 42 +LogicalRepWorkerLock 43 +XactTruncationLock 44 +# 45 was XactTruncationLock until removal of BackendRandomLock +WrapLimitsVacuumLock 46 +NotifyQueueTailLock 47 diff --git a/src/backend/storage/lmgr/meson.build b/src/backend/storage/lmgr/meson.build new file mode 100644 index 0000000..0b2c93d --- /dev/null +++ b/src/backend/storage/lmgr/meson.build @@ -0,0 +1,15 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'condition_variable.c', + 'deadlock.c', + 'lmgr.c', + 'lock.c', + 'lwlock.c', + 'predicate.c', + 'proc.c', + 's_lock.c', + 'spin.c', +) + +generated_backend_sources += lwlocknames[1] diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c new file mode 100644 index 0000000..1af4121 --- /dev/null +++ b/src/backend/storage/lmgr/predicate.c @@ -0,0 +1,4997 @@ +/*------------------------------------------------------------------------- + * + * predicate.c + * POSTGRES predicate locking + * to support full serializable transaction isolation + * + * + * The approach taken is to implement Serializable Snapshot Isolation (SSI) + * as initially described in this paper: + * + * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008. + * Serializable isolation for snapshot databases. + * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD + * international conference on Management of data, + * pages 729-738, New York, NY, USA. ACM. + * http://doi.acm.org/10.1145/1376616.1376690 + * + * and further elaborated in Cahill's doctoral thesis: + * + * Michael James Cahill. 2009. + * Serializable Isolation for Snapshot Databases. + * Sydney Digital Theses. + * University of Sydney, School of Information Technologies. + * http://hdl.handle.net/2123/5353 + * + * + * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD + * locks, which are so different from normal locks that a distinct set of + * structures is required to handle them. They are needed to detect + * rw-conflicts when the read happens before the write. (When the write + * occurs first, the reading transaction can check for a conflict by + * examining the MVCC data.) + * + * (1) Besides tuples actually read, they must cover ranges of tuples + * which would have been read based on the predicate. This will + * require modelling the predicates through locks against database + * objects such as pages, index ranges, or entire tables. + * + * (2) They must be kept in RAM for quick access. 
Because of this, it + * isn't possible to always maintain tuple-level granularity -- when + * the space allocated to store these approaches exhaustion, a + * request for a lock may need to scan for situations where a single + * transaction holds many fine-grained locks which can be coalesced + * into a single coarser-grained lock. + * + * (3) They never block anything; they are more like flags than locks + * in that regard; although they refer to database objects and are + * used to identify rw-conflicts with normal write locks. + * + * (4) While they are associated with a transaction, they must survive + * a successful COMMIT of that transaction, and remain until all + * overlapping transactions complete. This even means that they + * must survive termination of the transaction's process. If a + * top level transaction is rolled back, however, it is immediately + * flagged so that it can be ignored, and its SIREAD locks can be + * released any time after that. + * + * (5) The only transactions which create SIREAD locks or check for + * conflicts with them are serializable transactions. + * + * (6) When a write lock for a top level transaction is found to cover + * an existing SIREAD lock for the same transaction, the SIREAD lock + * can be deleted. + * + * (7) A write from a serializable transaction must ensure that an xact + * record exists for the transaction, with the same lifespan (until + * all concurrent transaction complete or the transaction is rolled + * back) so that rw-dependencies to that transaction can be + * detected. + * + * We use an optimization for read-only transactions. Under certain + * circumstances, a read-only transaction's snapshot can be shown to + * never have conflicts with other transactions. This is referred to + * as a "safe" snapshot (and one known not to be is "unsafe"). + * However, it can't be determined whether a snapshot is safe until + * all concurrent read/write transactions complete. + * + * Once a read-only transaction is known to have a safe snapshot, it + * can release its predicate locks and exempt itself from further + * predicate lock tracking. READ ONLY DEFERRABLE transactions run only + * on safe snapshots, waiting as necessary for one to be available. + * + * + * Lightweight locks to manage access to the predicate locking shared + * memory objects must be taken in this order, and should be released in + * reverse order: + * + * SerializableFinishedListLock + * - Protects the list of transactions which have completed but which + * may yet matter because they overlap still-active transactions. + * + * SerializablePredicateListLock + * - Protects the linked list of locks held by a transaction. Note + * that the locks themselves are also covered by the partition + * locks of their respective lock targets; this lock only affects + * the linked list connecting the locks related to a transaction. + * - All transactions share this single lock (with no partitioning). + * - There is never a need for a process other than the one running + * an active transaction to walk the list of locks held by that + * transaction, except parallel query workers sharing the leader's + * transaction. In the parallel case, an extra per-sxact lock is + * taken; see below. + * - It is relatively infrequent that another process needs to + * modify the list for a transaction, but it does happen for such + * things as index page splits for pages with predicate locks and + * freeing of predicate locked pages by a vacuum process. 
When + * removing a lock in such cases, the lock itself contains the + * pointers needed to remove it from the list. When adding a + * lock in such cases, the lock can be added using the anchor in + * the transaction structure. Neither requires walking the list. + * - Cleaning up the list for a terminated transaction is sometimes + * not done on a retail basis, in which case no lock is required. + * - Due to the above, a process accessing its active transaction's + * list always uses a shared lock, regardless of whether it is + * walking or maintaining the list. This improves concurrency + * for the common access patterns. + * - A process which needs to alter the list of a transaction other + * than its own active transaction must acquire an exclusive + * lock. + * + * SERIALIZABLEXACT's member 'perXactPredicateListLock' + * - Protects the linked list of predicate locks held by a transaction. + * Only needed for parallel mode, where multiple backends share the + * same SERIALIZABLEXACT object. Not needed if + * SerializablePredicateListLock is held exclusively. + * + * PredicateLockHashPartitionLock(hashcode) + * - The same lock protects a target, all locks on that target, and + * the linked list of locks on the target. + * - When more than one is needed, acquire in ascending address order. + * - When all are needed (rare), acquire in ascending index order with + * PredicateLockHashPartitionLockByIndex(index). + * + * SerializableXactHashLock + * - Protects both PredXact and SerializableXidHash. + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/predicate.c + * + *------------------------------------------------------------------------- + */ +/* + * INTERFACE ROUTINES + * + * housekeeping for setting up shared memory predicate lock structures + * InitPredicateLocks(void) + * PredicateLockShmemSize(void) + * + * predicate lock reporting + * GetPredicateLockStatusData(void) + * PageIsPredicateLocked(Relation relation, BlockNumber blkno) + * + * predicate lock maintenance + * GetSerializableTransactionSnapshot(Snapshot snapshot) + * SetSerializableTransactionSnapshot(Snapshot snapshot, + * VirtualTransactionId *sourcevxid) + * RegisterPredicateLockingXid(void) + * PredicateLockRelation(Relation relation, Snapshot snapshot) + * PredicateLockPage(Relation relation, BlockNumber blkno, + * Snapshot snapshot) + * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + * TransactionId tuple_xid) + * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno) + * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + * BlockNumber newblkno) + * TransferPredicateLocksToHeapRelation(Relation relation) + * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe) + * + * conflict detection (may also trigger rollback) + * CheckForSerializableConflictOut(Relation relation, TransactionId xid, + * Snapshot snapshot) + * CheckForSerializableConflictIn(Relation relation, ItemPointer tid, + * BlockNumber blkno) + * CheckTableForSerializableConflictIn(Relation relation) + * + * final rollback checking + * PreCommit_CheckForSerializationFailure(void) + * + * two-phase commit support + * AtPrepare_PredicateLocks(void); + * PostPrepare_PredicateLocks(TransactionId xid); + * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit); + * predicatelock_twophase_recover(TransactionId xid, uint16 
info, + * void *recdata, uint32 len); + */ + +#include "postgres.h" + +#include "access/parallel.h" +#include "access/slru.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/twophase_rmgr.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/pg_lfind.h" +#include "storage/bufmgr.h" +#include "storage/predicate.h" +#include "storage/predicate_internals.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +/* Uncomment the next line to test the graceful degradation code. */ +/* #define TEST_SUMMARIZE_SERIAL */ + +/* + * Test the most selective fields first, for performance. + * + * a is covered by b if all of the following hold: + * 1) a.database = b.database + * 2) a.relation = b.relation + * 3) b.offset is invalid (b is page-granularity or higher) + * 4) either of the following: + * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page + * or 4b) a.offset is invalid and b.page is invalid (a is + * page-granularity and b is relation-granularity + */ +#define TargetTagIsCoveredBy(covered_target, covering_target) \ + ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \ + GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \ + && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \ + InvalidOffsetNumber) /* (3) */ \ + && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \ + InvalidOffsetNumber) /* (4a) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \ + || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \ + InvalidBlockNumber) /* (4b) */ \ + && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \ + != InvalidBlockNumber))) \ + && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \ + GET_PREDICATELOCKTARGETTAG_DB(covering_target))) + +/* + * The predicate locking target and lock shared hash tables are partitioned to + * reduce contention. To determine which partition a given target belongs to, + * compute the tag's hash code with PredicateLockTargetTagHashCode(), then + * apply one of these macros. + * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2! + */ +#define PredicateLockHashPartition(hashcode) \ + ((hashcode) % NUM_PREDICATELOCK_PARTITIONS) +#define PredicateLockHashPartitionLock(hashcode) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \ + PredicateLockHashPartition(hashcode)].lock) +#define PredicateLockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) + +#define NPREDICATELOCKTARGETENTS() \ + mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts)) + +#define SxactIsOnFinishedList(sxact) (!dlist_node_is_detached(&(sxact)->finishedLink)) + +/* + * Note that a sxact is marked "prepared" once it has passed + * PreCommit_CheckForSerializationFailure, even if it isn't using + * 2PC. This is the point at which it can no longer be aborted. + * + * The PREPARED flag remains set after commit, so SxactIsCommitted + * implies SxactIsPrepared. 
+ */ +#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0) +#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0) +#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0) +#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0) +#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0) +#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0) +#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0) +/* + * The following macro actually means that the specified transaction has a + * conflict out *to a transaction which committed ahead of it*. It's hard + * to get that into a name of a reasonable length. + */ +#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0) +#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0) +#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0) +#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0) +#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0) + +/* + * Compute the hash code associated with a PREDICATELOCKTARGETTAG. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ +#define PredicateLockTargetTagHashCode(predicatelocktargettag) \ + get_hash_value(PredicateLockTargetHash, predicatelocktargettag) + +/* + * Given a predicate lock tag, and the hash for its target, + * compute the lock hash. + * + * To make the hash code also depend on the transaction, we xor the sxid + * struct's address into the hash code, left-shifted so that the + * partition-number bits don't change. Since this is only a hash, we + * don't care if we lose high-order bits of the address; use an + * intermediate variable to suppress cast-pointer-to-int warnings. + */ +#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \ + ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \ + << LOG2_NUM_PREDICATELOCK_PARTITIONS) + + +/* + * The SLRU buffer area through which we access the old xids. + */ +static SlruCtlData SerialSlruCtlData; + +#define SerialSlruCtl (&SerialSlruCtlData) + +#define SERIAL_PAGESIZE BLCKSZ +#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo) +#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE) + +/* + * Set maximum pages based on the number needed to track all transactions. + */ +#define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE) + +#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 
0 : (page) + 1) + +#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \ + (SerialSlruCtl->shared->page_buffer[slotno] + \ + ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE)))) + +#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE) + +typedef struct SerialControlData +{ + int headPage; /* newest initialized page */ + TransactionId headXid; /* newest valid Xid in the SLRU */ + TransactionId tailXid; /* oldest xmin we might be interested in */ +} SerialControlData; + +typedef struct SerialControlData *SerialControl; + +static SerialControl serialControl; + +/* + * When the oldest committed transaction on the "finished" list is moved to + * SLRU, its predicate locks will be moved to this "dummy" transaction, + * collapsing duplicate targets. When a duplicate is found, the later + * commitSeqNo is used. + */ +static SERIALIZABLEXACT *OldCommittedSxact; + + +/* + * These configuration variables are used to set the predicate lock table size + * and to control promotion of predicate locks to coarser granularity in an + * attempt to degrade performance (mostly as false positive serialization + * failure) gracefully in the face of memory pressure. + */ +int max_predicate_locks_per_xact; /* in guc_tables.c */ +int max_predicate_locks_per_relation; /* in guc_tables.c */ +int max_predicate_locks_per_page; /* in guc_tables.c */ + +/* + * This provides a list of objects in order to track transactions + * participating in predicate locking. Entries in the list are fixed size, + * and reside in shared memory. The memory address of an entry must remain + * fixed during its lifetime. The list will be protected from concurrent + * update externally; no provision is made in this code to manage that. The + * number of entries in the list, and the size allowed for each entry is + * fixed upon creation. + */ +static PredXactList PredXact; + +/* + * This provides a pool of RWConflict data elements to use in conflict lists + * between transactions. + */ +static RWConflictPoolHeader RWConflictPool; + +/* + * The predicate locking hash tables are in shared memory. + * Each backend keeps pointers to them. + */ +static HTAB *SerializableXidHash; +static HTAB *PredicateLockTargetHash; +static HTAB *PredicateLockHash; +static dlist_head *FinishedSerializableTransactions; + +/* + * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing + * this entry, you can ensure that there's enough scratch space available for + * inserting one entry in the hash table. This is an otherwise-invalid tag. + */ +static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0}; +static uint32 ScratchTargetTagHash; +static LWLock *ScratchPartitionLock; + +/* + * The local hash table used to determine when to combine multiple fine- + * grained locks into a single courser-grained lock. + */ +static HTAB *LocalPredicateLockHash = NULL; + +/* + * Keep a pointer to the currently-running serializable transaction (if any) + * for quick reference. Also, remember if we have written anything that could + * cause a rw-conflict. + */ +static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact; +static bool MyXactDidWrite = false; + +/* + * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release + * MySerializableXact early. If that happens in a parallel query, the leader + * needs to defer the destruction of the SERIALIZABLEXACT until end of + * transaction, because the workers still have a reference to it. In that + * case, the leader stores it here. 
+ */ +static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact; + +/* local functions */ + +static SERIALIZABLEXACT *CreatePredXact(void); +static void ReleasePredXact(SERIALIZABLEXACT *sxact); + +static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer); +static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); +static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact); +static void ReleaseRWConflict(RWConflict conflict); +static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact); + +static bool SerialPagePrecedesLogically(int page1, int page2); +static void SerialInit(void); +static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo); +static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid); +static void SerialSetActiveSerXmin(TransactionId xid); + +static uint32 predicatelock_hash(const void *key, Size keysize); +static void SummarizeOldestCommittedSxact(void); +static Snapshot GetSafeSnapshot(Snapshot origSnapshot); +static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid); +static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag); +static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent); +static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag); +static void RemoveScratchTarget(bool lockheld); +static void RestoreScratchTarget(bool lockheld); +static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, + uint32 targettaghash); +static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag); +static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag); +static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag); +static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag); +static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, + uint32 targettaghash, + SERIALIZABLEXACT *sxact); +static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash); +static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, + PREDICATELOCKTARGETTAG newtargettag, + bool removeOld); +static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag); +static void DropAllPredicateLocksFromTable(Relation relation, + bool transfer); +static void SetNewSxactGlobalXmin(void); +static void ClearOldPredicateLocks(void); +static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, + bool summarize); +static bool XidIsConcurrent(TransactionId xid); +static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag); +static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer); +static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + SERIALIZABLEXACT *writer); +static void CreateLocalPredicateLockHash(void); +static void ReleasePredicateLocksLocal(void); + + +/*------------------------------------------------------------------------*/ + +/* + * Does this relation participate in predicate locking? Temporary and system + * relations are exempt. 
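+ * In practice this means system relations (those with OIDs below
+ * FirstUnpinnedObjectId) and relations that use local buffers (temporary
+ * relations) never participate in predicate locking.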
+ */ +static inline bool +PredicateLockingNeededForRelation(Relation relation) +{ + return !(relation->rd_id < FirstUnpinnedObjectId || + RelationUsesLocalBuffers(relation)); +} + +/* + * When a public interface method is called for a read, this is the test to + * see if we should do a quick return. + * + * Note: this function has side-effects! If this transaction has been flagged + * as RO-safe since the last call, we release all predicate locks and reset + * MySerializableXact. That makes subsequent calls to return quickly. + * + * This is marked as 'inline' to eliminate the function call overhead in the + * common case that serialization is not needed. + */ +static inline bool +SerializationNeededForRead(Relation relation, Snapshot snapshot) +{ + /* Nothing to do if this is not a serializable transaction */ + if (MySerializableXact == InvalidSerializableXact) + return false; + + /* + * Don't acquire locks or conflict when scanning with a special snapshot. + * This excludes things like CLUSTER and REINDEX. They use the wholesale + * functions TransferPredicateLocksToHeapRelation() and + * CheckTableForSerializableConflictIn() to participate in serialization, + * but the scans involved don't need serialization. + */ + if (!IsMVCCSnapshot(snapshot)) + return false; + + /* + * Check if we have just become "RO-safe". If we have, immediately release + * all locks as they're not needed anymore. This also resets + * MySerializableXact, so that subsequent calls to this function can exit + * quickly. + * + * A transaction is flagged as RO_SAFE if all concurrent R/W transactions + * commit without having conflicts out to an earlier snapshot, thus + * ensuring that no conflicts are possible for this transaction. + */ + if (SxactIsROSafe(MySerializableXact)) + { + ReleasePredicateLocks(false, true); + return false; + } + + /* Check if the relation doesn't participate in predicate locking */ + if (!PredicateLockingNeededForRelation(relation)) + return false; + + return true; /* no excuse to skip predicate locking */ +} + +/* + * Like SerializationNeededForRead(), but called on writes. + * The logic is the same, but there is no snapshot and we can't be RO-safe. + */ +static inline bool +SerializationNeededForWrite(Relation relation) +{ + /* Nothing to do if this is not a serializable transaction */ + if (MySerializableXact == InvalidSerializableXact) + return false; + + /* Check if the relation doesn't participate in predicate locking */ + if (!PredicateLockingNeededForRelation(relation)) + return false; + + return true; /* no excuse to skip predicate locking */ +} + + +/*------------------------------------------------------------------------*/ + +/* + * These functions are a simple implementation of a list for this specific + * type of struct. If there is ever a generalized shared memory list, we + * should probably switch to that. 
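+ *
+ * CreatePredXact() pops an entry off PredXact->availableList and appends it
+ * to PredXact->activeList, returning NULL once the fixed-size pool is
+ * exhausted; callers such as GetSerializableTransactionSnapshotInt() then
+ * summarize the oldest committed sxact and retry. ReleasePredXact() moves
+ * the entry back onto the available list.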
+ */ +static SERIALIZABLEXACT * +CreatePredXact(void) +{ + SERIALIZABLEXACT *sxact; + + if (dlist_is_empty(&PredXact->availableList)) + return NULL; + + sxact = dlist_container(SERIALIZABLEXACT, xactLink, + dlist_pop_head_node(&PredXact->availableList)); + dlist_push_tail(&PredXact->activeList, &sxact->xactLink); + return sxact; +} + +static void +ReleasePredXact(SERIALIZABLEXACT *sxact) +{ + Assert(ShmemAddrIsValid(sxact)); + + dlist_delete(&sxact->xactLink); + dlist_push_tail(&PredXact->availableList, &sxact->xactLink); +} + +/*------------------------------------------------------------------------*/ + +/* + * These functions manage primitive access to the RWConflict pool and lists. + */ +static bool +RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer) +{ + dlist_iter iter; + + Assert(reader != writer); + + /* Check the ends of the purported conflict first. */ + if (SxactIsDoomed(reader) + || SxactIsDoomed(writer) + || dlist_is_empty(&reader->outConflicts) + || dlist_is_empty(&writer->inConflicts)) + return false; + + /* + * A conflict is possible; walk the list to find out. + * + * The unconstify is needed as we have no const version of + * dlist_foreach(). + */ + dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->outConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, outLink, iter.cur); + + if (conflict->sxactIn == writer) + return true; + } + + /* No conflict found. */ + return false; +} + +static void +SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) +{ + RWConflict conflict; + + Assert(reader != writer); + Assert(!RWConflictExists(reader, writer)); + + if (dlist_is_empty(&RWConflictPool->availableList)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough elements in RWConflictPool to record a read/write conflict"), + errhint("You might need to run fewer transactions at a time or increase max_connections."))); + + conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList); + dlist_delete(&conflict->outLink); + + conflict->sxactOut = reader; + conflict->sxactIn = writer; + dlist_push_tail(&reader->outConflicts, &conflict->outLink); + dlist_push_tail(&writer->inConflicts, &conflict->inLink); +} + +static void +SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, + SERIALIZABLEXACT *activeXact) +{ + RWConflict conflict; + + Assert(roXact != activeXact); + Assert(SxactIsReadOnly(roXact)); + Assert(!SxactIsReadOnly(activeXact)); + + if (dlist_is_empty(&RWConflictPool->availableList)) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"), + errhint("You might need to run fewer transactions at a time or increase max_connections."))); + + conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList); + dlist_delete(&conflict->outLink); + + conflict->sxactOut = activeXact; + conflict->sxactIn = roXact; + dlist_push_tail(&activeXact->possibleUnsafeConflicts, &conflict->outLink); + dlist_push_tail(&roXact->possibleUnsafeConflicts, &conflict->inLink); +} + +static void +ReleaseRWConflict(RWConflict conflict) +{ + dlist_delete(&conflict->inLink); + dlist_delete(&conflict->outLink); + dlist_push_tail(&RWConflictPool->availableList, &conflict->outLink); +} + +static void +FlagSxactUnsafe(SERIALIZABLEXACT *sxact) +{ + dlist_mutable_iter iter; + + Assert(SxactIsReadOnly(sxact)); + Assert(!SxactIsROSafe(sxact)); + + sxact->flags |= SXACT_FLAG_RO_UNSAFE; + + /* + * 
We know this isn't a safe snapshot, so we can stop looking for other + * potential conflicts. + */ + dlist_foreach_modify(iter, &sxact->possibleUnsafeConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, inLink, iter.cur); + + Assert(!SxactIsReadOnly(conflict->sxactOut)); + Assert(sxact == conflict->sxactIn); + + ReleaseRWConflict(conflict); + } +} + +/*------------------------------------------------------------------------*/ + +/* + * Decide whether a Serial page number is "older" for truncation purposes. + * Analogous to CLOGPagePrecedes(). + */ +static bool +SerialPagePrecedesLogically(int page1, int page2) +{ + TransactionId xid1; + TransactionId xid2; + + xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE; + xid1 += FirstNormalTransactionId + 1; + xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE; + xid2 += FirstNormalTransactionId + 1; + + return (TransactionIdPrecedes(xid1, xid2) && + TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1)); +} + +#ifdef USE_ASSERT_CHECKING +static void +SerialPagePrecedesLogicallyUnitTests(void) +{ + int per_page = SERIAL_ENTRIESPERPAGE, + offset = per_page / 2; + int newestPage, + oldestPage, + headPage, + targetPage; + TransactionId newestXact, + oldestXact; + + /* GetNewTransactionId() has assigned the last XID it can safely use. */ + newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */ + newestXact = newestPage * per_page + offset; + Assert(newestXact / per_page == newestPage); + oldestXact = newestXact + 1; + oldestXact -= 1U << 31; + oldestPage = oldestXact / per_page; + + /* + * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs + * assigned. oldestXact finishes, ~2B XIDs having elapsed since it + * started. Further transactions cause us to summarize oldestXact to + * tailPage. Function must return false so SerialAdd() doesn't zero + * tailPage (which may contain entries for other old, recently-finished + * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in + * single-user mode, a negligible possibility. + */ + headPage = newestPage; + targetPage = oldestPage; + Assert(!SerialPagePrecedesLogically(headPage, targetPage)); + + /* + * In this scenario, the SLRU headPage pertains to oldestXact. We're + * summarizing an XID near newestXact. (Assume few other XIDs used + * SERIALIZABLE, hence the minimal headPage advancement. Assume + * oldestXact was long-running and only recently reached the SLRU.) + * Function must return true to make SerialAdd() create targetPage. + * + * Today's implementation mishandles this case, but it doesn't matter + * enough to fix. Verify that the defect affects just one page by + * asserting correct treatment of its prior page. Reaching this case + * requires burning ~2B XIDs in single-user mode, a negligible + * possibility. Moreover, if it does happen, the consequence would be + * mild, namely a new transaction failing in SimpleLruReadPage(). + */ + headPage = oldestPage; + targetPage = newestPage; + Assert(SerialPagePrecedesLogically(headPage, targetPage - 1)); +#if 0 + Assert(SerialPagePrecedesLogically(headPage, targetPage)); +#endif +} +#endif + +/* + * Initialize for the tracking of old serializable committed xids. + */ +static void +SerialInit(void) +{ + bool found; + + /* + * Set up SLRU management of the pg_serial data. 
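+ * Each pg_serial page holds SERIAL_ENTRIESPERPAGE commit sequence numbers,
+ * i.e. BLCKSZ / sizeof(SerCommitSeqNo); assuming an 8-byte SerCommitSeqNo
+ * and the default 8192-byte BLCKSZ, that is 1024 entries per page.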
+ */ + SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically; + SimpleLruInit(SerialSlruCtl, "Serial", + NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial", + LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE); +#ifdef USE_ASSERT_CHECKING + SerialPagePrecedesLogicallyUnitTests(); +#endif + SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE); + + /* + * Create or attach to the SerialControl structure. + */ + serialControl = (SerialControl) + ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found); + + Assert(found == IsUnderPostmaster); + if (!found) + { + /* + * Set control information to reflect empty SLRU. + */ + serialControl->headPage = -1; + serialControl->headXid = InvalidTransactionId; + serialControl->tailXid = InvalidTransactionId; + } +} + +/* + * Record a committed read write serializable xid and the minimum + * commitSeqNo of any transactions to which this xid had a rw-conflict out. + * An invalid commitSeqNo means that there were no conflicts out from xid. + */ +static void +SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo) +{ + TransactionId tailXid; + int targetPage; + int slotno; + int firstZeroPage; + bool isNewPage; + + Assert(TransactionIdIsValid(xid)); + + targetPage = SerialPage(xid); + + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* + * If no serializable transactions are active, there shouldn't be anything + * to push out to the SLRU. Hitting this assert would mean there's + * something wrong with the earlier cleanup logic. + */ + tailXid = serialControl->tailXid; + Assert(TransactionIdIsValid(tailXid)); + + /* + * If the SLRU is currently unused, zero out the whole active region from + * tailXid to headXid before taking it into use. Otherwise zero out only + * any new pages that enter the tailXid-headXid range as we advance + * headXid. + */ + if (serialControl->headPage < 0) + { + firstZeroPage = SerialPage(tailXid); + isNewPage = true; + } + else + { + firstZeroPage = SerialNextPage(serialControl->headPage); + isNewPage = SerialPagePrecedesLogically(serialControl->headPage, + targetPage); + } + + if (!TransactionIdIsValid(serialControl->headXid) + || TransactionIdFollows(xid, serialControl->headXid)) + serialControl->headXid = xid; + if (isNewPage) + serialControl->headPage = targetPage; + + if (isNewPage) + { + /* Initialize intervening pages. */ + while (firstZeroPage != targetPage) + { + (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage); + firstZeroPage = SerialNextPage(firstZeroPage); + } + slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage); + } + else + slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid); + + SerialValue(slotno, xid) = minConflictCommitSeqNo; + SerialSlruCtl->shared->page_dirty[slotno] = true; + + LWLockRelease(SerialSLRULock); +} + +/* + * Get the minimum commitSeqNo for any conflict out for the given xid. For + * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo + * will be returned. 
+ */ +static SerCommitSeqNo +SerialGetMinConflictCommitSeqNo(TransactionId xid) +{ + TransactionId headXid; + TransactionId tailXid; + SerCommitSeqNo val; + int slotno; + + Assert(TransactionIdIsValid(xid)); + + LWLockAcquire(SerialSLRULock, LW_SHARED); + headXid = serialControl->headXid; + tailXid = serialControl->tailXid; + LWLockRelease(SerialSLRULock); + + if (!TransactionIdIsValid(headXid)) + return 0; + + Assert(TransactionIdIsValid(tailXid)); + + if (TransactionIdPrecedes(xid, tailXid) + || TransactionIdFollows(xid, headXid)) + return 0; + + /* + * The following function must be called without holding SerialSLRULock, + * but will return with that lock held, which must then be released. + */ + slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl, + SerialPage(xid), xid); + val = SerialValue(slotno, xid); + LWLockRelease(SerialSLRULock); + return val; +} + +/* + * Call this whenever there is a new xmin for active serializable + * transactions. We don't need to keep information on transactions which + * precede that. InvalidTransactionId means none active, so everything in + * the SLRU can be discarded. + */ +static void +SerialSetActiveSerXmin(TransactionId xid) +{ + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* + * When no sxacts are active, nothing overlaps, set the xid values to + * invalid to show that there are no valid entries. Don't clear headPage, + * though. A new xmin might still land on that page, and we don't want to + * repeatedly zero out the same page. + */ + if (!TransactionIdIsValid(xid)) + { + serialControl->tailXid = InvalidTransactionId; + serialControl->headXid = InvalidTransactionId; + LWLockRelease(SerialSLRULock); + return; + } + + /* + * When we're recovering prepared transactions, the global xmin might move + * backwards depending on the order they're recovered. Normally that's not + * OK, but during recovery no serializable transactions will commit, so + * the SLRU is empty and we can get away with it. + */ + if (RecoveryInProgress()) + { + Assert(serialControl->headPage < 0); + if (!TransactionIdIsValid(serialControl->tailXid) + || TransactionIdPrecedes(xid, serialControl->tailXid)) + { + serialControl->tailXid = xid; + } + LWLockRelease(SerialSLRULock); + return; + } + + Assert(!TransactionIdIsValid(serialControl->tailXid) + || TransactionIdFollows(xid, serialControl->tailXid)); + + serialControl->tailXid = xid; + + LWLockRelease(SerialSLRULock); +} + +/* + * Perform a checkpoint --- either during shutdown, or on-the-fly + * + * We don't have any data that needs to survive a restart, but this is a + * convenient place to truncate the SLRU. + */ +void +CheckPointPredicate(void) +{ + int tailPage; + + LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE); + + /* Exit quickly if the SLRU is currently not in use. */ + if (serialControl->headPage < 0) + { + LWLockRelease(SerialSLRULock); + return; + } + + if (TransactionIdIsValid(serialControl->tailXid)) + { + /* We can truncate the SLRU up to the page containing tailXid */ + tailPage = SerialPage(serialControl->tailXid); + } + else + { + /*---------- + * The SLRU is no longer needed. Truncate to head before we set head + * invalid. + * + * XXX: It's possible that the SLRU is not needed again until XID + * wrap-around has happened, so that the segment containing headPage + * that we leave behind will appear to be new again. In that case it + * won't be removed until XID horizon advances enough to make it + * current again. + * + * XXX: This should happen in vac_truncate_clog(), not in checkpoints. 
+ * Consider this scenario, starting from a system with no in-progress + * transactions and VACUUM FREEZE having maximized oldestXact: + * - Start a SERIALIZABLE transaction. + * - Start, finish, and summarize a SERIALIZABLE transaction, creating + * one SLRU page. + * - Consume XIDs to reach xidStopLimit. + * - Finish all transactions. Due to the long-running SERIALIZABLE + * transaction, earlier checkpoints did not touch headPage. The + * next checkpoint will change it, but that checkpoint happens after + * the end of the scenario. + * - VACUUM to advance XID limits. + * - Consume ~2M XIDs, crossing the former xidWrapLimit. + * - Start, finish, and summarize a SERIALIZABLE transaction. + * SerialAdd() declines to create the targetPage, because headPage + * is not regarded as in the past relative to that targetPage. The + * transaction instigating the summarize fails in + * SimpleLruReadPage(). + */ + tailPage = serialControl->headPage; + serialControl->headPage = -1; + } + + LWLockRelease(SerialSLRULock); + + /* Truncate away pages that are no longer required */ + SimpleLruTruncate(SerialSlruCtl, tailPage); + + /* + * Write dirty SLRU pages to disk + * + * This is not actually necessary from a correctness point of view. We do + * it merely as a debugging aid. + * + * We're doing this after the truncation to avoid writing pages right + * before deleting the file in which they sit, which would be completely + * pointless. + */ + SimpleLruWriteAll(SerialSlruCtl, true); +} + +/*------------------------------------------------------------------------*/ + +/* + * InitPredicateLocks -- Initialize the predicate locking data structures. + * + * This is called from CreateSharedMemoryAndSemaphores(), which see for + * more comments. In the normal postmaster case, the shared hash tables + * are created here. Backends inherit the pointers + * to the shared tables via fork(). In the EXEC_BACKEND case, each + * backend re-executes this code to obtain pointers to the already existing + * shared hash tables. + */ +void +InitPredicateLocks(void) +{ + HASHCTL info; + long max_table_size; + Size requestSize; + bool found; + +#ifndef EXEC_BACKEND + Assert(!IsUnderPostmaster); +#endif + + /* + * Compute size of predicate lock target hashtable. Note these + * calculations must agree with PredicateLockShmemSize! + */ + max_table_size = NPREDICATELOCKTARGETENTS(); + + /* + * Allocate hash table for PREDICATELOCKTARGET structs. This stores + * per-predicate-lock-target information. + */ + info.keysize = sizeof(PREDICATELOCKTARGETTAG); + info.entrysize = sizeof(PREDICATELOCKTARGET); + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + + PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | + HASH_PARTITION | HASH_FIXED_SIZE); + + /* + * Reserve a dummy entry in the hash table; we use it to make sure there's + * always one entry available when we need to split or combine a page, + * because running out of space there could mean aborting a + * non-serializable transaction. + */ + if (!IsUnderPostmaster) + { + (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag, + HASH_ENTER, &found); + Assert(!found); + } + + /* Pre-calculate the hash and partition lock of the scratch entry */ + ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag); + ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash); + + /* + * Allocate hash table for PREDICATELOCK structs. 
This stores per + * xact-lock-of-a-target information. + */ + info.keysize = sizeof(PREDICATELOCKTAG); + info.entrysize = sizeof(PREDICATELOCK); + info.hash = predicatelock_hash; + info.num_partitions = NUM_PREDICATELOCK_PARTITIONS; + + /* Assume an average of 2 xacts per target */ + max_table_size *= 2; + + PredicateLockHash = ShmemInitHash("PREDICATELOCK hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_FUNCTION | + HASH_PARTITION | HASH_FIXED_SIZE); + + /* + * Compute size for serializable transaction hashtable. Note these + * calculations must agree with PredicateLockShmemSize! + */ + max_table_size = (MaxBackends + max_prepared_xacts); + + /* + * Allocate a list to hold information on transactions participating in + * predicate locking. + * + * Assume an average of 10 predicate locking transactions per backend. + * This allows aggressive cleanup while detail is present before data must + * be summarized for storage in SLRU and the "dummy" transaction. + */ + max_table_size *= 10; + + PredXact = ShmemInitStruct("PredXactList", + PredXactListDataSize, + &found); + Assert(found == IsUnderPostmaster); + if (!found) + { + int i; + + dlist_init(&PredXact->availableList); + dlist_init(&PredXact->activeList); + PredXact->SxactGlobalXmin = InvalidTransactionId; + PredXact->SxactGlobalXminCount = 0; + PredXact->WritableSxactCount = 0; + PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1; + PredXact->CanPartialClearThrough = 0; + PredXact->HavePartialClearedThrough = 0; + requestSize = mul_size((Size) max_table_size, + sizeof(SERIALIZABLEXACT)); + PredXact->element = ShmemAlloc(requestSize); + /* Add all elements to available list, clean. */ + memset(PredXact->element, 0, requestSize); + for (i = 0; i < max_table_size; i++) + { + LWLockInitialize(&PredXact->element[i].perXactPredicateListLock, + LWTRANCHE_PER_XACT_PREDICATE_LIST); + dlist_push_tail(&PredXact->availableList, &PredXact->element[i].xactLink); + } + PredXact->OldCommittedSxact = CreatePredXact(); + SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid); + PredXact->OldCommittedSxact->prepareSeqNo = 0; + PredXact->OldCommittedSxact->commitSeqNo = 0; + PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0; + dlist_init(&PredXact->OldCommittedSxact->outConflicts); + dlist_init(&PredXact->OldCommittedSxact->inConflicts); + dlist_init(&PredXact->OldCommittedSxact->predicateLocks); + dlist_node_init(&PredXact->OldCommittedSxact->finishedLink); + dlist_init(&PredXact->OldCommittedSxact->possibleUnsafeConflicts); + PredXact->OldCommittedSxact->topXid = InvalidTransactionId; + PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId; + PredXact->OldCommittedSxact->xmin = InvalidTransactionId; + PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED; + PredXact->OldCommittedSxact->pid = 0; + PredXact->OldCommittedSxact->pgprocno = INVALID_PGPROCNO; + } + /* This never changes, so let's keep a local copy. */ + OldCommittedSxact = PredXact->OldCommittedSxact; + + /* + * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid + * information for serializable transactions which have accessed data. + */ + info.keysize = sizeof(SERIALIZABLEXIDTAG); + info.entrysize = sizeof(SERIALIZABLEXID); + + SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash", + max_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | + HASH_FIXED_SIZE); + + /* + * Allocate space for tracking rw-conflicts in lists attached to the + * transactions. 
+ * + * Assume an average of 5 conflicts per transaction. Calculations suggest + * that this will prevent resource exhaustion in even the most pessimal + * loads up to max_connections = 200 with all 200 connections pounding the + * database with serializable transactions. Beyond that, there may be + * occasional transactions canceled when trying to flag conflicts. That's + * probably OK. + */ + max_table_size *= 5; + + RWConflictPool = ShmemInitStruct("RWConflictPool", + RWConflictPoolHeaderDataSize, + &found); + Assert(found == IsUnderPostmaster); + if (!found) + { + int i; + + dlist_init(&RWConflictPool->availableList); + requestSize = mul_size((Size) max_table_size, + RWConflictDataSize); + RWConflictPool->element = ShmemAlloc(requestSize); + /* Add all elements to available list, clean. */ + memset(RWConflictPool->element, 0, requestSize); + for (i = 0; i < max_table_size; i++) + { + dlist_push_tail(&RWConflictPool->availableList, + &RWConflictPool->element[i].outLink); + } + } + + /* + * Create or attach to the header for the list of finished serializable + * transactions. + */ + FinishedSerializableTransactions = (dlist_head *) + ShmemInitStruct("FinishedSerializableTransactions", + sizeof(dlist_head), + &found); + Assert(found == IsUnderPostmaster); + if (!found) + dlist_init(FinishedSerializableTransactions); + + /* + * Initialize the SLRU storage for old committed serializable + * transactions. + */ + SerialInit(); +} + +/* + * Estimate shared-memory space used for predicate lock table + */ +Size +PredicateLockShmemSize(void) +{ + Size size = 0; + long max_table_size; + + /* predicate lock target hash table */ + max_table_size = NPREDICATELOCKTARGETENTS(); + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCKTARGET))); + + /* predicate lock hash table */ + max_table_size *= 2; + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(PREDICATELOCK))); + + /* + * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety + * margin. + */ + size = add_size(size, size / 10); + + /* transaction list */ + max_table_size = MaxBackends + max_prepared_xacts; + max_table_size *= 10; + size = add_size(size, PredXactListDataSize); + size = add_size(size, mul_size((Size) max_table_size, + sizeof(SERIALIZABLEXACT))); + + /* transaction xid table */ + size = add_size(size, hash_estimate_size(max_table_size, + sizeof(SERIALIZABLEXID))); + + /* rw-conflict pool */ + max_table_size *= 5; + size = add_size(size, RWConflictPoolHeaderDataSize); + size = add_size(size, mul_size((Size) max_table_size, + RWConflictDataSize)); + + /* Head for list of finished serializable transactions. */ + size = add_size(size, sizeof(dlist_head)); + + /* Shared memory structures for SLRU tracking of old committed xids. */ + size = add_size(size, sizeof(SerialControlData)); + size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0)); + + return size; +} + + +/* + * Compute the hash code associated with a PREDICATELOCKTAG. + * + * Because we want to use just one set of partition locks for both the + * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure + * that PREDICATELOCKs fall into the same partition number as their + * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number + * to be the low-order bits of the hash code, and therefore a + * PREDICATELOCKTAG's hash code must have the same low-order bits as the + * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this + * specialized hash function. 
+ */ +static uint32 +predicatelock_hash(const void *key, Size keysize) +{ + const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key; + uint32 targethash; + + Assert(keysize == sizeof(PREDICATELOCKTAG)); + + /* Look into the associated target object, and compute its hash code */ + targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag); + + return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash); +} + + +/* + * GetPredicateLockStatusData + * Return a table containing the internal state of the predicate + * lock manager for use in pg_lock_status. + * + * Like GetLockStatusData, this function tries to hold the partition LWLocks + * for as short a time as possible by returning two arrays that simply + * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock + * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and + * SERIALIZABLEXACT will likely appear. + */ +PredicateLockData * +GetPredicateLockStatusData(void) +{ + PredicateLockData *data; + int i; + int els, + el; + HASH_SEQ_STATUS seqstat; + PREDICATELOCK *predlock; + + data = (PredicateLockData *) palloc(sizeof(PredicateLockData)); + + /* + * To ensure consistency, take simultaneous locks on all partition locks + * in ascending order, then SerializableXactHashLock. + */ + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Get number of locks and allocate appropriately-sized arrays. */ + els = hash_get_num_entries(PredicateLockHash); + data->nelements = els; + data->locktags = (PREDICATELOCKTARGETTAG *) + palloc(sizeof(PREDICATELOCKTARGETTAG) * els); + data->xacts = (SERIALIZABLEXACT *) + palloc(sizeof(SERIALIZABLEXACT) * els); + + + /* Scan through PredicateLockHash and copy contents */ + hash_seq_init(&seqstat, PredicateLockHash); + + el = 0; + + while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat))) + { + data->locktags[el] = predlock->tag.myTarget->tag; + data->xacts[el] = *predlock->tag.myXact; + el++; + } + + Assert(el == els); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + + return data; +} + +/* + * Free up shared memory structures by pushing the oldest sxact (the one at + * the front of the SummarizeOldestCommittedSxact queue) into summary form. + * Each call will free exactly one SERIALIZABLEXACT structure and may also + * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK, + * PREDICATELOCKTARGET, RWConflictData. + */ +static void +SummarizeOldestCommittedSxact(void) +{ + SERIALIZABLEXACT *sxact; + + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + + /* + * This function is only called if there are no sxact slots available. + * Some of them must belong to old, already-finished transactions, so + * there should be something in FinishedSerializableTransactions list that + * we can summarize. However, there's a race condition: while we were not + * holding any locks, a transaction might have ended and cleaned up all + * the finished sxact entries already, freeing up their sxact slots. In + * that case, we have nothing to do here. The caller will find one of the + * slots released by the other backend when it retries. 
+ */ + if (dlist_is_empty(FinishedSerializableTransactions)) + { + LWLockRelease(SerializableFinishedListLock); + return; + } + + /* + * Grab the first sxact off the finished list -- this will be the earliest + * commit. Remove it from the list. + */ + sxact = dlist_head_element(SERIALIZABLEXACT, finishedLink, + FinishedSerializableTransactions); + dlist_delete_thoroughly(&sxact->finishedLink); + + /* Add to SLRU summary information. */ + if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact)) + SerialAdd(sxact->topXid, SxactHasConflictOut(sxact) + ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo); + + /* Summarize and release the detail. */ + ReleaseOneSerializableXact(sxact, false, true); + + LWLockRelease(SerializableFinishedListLock); +} + +/* + * GetSafeSnapshot + * Obtain and register a snapshot for a READ ONLY DEFERRABLE + * transaction. Ensures that the snapshot is "safe", i.e. a + * read-only transaction running on it can execute serializably + * without further checks. This requires waiting for concurrent + * transactions to complete, and retrying with a new snapshot if + * one of them could possibly create a conflict. + * + * As with GetSerializableTransactionSnapshot (which this is a subroutine + * for), the passed-in Snapshot pointer should reference a static data + * area that can safely be passed to GetSnapshotData. + */ +static Snapshot +GetSafeSnapshot(Snapshot origSnapshot) +{ + Snapshot snapshot; + + Assert(XactReadOnly && XactDeferrable); + + while (true) + { + /* + * GetSerializableTransactionSnapshotInt is going to call + * GetSnapshotData, so we need to provide it the static snapshot area + * our caller passed to us. The pointer returned is actually the same + * one passed to it, but we avoid assuming that here. + */ + snapshot = GetSerializableTransactionSnapshotInt(origSnapshot, + NULL, InvalidPid); + + if (MySerializableXact == InvalidSerializableXact) + return snapshot; /* no concurrent r/w xacts; it's safe */ + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Wait for concurrent transactions to finish. Stop early if one of + * them marked us as conflicted. + */ + MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING; + while (!(dlist_is_empty(&MySerializableXact->possibleUnsafeConflicts) || + SxactIsROUnsafe(MySerializableXact))) + { + LWLockRelease(SerializableXactHashLock); + ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + } + MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING; + + if (!SxactIsROUnsafe(MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + break; /* success */ + } + + LWLockRelease(SerializableXactHashLock); + + /* else, need to retry... */ + ereport(DEBUG2, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg_internal("deferrable snapshot was unsafe; trying a new one"))); + ReleasePredicateLocks(false, false); + } + + /* + * Now we have a safe snapshot, so we don't need to do any further checks. + */ + Assert(SxactIsROSafe(MySerializableXact)); + ReleasePredicateLocks(false, true); + + return snapshot; +} + +/* + * GetSafeSnapshotBlockingPids + * If the specified process is currently blocked in GetSafeSnapshot, + * write the process IDs of all processes that it is blocked by + * into the caller-supplied buffer output[]. The list is truncated at + * output_size, and the number of PIDs written into the buffer is + * returned. Returns zero if the given PID is not currently blocked + * in GetSafeSnapshot. 
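+ * (This underlies the pg_safe_snapshot_blocking_pids() SQL function.)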
+ */ +int +GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size) +{ + int num_written = 0; + dlist_iter iter; + SERIALIZABLEXACT *blocking_sxact = NULL; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + /* Find blocked_pid's SERIALIZABLEXACT by linear search. */ + dlist_foreach(iter, &PredXact->activeList) + { + SERIALIZABLEXACT *sxact = + dlist_container(SERIALIZABLEXACT, xactLink, iter.cur); + + if (sxact->pid == blocked_pid) + { + blocking_sxact = sxact; + break; + } + } + + /* Did we find it, and is it currently waiting in GetSafeSnapshot? */ + if (blocking_sxact != NULL && SxactIsDeferrableWaiting(blocking_sxact)) + { + /* Traverse the list of possible unsafe conflicts collecting PIDs. */ + dlist_foreach(iter, &blocking_sxact->possibleUnsafeConflicts) + { + RWConflict possibleUnsafeConflict = + dlist_container(RWConflictData, inLink, iter.cur); + + output[num_written++] = possibleUnsafeConflict->sxactOut->pid; + + if (num_written >= output_size) + break; + } + } + + LWLockRelease(SerializableXactHashLock); + + return num_written; +} + +/* + * Acquire a snapshot that can be used for the current transaction. + * + * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact. + * It should be current for this process and be contained in PredXact. + * + * The passed-in Snapshot pointer should reference a static data area that + * can safely be passed to GetSnapshotData. The return value is actually + * always this same pointer; no new snapshot data structure is allocated + * within this function. + */ +Snapshot +GetSerializableTransactionSnapshot(Snapshot snapshot) +{ + Assert(IsolationIsSerializable()); + + /* + * Can't use serializable mode while recovery is still active, as it is, + * for example, on a hot standby. We could get here despite the check in + * check_transaction_isolation() if default_transaction_isolation is set + * to serializable, so phrase the hint accordingly. + */ + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use serializable mode in a hot standby"), + errdetail("\"default_transaction_isolation\" is set to \"serializable\"."), + errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default."))); + + /* + * A special optimization is available for SERIALIZABLE READ ONLY + * DEFERRABLE transactions -- we can wait for a suitable snapshot and + * thereby avoid all SSI overhead once it's running. + */ + if (XactReadOnly && XactDeferrable) + return GetSafeSnapshot(snapshot); + + return GetSerializableTransactionSnapshotInt(snapshot, + NULL, InvalidPid); +} + +/* + * Import a snapshot to be used for the current transaction. + * + * This is nearly the same as GetSerializableTransactionSnapshot, except that + * we don't take a new snapshot, but rather use the data we're handed. + * + * The caller must have verified that the snapshot came from a serializable + * transaction; and if we're read-write, the source transaction must not be + * read-only. + */ +void +SetSerializableTransactionSnapshot(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid) +{ + Assert(IsolationIsSerializable()); + + /* + * If this is called by parallel.c in a parallel worker, we don't want to + * create a SERIALIZABLEXACT just yet because the leader's + * SERIALIZABLEXACT will be installed with AttachSerializableXact(). 
We + * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this + * case, because the leader has already determined that the snapshot it + * has passed us is safe. So there is nothing for us to do. + */ + if (IsParallelWorker()) + return; + + /* + * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to + * import snapshots, since there's no way to wait for a safe snapshot when + * we're using the snap we're told to. (XXX instead of throwing an error, + * we could just ignore the XactDeferrable flag?) + */ + if (XactReadOnly && XactDeferrable) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE"))); + + (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid, + sourcepid); +} + +/* + * Guts of GetSerializableTransactionSnapshot + * + * If sourcevxid is valid, this is actually an import operation and we should + * skip calling GetSnapshotData, because the snapshot contents are already + * loaded up. HOWEVER: to avoid race conditions, we must check that the + * source xact is still running after we acquire SerializableXactHashLock. + * We do that by calling ProcArrayInstallImportedXmin. + */ +static Snapshot +GetSerializableTransactionSnapshotInt(Snapshot snapshot, + VirtualTransactionId *sourcevxid, + int sourcepid) +{ + PGPROC *proc; + VirtualTransactionId vxid; + SERIALIZABLEXACT *sxact, + *othersxact; + + /* We only do this for serializable transactions. Once. */ + Assert(MySerializableXact == InvalidSerializableXact); + + Assert(!RecoveryInProgress()); + + /* + * Since all parts of a serializable transaction must use the same + * snapshot, it is too late to establish one after a parallel operation + * has begun. + */ + if (IsInParallelMode()) + elog(ERROR, "cannot establish serializable snapshot during a parallel operation"); + + proc = MyProc; + Assert(proc != NULL); + GET_VXID_FROM_PGPROC(vxid, *proc); + + /* + * First we get the sxact structure, which may involve looping and access + * to the "finished" list to free a structure for use. + * + * We must hold SerializableXactHashLock when taking/checking the snapshot + * to avoid race conditions, for much the same reasons that + * GetSnapshotData takes the ProcArrayLock. Since we might have to + * release SerializableXactHashLock to call SummarizeOldestCommittedSxact, + * this means we have to create the sxact first, which is a bit annoying + * (in particular, an elog(ERROR) in procarray.c would cause us to leak + * the sxact). Consider refactoring to avoid this. + */ +#ifdef TEST_SUMMARIZE_SERIAL + SummarizeOldestCommittedSxact(); +#endif + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + do + { + sxact = CreatePredXact(); + /* If null, push out committed sxact to SLRU summary & retry. 
*/ + if (!sxact) + { + LWLockRelease(SerializableXactHashLock); + SummarizeOldestCommittedSxact(); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + } + } while (!sxact); + + /* Get the snapshot, or check that it's safe to use */ + if (!sourcevxid) + snapshot = GetSnapshotData(snapshot); + else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid)) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("could not import the requested snapshot"), + errdetail("The source process with PID %d is not running anymore.", + sourcepid))); + } + + /* + * If there are no serializable transactions which are not read-only, we + * can "opt out" of predicate locking and conflict checking for a + * read-only transaction. + * + * The reason this is safe is that a read-only transaction can only become + * part of a dangerous structure if it overlaps a writable transaction + * which in turn overlaps a writable transaction which committed before + * the read-only transaction started. A new writable transaction can + * overlap this one, but it can't meet the other condition of overlapping + * a transaction which committed before this one started. + */ + if (XactReadOnly && PredXact->WritableSxactCount == 0) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + return snapshot; + } + + /* Initialize the structure. */ + sxact->vxid = vxid; + sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo; + sxact->prepareSeqNo = InvalidSerCommitSeqNo; + sxact->commitSeqNo = InvalidSerCommitSeqNo; + dlist_init(&(sxact->outConflicts)); + dlist_init(&(sxact->inConflicts)); + dlist_init(&(sxact->possibleUnsafeConflicts)); + sxact->topXid = GetTopTransactionIdIfAny(); + sxact->finishedBefore = InvalidTransactionId; + sxact->xmin = snapshot->xmin; + sxact->pid = MyProcPid; + sxact->pgprocno = MyProc->pgprocno; + dlist_init(&sxact->predicateLocks); + dlist_node_init(&sxact->finishedLink); + sxact->flags = 0; + if (XactReadOnly) + { + dlist_iter iter; + + sxact->flags |= SXACT_FLAG_READ_ONLY; + + /* + * Register all concurrent r/w transactions as possible conflicts; if + * all of them commit without any outgoing conflicts to earlier + * transactions then this snapshot can be deemed safe (and we can run + * without tracking predicate locks). + */ + dlist_foreach(iter, &PredXact->activeList) + { + othersxact = dlist_container(SERIALIZABLEXACT, xactLink, iter.cur); + + if (!SxactIsCommitted(othersxact) + && !SxactIsDoomed(othersxact) + && !SxactIsReadOnly(othersxact)) + { + SetPossibleUnsafeConflict(sxact, othersxact); + } + } + + /* + * If we didn't find any possibly unsafe conflicts because every + * uncommitted writable transaction turned out to be doomed, then we + * can "opt out" immediately. See comments above the earlier check + * for PredXact->WritableSxactCount == 0. + */ + if (dlist_is_empty(&sxact->possibleUnsafeConflicts)) + { + ReleasePredXact(sxact); + LWLockRelease(SerializableXactHashLock); + return snapshot; + } + } + else + { + ++(PredXact->WritableSxactCount); + Assert(PredXact->WritableSxactCount <= + (MaxBackends + max_prepared_xacts)); + } + + /* Maintain serializable global xmin info. 
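+ * SxactGlobalXmin is the oldest snapshot xmin among active serializable
+ * transactions, and SxactGlobalXminCount is how many active sxacts share
+ * that xmin; SerialSetActiveSerXmin() is notified when the value is first
+ * set or later advances, so the old-xid SLRU can be trimmed.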
*/ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount == 0); + PredXact->SxactGlobalXmin = snapshot->xmin; + PredXact->SxactGlobalXminCount = 1; + SerialSetActiveSerXmin(snapshot->xmin); + } + else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + PredXact->SxactGlobalXminCount++; + } + else + { + Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin)); + } + + MySerializableXact = sxact; + MyXactDidWrite = false; /* haven't written anything yet */ + + LWLockRelease(SerializableXactHashLock); + + CreateLocalPredicateLockHash(); + + return snapshot; +} + +static void +CreateLocalPredicateLockHash(void) +{ + HASHCTL hash_ctl; + + /* Initialize the backend-local hash table of parent locks */ + Assert(LocalPredicateLockHash == NULL); + hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG); + hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK); + LocalPredicateLockHash = hash_create("Local predicate lock", + max_predicate_locks_per_xact, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); +} + +/* + * Register the top level XID in SerializableXidHash. + * Also store it for easy reference in MySerializableXact. + */ +void +RegisterPredicateLockingXid(TransactionId xid) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + bool found; + + /* + * If we're not tracking predicate lock data for this transaction, we + * should ignore the request and return quickly. + */ + if (MySerializableXact == InvalidSerializableXact) + return; + + /* We should have a valid XID and be at the top level. */ + Assert(TransactionIdIsValid(xid)); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* This should only be done once per transaction. */ + Assert(MySerializableXact->topXid == InvalidTransactionId); + + MySerializableXact->topXid = xid; + + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + Assert(!found); + + /* Initialize the structure. */ + sxid->myXact = MySerializableXact; + LWLockRelease(SerializableXactHashLock); +} + + +/* + * Check whether there are any predicate locks held by any transaction + * for the page at the given block number. + * + * Note that the transaction may be completed but not yet subject to + * cleanup due to overlapping serializable transactions. This must + * return valid information regardless of transaction isolation level. + * + * Also note that this doesn't check for a conflicting relation lock, + * just a lock specifically on the given page. + * + * One use is to support proper behavior during GiST index vacuum. + */ +bool +PageIsPredicateLocked(Relation relation, BlockNumber blkno) +{ + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + PREDICATELOCKTARGET *target; + + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_locator.dbOid, + relation->rd_id, + blkno); + + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + &targettag, targettaghash, + HASH_FIND, NULL); + LWLockRelease(partitionLock); + + return (target != NULL); +} + + +/* + * Check whether a particular lock is held by this transaction. 
+ * + * Important note: this function may return false even if the lock is + * being held, because it uses the local lock table which is not + * updated if another transaction modifies our lock list (e.g. to + * split an index page). It can also return true when a coarser + * granularity lock that covers this target is being held. Be careful + * to only use this function in circumstances where such errors are + * acceptable! + */ +static bool +PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag) +{ + LOCALPREDICATELOCK *lock; + + /* check local hash table */ + lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + targettag, + HASH_FIND, NULL); + + if (!lock) + return false; + + /* + * Found entry in the table, but still need to check whether it's actually + * held -- it could just be a parent of some held lock. + */ + return lock->held; +} + +/* + * Return the parent lock tag in the lock hierarchy: the next coarser + * lock that covers the provided tag. + * + * Returns true and sets *parent to the parent tag if one exists, + * returns false if none exists. + */ +static bool +GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag, + PREDICATELOCKTARGETTAG *parent) +{ + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + /* relation locks have no parent lock */ + return false; + + case PREDLOCKTAG_PAGE: + /* parent lock is relation lock */ + SET_PREDICATELOCKTARGETTAG_RELATION(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag)); + + return true; + + case PREDLOCKTAG_TUPLE: + /* parent lock is page lock */ + SET_PREDICATELOCKTARGETTAG_PAGE(*parent, + GET_PREDICATELOCKTARGETTAG_DB(*tag), + GET_PREDICATELOCKTARGETTAG_RELATION(*tag), + GET_PREDICATELOCKTARGETTAG_PAGE(*tag)); + return true; + } + + /* not reachable */ + Assert(false); + return false; +} + +/* + * Check whether the lock we are considering is already covered by a + * coarser lock for our transaction. + * + * Like PredicateLockExists, this function might return a false + * negative, but it will never return a false positive. + */ +static bool +CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag) +{ + PREDICATELOCKTARGETTAG targettag, + parenttag; + + targettag = *newtargettag; + + /* check parents iteratively until no more */ + while (GetParentPredicateLockTag(&targettag, &parenttag)) + { + targettag = parenttag; + if (PredicateLockExists(&targettag)) + return true; + } + + /* no more parents to check; lock is not covered */ + return false; +} + +/* + * Remove the dummy entry from the predicate lock target hash, to free up some + * scratch space. The caller must be holding SerializablePredicateListLock, + * and must restore the entry with RestoreScratchTarget() before releasing the + * lock. + * + * If lockheld is true, the caller is already holding the partition lock + * of the partition containing the scratch entry. + */ +static void +RemoveScratchTarget(bool lockheld) +{ + bool found; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + if (!lockheld) + LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE); + hash_search_with_hash_value(PredicateLockTargetHash, + &ScratchTargetTag, + ScratchTargetTagHash, + HASH_REMOVE, &found); + Assert(found); + if (!lockheld) + LWLockRelease(ScratchPartitionLock); +} + +/* + * Re-insert the dummy entry in predicate lock target hash. 
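+ * Together with RemoveScratchTarget(), this guarantees one hash-table slot
+ * is always available when a target has to be re-created during a lock
+ * transfer (see ScratchTargetTag above), so the HASH_ENTER here is expected
+ * to succeed.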
+ */ +static void +RestoreScratchTarget(bool lockheld) +{ + bool found; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + if (!lockheld) + LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE); + hash_search_with_hash_value(PredicateLockTargetHash, + &ScratchTargetTag, + ScratchTargetTagHash, + HASH_ENTER, &found); + Assert(!found); + if (!lockheld) + LWLockRelease(ScratchPartitionLock); +} + +/* + * Check whether the list of related predicate locks is empty for a + * predicate lock target, and remove the target if it is. + */ +static void +RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash) +{ + PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY; + + Assert(LWLockHeldByMe(SerializablePredicateListLock)); + + /* Can't remove it until no locks at this target. */ + if (!dlist_is_empty(&target->predicateLocks)) + return; + + /* Actually remove the target. */ + rmtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &target->tag, + targettaghash, + HASH_REMOVE, NULL); + Assert(rmtarget == target); +} + +/* + * Delete child target locks owned by this process. + * This implementation is assuming that the usage of each target tag field + * is uniform. No need to make this hard if we don't have to. + * + * We acquire an LWLock in the case of parallel mode, because worker + * backends have access to the leader's SERIALIZABLEXACT. Otherwise, + * we aren't acquiring LWLocks for the predicate lock or lock + * target structures associated with this transaction unless we're going + * to modify them, because no other process is permitted to modify our + * locks. + */ +static void +DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag) +{ + SERIALIZABLEXACT *sxact; + PREDICATELOCK *predlock; + dlist_mutable_iter iter; + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + sxact = MySerializableXact; + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + + dlist_foreach_modify(iter, &sxact->predicateLocks) + { + PREDICATELOCKTAG oldlocktag; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGETTAG oldtargettag; + + predlock = dlist_container(PREDICATELOCK, xactLink, iter.cur); + + oldlocktag = predlock->tag; + Assert(oldlocktag.myXact == sxact); + oldtarget = oldlocktag.myTarget; + oldtargettag = oldtarget->tag; + + if (TargetTagIsCoveredBy(oldtargettag, *newtargettag)) + { + uint32 oldtargettaghash; + LWLock *partitionLock; + PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY; + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + partitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + dlist_delete(&predlock->xactLink); + dlist_delete(&predlock->targetLink); + rmpredlock = hash_search_with_hash_value + (PredicateLockHash, + &oldlocktag, + PredicateLockHashCodeFromTargetHashCode(&oldlocktag, + oldtargettaghash), + HASH_REMOVE, NULL); + Assert(rmpredlock == predlock); + + RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash); + + LWLockRelease(partitionLock); + + DecrementParentLocks(&oldtargettag); + } + } + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * Returns the promotion limit for a given predicate lock target. This is the + * max number of descendant locks allowed before promoting to the specified + * tag. Note that the limit includes non-direct descendants (e.g., both tuples + * and pages for a relation lock). 
+ * + * Currently the default limit is 2 for a page lock, and half of the value of + * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior + * of earlier releases when upgrading. + * + * TODO SSI: We should probably add additional GUCs to allow a maximum ratio + * of page and tuple locks based on the pages in a relation, and the maximum + * ratio of tuple locks to tuples in a page. This would provide more + * generally "balanced" allocation of locks to where they are most useful, + * while still allowing the absolute numbers to prevent one relation from + * tying up all predicate lock resources. + */ +static int +MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag) +{ + switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag)) + { + case PREDLOCKTAG_RELATION: + return max_predicate_locks_per_relation < 0 + ? (max_predicate_locks_per_xact + / (-max_predicate_locks_per_relation)) - 1 + : max_predicate_locks_per_relation; + + case PREDLOCKTAG_PAGE: + return max_predicate_locks_per_page; + + case PREDLOCKTAG_TUPLE: + + /* + * not reachable: nothing is finer-granularity than a tuple, so we + * should never try to promote to it. + */ + Assert(false); + return 0; + } + + /* not reachable */ + Assert(false); + return 0; +} + +/* + * For all ancestors of a newly-acquired predicate lock, increment + * their child count in the parent hash table. If any of them have + * more descendants than their promotion threshold, acquire the + * coarsest such lock. + * + * Returns true if a parent lock was acquired and false otherwise. + */ +static bool +CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag) +{ + PREDICATELOCKTARGETTAG targettag, + nexttag, + promotiontag; + LOCALPREDICATELOCK *parentlock; + bool found, + promote; + + promote = false; + + targettag = *reqtag; + + /* check parents iteratively */ + while (GetParentPredicateLockTag(&targettag, &nexttag)) + { + targettag = nexttag; + parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash, + &targettag, + HASH_ENTER, + &found); + if (!found) + { + parentlock->held = false; + parentlock->childLocks = 1; + } + else + parentlock->childLocks++; + + if (parentlock->childLocks > + MaxPredicateChildLocks(&targettag)) + { + /* + * We should promote to this parent lock. Continue to check its + * ancestors, however, both to get their child counts right and to + * check whether we should just go ahead and promote to one of + * them. + */ + promotiontag = targettag; + promote = true; + } + } + + if (promote) + { + /* acquire coarsest ancestor eligible for promotion */ + PredicateLockAcquire(&promotiontag); + return true; + } + else + return false; +} + +/* + * When releasing a lock, decrement the child count on all ancestor + * locks. + * + * This is called only when releasing a lock via + * DeleteChildTargetLocks (i.e. when a lock becomes redundant because + * we've acquired its parent, possibly due to promotion) or when a new + * MVCC write lock makes the predicate lock unnecessary. There's no + * point in calling it when locks are released at transaction end, as + * this information is no longer needed. 
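+ *
+ * As an illustrative trace: when a tuple lock becomes redundant because its
+ * page lock was just acquired, this is called with the tuple's tag; it
+ * decrements childLocks on the page tag and then on the relation tag, and
+ * any ancestor entry whose count drops to zero and which is not itself held
+ * is removed from LocalPredicateLockHash.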
+ */ +static void +DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag) +{ + PREDICATELOCKTARGETTAG parenttag, + nexttag; + + parenttag = *targettag; + + while (GetParentPredicateLockTag(&parenttag, &nexttag)) + { + uint32 targettaghash; + LOCALPREDICATELOCK *parentlock, + *rmlock PG_USED_FOR_ASSERTS_ONLY; + + parenttag = nexttag; + targettaghash = PredicateLockTargetTagHashCode(&parenttag); + parentlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_FIND, NULL); + + /* + * There's a small chance the parent lock doesn't exist in the lock + * table. This can happen if we prematurely removed it because an + * index split caused the child refcount to be off. + */ + if (parentlock == NULL) + continue; + + parentlock->childLocks--; + + /* + * Under similar circumstances the parent lock's refcount might be + * zero. This only happens if we're holding that lock (otherwise we + * would have removed the entry). + */ + if (parentlock->childLocks < 0) + { + Assert(parentlock->held); + parentlock->childLocks = 0; + } + + if ((parentlock->childLocks == 0) && (!parentlock->held)) + { + rmlock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + &parenttag, targettaghash, + HASH_REMOVE, NULL); + Assert(rmlock == parentlock); + } + } +} + +/* + * Indicate that a predicate lock on the given target is held by the + * specified transaction. Has no effect if the lock is already held. + * + * This updates the lock table and the sxact's lock list, and creates + * the lock target if necessary, but does *not* do anything related to + * granularity promotion or the local lock table. See + * PredicateLockAcquire for that. + */ +static void +CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag, + uint32 targettaghash, + SERIALIZABLEXACT *sxact) +{ + PREDICATELOCKTARGET *target; + PREDICATELOCKTAG locktag; + PREDICATELOCK *lock; + LWLock *partitionLock; + bool found; + + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* Make sure that the target is represented. */ + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_ENTER_NULL, &found); + if (!target) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_pred_locks_per_transaction"))); + if (!found) + dlist_init(&target->predicateLocks); + + /* We've got the sxact and target, make sure they're joined. 
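+ * The join is a PREDICATELOCK entry in PredicateLockHash, keyed by the
+ * {target, sxact} pair and linked into both the target's and the sxact's
+ * predicateLocks lists so it can be reached from either side.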
*/ + locktag.myTarget = target; + locktag.myXact = sxact; + lock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, &locktag, + PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash), + HASH_ENTER_NULL, &found); + if (!lock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_pred_locks_per_transaction"))); + + if (!found) + { + dlist_push_tail(&target->predicateLocks, &lock->targetLink); + dlist_push_tail(&sxact->predicateLocks, &lock->xactLink); + lock->commitSeqNo = InvalidSerCommitSeqNo; + } + + LWLockRelease(partitionLock); + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * Acquire a predicate lock on the specified target for the current + * connection if not already held. This updates the local lock table + * and uses it to implement granularity promotion. It will consolidate + * multiple locks into a coarser lock if warranted, and will release + * any finer-grained locks covered by the new one. + */ +static void +PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag) +{ + uint32 targettaghash; + bool found; + LOCALPREDICATELOCK *locallock; + + /* Do we have the lock already, or a covering lock? */ + if (PredicateLockExists(targettag)) + return; + + if (CoarserLockCovers(targettag)) + return; + + /* the same hash and LW lock apply to the lock target and the local lock. */ + targettaghash = PredicateLockTargetTagHashCode(targettag); + + /* Acquire lock in local table */ + locallock = (LOCALPREDICATELOCK *) + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_ENTER, &found); + locallock->held = true; + if (!found) + locallock->childLocks = 0; + + /* Actually create the lock */ + CreatePredicateLock(targettag, targettaghash, MySerializableXact); + + /* + * Lock has been acquired. Check whether it should be promoted to a + * coarser granularity, or whether there are finer-granularity locks to + * clean up. + */ + if (CheckAndPromotePredicateLockRequest(targettag)) + { + /* + * Lock request was promoted to a coarser-granularity lock, and that + * lock was acquired. It will delete this lock and any of its + * children, so we're done. + */ + } + else + { + /* Clean up any finer-granularity locks */ + if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE) + DeleteChildTargetLocks(targettag); + } +} + + +/* + * PredicateLockRelation + * + * Gets a predicate lock at the relation level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Clear any finer-grained predicate locks this session has on the relation. + */ +void +PredicateLockRelation(Relation relation, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_locator.dbOid, + relation->rd_id); + PredicateLockAcquire(&tag); +} + +/* + * PredicateLockPage + * + * Gets a predicate lock at the page level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + * Skip if a coarser predicate lock already covers this page. + * Clear any finer-grained predicate locks this session has on the relation. 
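+ *
+ * An illustrative caller (an approximation of real call sites, not a quote)
+ * is an index scan that has just read a leaf page:
+ *
+ *		PredicateLockPage(rel, BufferGetBlockNumber(buf), scan->xs_snapshot);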
+ */ +void +PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + SET_PREDICATELOCKTARGETTAG_PAGE(tag, + relation->rd_locator.dbOid, + relation->rd_id, + blkno); + PredicateLockAcquire(&tag); +} + +/* + * PredicateLockTID + * + * Gets a predicate lock at the tuple level. + * Skip if not in full serializable transaction isolation level. + * Skip if this is a temporary table. + */ +void +PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot, + TransactionId tuple_xid) +{ + PREDICATELOCKTARGETTAG tag; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + /* + * Return if this xact wrote it. + */ + if (relation->rd_index == NULL) + { + /* If we wrote it; we already have a write lock. */ + if (TransactionIdIsCurrentTransactionId(tuple_xid)) + return; + } + + /* + * Do quick-but-not-definitive test for a relation lock first. This will + * never cause a return when the relation is *not* locked, but will + * occasionally let the check continue when there really *is* a relation + * level lock. + */ + SET_PREDICATELOCKTARGETTAG_RELATION(tag, + relation->rd_locator.dbOid, + relation->rd_id); + if (PredicateLockExists(&tag)) + return; + + SET_PREDICATELOCKTARGETTAG_TUPLE(tag, + relation->rd_locator.dbOid, + relation->rd_id, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + PredicateLockAcquire(&tag); +} + + +/* + * DeleteLockTarget + * + * Remove a predicate lock target along with any locks held for it. + * + * Caller must hold SerializablePredicateListLock and the + * appropriate hash partition lock for the target. + */ +static void +DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash) +{ + dlist_mutable_iter iter; + + Assert(LWLockHeldByMeInMode(SerializablePredicateListLock, + LW_EXCLUSIVE)); + Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash))); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + dlist_foreach_modify(iter, &target->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, targetLink, iter.cur); + bool found; + + dlist_delete(&(predlock->xactLink)); + dlist_delete(&(predlock->targetLink)); + + hash_search_with_hash_value + (PredicateLockHash, + &predlock->tag, + PredicateLockHashCodeFromTargetHashCode(&predlock->tag, + targettaghash), + HASH_REMOVE, &found); + Assert(found); + } + LWLockRelease(SerializableXactHashLock); + + /* Remove the target itself, if possible. */ + RemoveTargetIfNoLongerUsed(target, targettaghash); +} + + +/* + * TransferPredicateLocksToNewTarget + * + * Move or copy all the predicate locks for a lock target, for use by + * index page splits/combines and other things that create or replace + * lock targets. If 'removeOld' is true, the old locks and the target + * will be removed. + * + * Returns true on success, or false if we ran out of shared memory to + * allocate the new target or locks. Guaranteed to always succeed if + * removeOld is set (by using the scratch entry in PredicateLockTargetHash + * for scratch space). + * + * Warning: the "removeOld" option should be used only with care, + * because this function does not (indeed, can not) update other + * backends' LocalPredicateLockHash. If we are only adding new + * entries, this is not a problem: the local lock table is used only + * as a hint, so missing entries for locks that are held are + * OK. 
Having entries for locks that are no longer held, as can happen + * when using "removeOld", is not in general OK. We can only use it + * safely when replacing a lock with a coarser-granularity lock that + * covers it, or if we are absolutely certain that no one will need to + * refer to that lock in the future. + * + * Caller must hold SerializablePredicateListLock exclusively. + */ +static bool +TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag, + PREDICATELOCKTARGETTAG newtargettag, + bool removeOld) +{ + uint32 oldtargettaghash; + LWLock *oldpartitionLock; + PREDICATELOCKTARGET *oldtarget; + uint32 newtargettaghash; + LWLock *newpartitionLock; + bool found; + bool outOfShmem = false; + + Assert(LWLockHeldByMeInMode(SerializablePredicateListLock, + LW_EXCLUSIVE)); + + oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag); + newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag); + oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash); + newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash); + + if (removeOld) + { + /* + * Remove the dummy entry to give us scratch space, so we know we'll + * be able to create the new lock target. + */ + RemoveScratchTarget(false); + } + + /* + * We must get the partition locks in ascending sequence to avoid + * deadlocks. If old and new partitions are the same, we must request the + * lock only once. + */ + if (oldpartitionLock < newpartitionLock) + { + LWLockAcquire(oldpartitionLock, + (removeOld ? LW_EXCLUSIVE : LW_SHARED)); + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + LWLockAcquire(oldpartitionLock, + (removeOld ? LW_EXCLUSIVE : LW_SHARED)); + } + else + LWLockAcquire(newpartitionLock, LW_EXCLUSIVE); + + /* + * Look for the old target. If not found, that's OK; no predicate locks + * are affected, so we can just clean up and return. If it does exist, + * walk its list of predicate locks and move or copy them to the new + * target. + */ + oldtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &oldtargettag, + oldtargettaghash, + HASH_FIND, NULL); + + if (oldtarget) + { + PREDICATELOCKTARGET *newtarget; + PREDICATELOCKTAG newpredlocktag; + dlist_mutable_iter iter; + + newtarget = hash_search_with_hash_value(PredicateLockTargetHash, + &newtargettag, + newtargettaghash, + HASH_ENTER_NULL, &found); + + if (!newtarget) + { + /* Failed to allocate due to insufficient shmem */ + outOfShmem = true; + goto exit; + } + + /* If we created a new entry, initialize it */ + if (!found) + dlist_init(&newtarget->predicateLocks); + + newpredlocktag.myTarget = newtarget; + + /* + * Loop through all the locks on the old target, replacing them with + * locks on the new target. 
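+ * If the owning transaction already holds a lock on the new target, the two
+ * collapse into a single entry that keeps the larger commitSeqNo; with
+ * removeOld set the old lock is deleted as we go, otherwise it is left in
+ * place so the locks are copied rather than moved.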
+ */ + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + dlist_foreach_modify(iter, &oldtarget->predicateLocks) + { + PREDICATELOCK *oldpredlock = + dlist_container(PREDICATELOCK, targetLink, iter.cur); + PREDICATELOCK *newpredlock; + SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo; + + newpredlocktag.myXact = oldpredlock->tag.myXact; + + if (removeOld) + { + dlist_delete(&(oldpredlock->xactLink)); + dlist_delete(&(oldpredlock->targetLink)); + + hash_search_with_hash_value + (PredicateLockHash, + &oldpredlock->tag, + PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag, + oldtargettaghash), + HASH_REMOVE, &found); + Assert(found); + } + + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + newtargettaghash), + HASH_ENTER_NULL, + &found); + if (!newpredlock) + { + /* Out of shared memory. Undo what we've done so far. */ + LWLockRelease(SerializableXactHashLock); + DeleteLockTarget(newtarget, newtargettaghash); + outOfShmem = true; + goto exit; + } + if (!found) + { + dlist_push_tail(&(newtarget->predicateLocks), + &(newpredlock->targetLink)); + dlist_push_tail(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + newpredlock->commitSeqNo = oldCommitSeqNo; + } + else + { + if (newpredlock->commitSeqNo < oldCommitSeqNo) + newpredlock->commitSeqNo = oldCommitSeqNo; + } + + Assert(newpredlock->commitSeqNo != 0); + Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo) + || (newpredlock->tag.myXact == OldCommittedSxact)); + } + LWLockRelease(SerializableXactHashLock); + + if (removeOld) + { + Assert(dlist_is_empty(&oldtarget->predicateLocks)); + RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash); + } + } + + +exit: + /* Release partition locks in reverse order of acquisition. */ + if (oldpartitionLock < newpartitionLock) + { + LWLockRelease(newpartitionLock); + LWLockRelease(oldpartitionLock); + } + else if (oldpartitionLock > newpartitionLock) + { + LWLockRelease(oldpartitionLock); + LWLockRelease(newpartitionLock); + } + else + LWLockRelease(newpartitionLock); + + if (removeOld) + { + /* We shouldn't run out of memory if we're moving locks */ + Assert(!outOfShmem); + + /* Put the scratch entry back */ + RestoreScratchTarget(false); + } + + return !outOfShmem; +} + +/* + * Drop all predicate locks of any granularity from the specified relation, + * which can be a heap relation or an index relation. If 'transfer' is true, + * acquire a relation lock on the heap for any transactions with any lock(s) + * on the specified relation. + * + * This requires grabbing a lot of LW locks and scanning the entire lock + * target table for matches. That makes this more expensive than most + * predicate lock management functions, but it will only be called for DDL + * type commands that are expensive anyway, and there are fast returns when + * no serializable transactions are active or the relation is temporary. + * + * We don't use the TransferPredicateLocksToNewTarget function because it + * acquires its own locks on the partitions of the two targets involved, + * and we'll already be holding all partition locks. + * + * We can't throw an error from here, because the call could be from a + * transaction which is not serializable. + * + * NOTE: This is currently only called with transfer set to true, but that may + * change. 
If we decide to clean up the locks from a table on commit of a + * transaction which executed DROP TABLE, the false condition will be useful. + */ +static void +DropAllPredicateLocksFromTable(Relation relation, bool transfer) +{ + HASH_SEQ_STATUS seqstat; + PREDICATELOCKTARGET *oldtarget; + PREDICATELOCKTARGET *heaptarget; + Oid dbId; + Oid relId; + Oid heapId; + int i; + bool isIndex; + bool found; + uint32 heaptargettaghash; + + /* + * Bail out quickly if there are no serializable transactions running. + * It's safe to check this without taking locks because the caller is + * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which + * would matter here can be acquired while that is held. + */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!PredicateLockingNeededForRelation(relation)) + return; + + dbId = relation->rd_locator.dbOid; + relId = relation->rd_id; + if (relation->rd_index == NULL) + { + isIndex = false; + heapId = relId; + } + else + { + isIndex = true; + heapId = relation->rd_index->indrelid; + } + Assert(heapId != InvalidOid); + Assert(transfer || !isIndex); /* index OID only makes sense with + * transfer */ + + /* Retrieve first time needed, then keep. */ + heaptargettaghash = 0; + heaptarget = NULL; + + /* Acquire locks on all lock partitions */ + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the dummy entry to give us scratch space, so we know we'll be + * able to create the new lock target. + */ + if (transfer) + RemoveScratchTarget(true); + + /* Scan through target map */ + hash_seq_init(&seqstat, PredicateLockTargetHash); + + while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat))) + { + dlist_mutable_iter iter; + + /* + * Check whether this is a target which needs attention. + */ + if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId) + continue; /* wrong relation id */ + if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId) + continue; /* wrong database id */ + if (transfer && !isIndex + && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION) + continue; /* already the right lock */ + + /* + * If we made it here, we have work to do. We make sure the heap + * relation lock exists, then we walk the list of predicate locks for + * the old target we found, moving all locks to the heap relation lock + * -- unless they already hold that. + */ + + /* + * First make sure we have the heap relation target. We only need to + * do this once. + */ + if (transfer && heaptarget == NULL) + { + PREDICATELOCKTARGETTAG heaptargettag; + + SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId); + heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag); + heaptarget = hash_search_with_hash_value(PredicateLockTargetHash, + &heaptargettag, + heaptargettaghash, + HASH_ENTER, &found); + if (!found) + dlist_init(&heaptarget->predicateLocks); + } + + /* + * Loop through all the locks on the old target, replacing them with + * locks on the new target. + */ + dlist_foreach_modify(iter, &oldtarget->predicateLocks) + { + PREDICATELOCK *oldpredlock = + dlist_container(PREDICATELOCK, targetLink, iter.cur); + PREDICATELOCK *newpredlock; + SerCommitSeqNo oldCommitSeqNo; + SERIALIZABLEXACT *oldXact; + + /* + * Remove the old lock first. 
This avoids the chance of running + * out of lock structure entries for the hash table. + */ + oldCommitSeqNo = oldpredlock->commitSeqNo; + oldXact = oldpredlock->tag.myXact; + + dlist_delete(&(oldpredlock->xactLink)); + + /* + * No need for retail delete from oldtarget list, we're removing + * the whole target anyway. + */ + hash_search(PredicateLockHash, + &oldpredlock->tag, + HASH_REMOVE, &found); + Assert(found); + + if (transfer) + { + PREDICATELOCKTAG newpredlocktag; + + newpredlocktag.myTarget = heaptarget; + newpredlocktag.myXact = oldXact; + newpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &newpredlocktag, + PredicateLockHashCodeFromTargetHashCode(&newpredlocktag, + heaptargettaghash), + HASH_ENTER, + &found); + if (!found) + { + dlist_push_tail(&(heaptarget->predicateLocks), + &(newpredlock->targetLink)); + dlist_push_tail(&(newpredlocktag.myXact->predicateLocks), + &(newpredlock->xactLink)); + newpredlock->commitSeqNo = oldCommitSeqNo; + } + else + { + if (newpredlock->commitSeqNo < oldCommitSeqNo) + newpredlock->commitSeqNo = oldCommitSeqNo; + } + + Assert(newpredlock->commitSeqNo != 0); + Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo) + || (newpredlock->tag.myXact == OldCommittedSxact)); + } + } + + hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE, + &found); + Assert(found); + } + + /* Put the scratch entry back */ + if (transfer) + RestoreScratchTarget(true); + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + LWLockRelease(SerializablePredicateListLock); +} + +/* + * TransferPredicateLocksToHeapRelation + * For all transactions, transfer all predicate locks for the given + * relation to a single relation lock on the heap. + */ +void +TransferPredicateLocksToHeapRelation(Relation relation) +{ + DropAllPredicateLocksFromTable(relation, true); +} + + +/* + * PredicateLockPageSplit + * + * Copies any predicate locks for the old page to the new page. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page split (or overflow) affects all serializable transactions, + * even if it occurs in the context of another transaction isolation level. + * + * NOTE: This currently leaves the local copy of the locks without + * information on the new lock which is in shared memory. This could cause + * problems if enough page splits occur on locked pages without the processes + * which hold the locks getting in and noticing. + */ +void +PredicateLockPageSplit(Relation relation, BlockNumber oldblkno, + BlockNumber newblkno) +{ + PREDICATELOCKTARGETTAG oldtargettag; + PREDICATELOCKTARGETTAG newtargettag; + bool success; + + /* + * Bail out quickly if there are no serializable transactions running. + * + * It's safe to do this check without taking any additional locks. Even if + * a serializable transaction starts concurrently, we know it can't take + * any SIREAD locks on the page being split because the caller is holding + * the associated buffer page lock. Memory reordering isn't an issue; the + * memory barrier in the LWLock acquisition guarantees that this read + * occurs while the buffer page lock is held. 
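+ *
+ * For reference, an illustrative caller (approximating, not quoting, an
+ * index AM's split code) still holds write locks on both buffers:
+ *
+ *		PredicateLockPageSplit(rel, BufferGetBlockNumber(origbuf),
+ *							   BufferGetBlockNumber(rightbuf));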
+ */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!PredicateLockingNeededForRelation(relation)) + return; + + Assert(oldblkno != newblkno); + Assert(BlockNumberIsValid(oldblkno)); + Assert(BlockNumberIsValid(newblkno)); + + SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag, + relation->rd_locator.dbOid, + relation->rd_id, + oldblkno); + SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag, + relation->rd_locator.dbOid, + relation->rd_id, + newblkno); + + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + + /* + * Try copying the locks over to the new page's tag, creating it if + * necessary. + */ + success = TransferPredicateLocksToNewTarget(oldtargettag, + newtargettag, + false); + + if (!success) + { + /* + * No more predicate lock entries are available. Failure isn't an + * option here, so promote the page lock to a relation lock. + */ + + /* Get the parent relation lock's lock tag */ + success = GetParentPredicateLockTag(&oldtargettag, + &newtargettag); + Assert(success); + + /* + * Move the locks to the parent. This shouldn't fail. + * + * Note that here we are removing locks held by other backends, + * leading to a possible inconsistency in their local lock hash table. + * This is OK because we're replacing it with a lock that covers the + * old one. + */ + success = TransferPredicateLocksToNewTarget(oldtargettag, + newtargettag, + true); + Assert(success); + } + + LWLockRelease(SerializablePredicateListLock); +} + +/* + * PredicateLockPageCombine + * + * Combines predicate locks for two existing pages. + * Skip if this is a temporary table or toast table. + * + * NOTE: A page combine affects all serializable transactions, even if it + * occurs in the context of another transaction isolation level. + */ +void +PredicateLockPageCombine(Relation relation, BlockNumber oldblkno, + BlockNumber newblkno) +{ + /* + * Page combines differ from page splits in that we ought to be able to + * remove the locks on the old page after transferring them to the new + * page, instead of duplicating them. However, because we can't edit other + * backends' local lock tables, removing the old lock would leave them + * with an entry in their LocalPredicateLockHash for a lock they're not + * holding, which isn't acceptable. So we wind up having to do the same + * work as a page split, acquiring a lock on the new page and keeping the + * old page locked too. That can lead to some false positives, but should + * be rare in practice. + */ + PredicateLockPageSplit(relation, oldblkno, newblkno); +} + +/* + * Walk the list of in-progress serializable transactions and find the new + * xmin. 
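+ *
+ * SxactGlobalXmin is the oldest xmin among active serializable transactions
+ * and SxactGlobalXminCount is how many of them share that xmin; both are
+ * recomputed from scratch here, and the new value is also reported to
+ * SerialSetActiveSerXmin() for the SLRU of summarized old transactions.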
+ */
+static void
+SetNewSxactGlobalXmin(void)
+{
+ dlist_iter iter;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+
+ dlist_foreach(iter, &PredXact->activeList)
+ {
+ SERIALIZABLEXACT *sxact =
+ dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+ if (!SxactIsRolledBack(sxact)
+ && !SxactIsCommitted(sxact)
+ && sxact != OldCommittedSxact)
+ {
+ Assert(sxact->xmin != InvalidTransactionId);
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedes(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ }
+ else if (TransactionIdEquals(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ PredXact->SxactGlobalXminCount++;
+ }
+ }
+
+ SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
+}
+
+/*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current transaction,
+ * whether committed or rolled back. It can also be called for a read only
+ * transaction when it becomes impossible for the transaction to become
+ * part of a dangerous structure.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transactions still
+ * holding locks.
+ *
+ * If isReadOnlySafe is true, then predicate locks are being released before
+ * the end of the transaction because MySerializableXact has been determined
+ * to be RO_SAFE. In non-parallel mode we can release it completely, but
+ * in parallel mode we partially release the SERIALIZABLEXACT and keep it
+ * around until the end of the transaction, allowing each backend to clear its
+ * MySerializableXact variable and benefit from the optimization in its own
+ * time.
+ */
+void
+ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+{
+ bool partiallyReleasing = false;
+ bool needToClear;
+ SERIALIZABLEXACT *roXact;
+ dlist_mutable_iter iter;
+
+ /*
+ * We can't trust XactReadOnly here, because a transaction which started
+ * as READ WRITE can show as READ ONLY later, e.g., within
+ * subtransactions. We want to flag a transaction as READ ONLY if it
+ * commits without writing so that de facto READ ONLY transactions get the
+ * benefit of some RO optimizations, so we will use this local variable to
+ * get some cleanup logic right which is based on whether the transaction
+ * was declared READ ONLY at the top level.
+ */
+ bool topLevelIsDeclaredReadOnly;
+
+ /* We can't be both committing and releasing early due to RO_SAFE. */
+ Assert(!(isCommit && isReadOnlySafe));
+
+ /* Are we at the end of a transaction, that is, a commit or abort? */
+ if (!isReadOnlySafe)
+ {
+ /*
+ * Parallel workers mustn't release predicate locks at the end of
+ * their transaction. The leader will do that at the end of its
+ * transaction.
+ */
+ if (IsParallelWorker())
+ {
+ ReleasePredicateLocksLocal();
+ return;
+ }
+
+ /*
+ * By the time the leader in a parallel query reaches end of
+ * transaction, it has waited for all workers to exit.
+ */ + Assert(!ParallelContextActive()); + + /* + * If the leader in a parallel query earlier stashed a partially + * released SERIALIZABLEXACT for final clean-up at end of transaction + * (because workers might still have been accessing it), then it's + * time to restore it. + */ + if (SavedSerializableXact != InvalidSerializableXact) + { + Assert(MySerializableXact == InvalidSerializableXact); + MySerializableXact = SavedSerializableXact; + SavedSerializableXact = InvalidSerializableXact; + Assert(SxactIsPartiallyReleased(MySerializableXact)); + } + } + + if (MySerializableXact == InvalidSerializableXact) + { + Assert(LocalPredicateLockHash == NULL); + return; + } + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * If the transaction is committing, but it has been partially released + * already, then treat this as a roll back. It was marked as rolled back. + */ + if (isCommit && SxactIsPartiallyReleased(MySerializableXact)) + isCommit = false; + + /* + * If we're called in the middle of a transaction because we discovered + * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release + * it (that is, release the predicate locks and conflicts, but not the + * SERIALIZABLEXACT itself) if we're the first backend to have noticed. + */ + if (isReadOnlySafe && IsInParallelMode()) + { + /* + * The leader needs to stash a pointer to it, so that it can + * completely release it at end-of-transaction. + */ + if (!IsParallelWorker()) + SavedSerializableXact = MySerializableXact; + + /* + * The first backend to reach this condition will partially release + * the SERIALIZABLEXACT. All others will just clear their + * backend-local state so that they stop doing SSI checks for the rest + * of the transaction. + */ + if (SxactIsPartiallyReleased(MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + ReleasePredicateLocksLocal(); + return; + } + else + { + MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED; + partiallyReleasing = true; + /* ... and proceed to perform the partial release below. */ + } + } + Assert(!isCommit || SxactIsPrepared(MySerializableXact)); + Assert(!isCommit || !SxactIsDoomed(MySerializableXact)); + Assert(!SxactIsCommitted(MySerializableXact)); + Assert(SxactIsPartiallyReleased(MySerializableXact) + || !SxactIsRolledBack(MySerializableXact)); + + /* may not be serializable during COMMIT/ROLLBACK PREPARED */ + Assert(MySerializableXact->pid == 0 || IsolationIsSerializable()); + + /* We'd better not already be on the cleanup list. */ + Assert(!SxactIsOnFinishedList(MySerializableXact)); + + topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact); + + /* + * We don't hold XidGenLock lock here, assuming that TransactionId is + * atomic! + * + * If this value is changing, we don't care that much whether we get the + * old or new value -- it is just used to determine how far + * SxactGlobalXmin must advance before this transaction can be fully + * cleaned up. The worst that could happen is we wait for one more + * transaction to complete before freeing some RAM; correctness of visible + * behavior is not affected. + */ + MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If it's not a commit it's either a rollback or a read-only transaction + * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately. 
+ */ + if (isCommit) + { + MySerializableXact->flags |= SXACT_FLAG_COMMITTED; + MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo); + /* Recognize implicit read-only transaction (commit without write). */ + if (!MyXactDidWrite) + MySerializableXact->flags |= SXACT_FLAG_READ_ONLY; + } + else + { + /* + * The DOOMED flag indicates that we intend to roll back this + * transaction and so it should not cause serialization failures for + * other transactions that conflict with it. Note that this flag might + * already be set, if another backend marked this transaction for + * abort. + * + * The ROLLED_BACK flag further indicates that ReleasePredicateLocks + * has been called, and so the SerializableXact is eligible for + * cleanup. This means it should not be considered when calculating + * SxactGlobalXmin. + */ + MySerializableXact->flags |= SXACT_FLAG_DOOMED; + MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK; + + /* + * If the transaction was previously prepared, but is now failing due + * to a ROLLBACK PREPARED or (hopefully very rare) error after the + * prepare, clear the prepared flag. This simplifies conflict + * checking. + */ + MySerializableXact->flags &= ~SXACT_FLAG_PREPARED; + } + + if (!topLevelIsDeclaredReadOnly) + { + Assert(PredXact->WritableSxactCount > 0); + if (--(PredXact->WritableSxactCount) == 0) + { + /* + * Release predicate locks and rw-conflicts in for all committed + * transactions. There are no longer any transactions which might + * conflict with the locks and no chance for new transactions to + * overlap. Similarly, existing conflicts in can't cause pivots, + * and any conflicts in which could have completed a dangerous + * structure would already have caused a rollback, so any + * remaining ones must be benign. + */ + PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo; + } + } + else + { + /* + * Read-only transactions: clear the list of transactions that might + * make us unsafe. Note that we use 'inLink' for the iteration as + * opposed to 'outLink' for the r/w xacts. + */ + dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts) + { + RWConflict possibleUnsafeConflict = + dlist_container(RWConflictData, inLink, iter.cur); + + Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut)); + Assert(MySerializableXact == possibleUnsafeConflict->sxactIn); + + ReleaseRWConflict(possibleUnsafeConflict); + } + } + + /* Check for conflict out to old committed transactions. */ + if (isCommit + && !SxactIsReadOnly(MySerializableXact) + && SxactHasSummaryConflictOut(MySerializableXact)) + { + /* + * we don't know which old committed transaction we conflicted with, + * so be conservative and use FirstNormalSerCommitSeqNo here + */ + MySerializableXact->SeqNo.earliestOutConflictCommit = + FirstNormalSerCommitSeqNo; + MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT; + } + + /* + * Release all outConflicts to committed transactions. If we're rolling + * back clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to + * previously committed transactions. 
+ */ + dlist_foreach_modify(iter, &MySerializableXact->outConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, outLink, iter.cur); + + if (isCommit + && !SxactIsReadOnly(MySerializableXact) + && SxactIsCommitted(conflict->sxactIn)) + { + if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0 + || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit) + MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo; + MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT; + } + + if (!isCommit + || SxactIsCommitted(conflict->sxactIn) + || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo)) + ReleaseRWConflict(conflict); + } + + /* + * Release all inConflicts from committed and read-only transactions. If + * we're rolling back, clear them all. + */ + dlist_foreach_modify(iter, &MySerializableXact->inConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, inLink, iter.cur); + + if (!isCommit + || SxactIsCommitted(conflict->sxactOut) + || SxactIsReadOnly(conflict->sxactOut)) + ReleaseRWConflict(conflict); + } + + if (!topLevelIsDeclaredReadOnly) + { + /* + * Remove ourselves from the list of possible conflicts for concurrent + * READ ONLY transactions, flagging them as unsafe if we have a + * conflict out. If any are waiting DEFERRABLE transactions, wake them + * up if they are known safe or known unsafe. + */ + dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts) + { + RWConflict possibleUnsafeConflict = + dlist_container(RWConflictData, outLink, iter.cur); + + roXact = possibleUnsafeConflict->sxactIn; + Assert(MySerializableXact == possibleUnsafeConflict->sxactOut); + Assert(SxactIsReadOnly(roXact)); + + /* Mark conflicted if necessary. */ + if (isCommit + && MyXactDidWrite + && SxactHasConflictOut(MySerializableXact) + && (MySerializableXact->SeqNo.earliestOutConflictCommit + <= roXact->SeqNo.lastCommitBeforeSnapshot)) + { + /* + * This releases possibleUnsafeConflict (as well as all other + * possible conflicts for roXact) + */ + FlagSxactUnsafe(roXact); + } + else + { + ReleaseRWConflict(possibleUnsafeConflict); + + /* + * If we were the last possible conflict, flag it safe. The + * transaction can now safely release its predicate locks (but + * that transaction's backend has to do that itself). + */ + if (dlist_is_empty(&roXact->possibleUnsafeConflicts)) + roXact->flags |= SXACT_FLAG_RO_SAFE; + } + + /* + * Wake up the process for a waiting DEFERRABLE transaction if we + * now know it's either safe or conflicted. + */ + if (SxactIsDeferrableWaiting(roXact) && + (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact))) + ProcSendSignal(roXact->pgprocno); + } + } + + /* + * Check whether it's time to clean up old transactions. This can only be + * done when the last serializable transaction with the oldest xmin among + * serializable transactions completes. We then find the "new oldest" + * xmin and purge any transactions which finished before this transaction + * was launched. + * + * For parallel queries in read-only transactions, it might run twice. We + * only release the reference on the first call. 
+ */ + needToClear = false; + if ((partiallyReleasing || + !SxactIsPartiallyReleased(MySerializableXact)) && + TransactionIdEquals(MySerializableXact->xmin, + PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + if (--(PredXact->SxactGlobalXminCount) == 0) + { + SetNewSxactGlobalXmin(); + needToClear = true; + } + } + + LWLockRelease(SerializableXactHashLock); + + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + + /* Add this to the list of transactions to check for later cleanup. */ + if (isCommit) + dlist_push_tail(FinishedSerializableTransactions, + &MySerializableXact->finishedLink); + + /* + * If we're releasing a RO_SAFE transaction in parallel mode, we'll only + * partially release it. That's necessary because other backends may have + * a reference to it. The leader will release the SERIALIZABLEXACT itself + * at the end of the transaction after workers have stopped running. + */ + if (!isCommit) + ReleaseOneSerializableXact(MySerializableXact, + isReadOnlySafe && IsInParallelMode(), + false); + + LWLockRelease(SerializableFinishedListLock); + + if (needToClear) + ClearOldPredicateLocks(); + + ReleasePredicateLocksLocal(); +} + +static void +ReleasePredicateLocksLocal(void) +{ + MySerializableXact = InvalidSerializableXact; + MyXactDidWrite = false; + + /* Delete per-transaction lock table */ + if (LocalPredicateLockHash != NULL) + { + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + } +} + +/* + * Clear old predicate locks, belonging to committed transactions that are no + * longer interesting to any in-progress transaction. + */ +static void +ClearOldPredicateLocks(void) +{ + dlist_mutable_iter iter; + + /* + * Loop through finished transactions. They are in commit order, so we can + * stop as soon as we find one that's still interesting. + */ + LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + dlist_foreach_modify(iter, FinishedSerializableTransactions) + { + SERIALIZABLEXACT *finishedSxact = + dlist_container(SERIALIZABLEXACT, finishedLink, iter.cur); + + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin) + || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore, + PredXact->SxactGlobalXmin)) + { + /* + * This transaction committed before any in-progress transaction + * took its snapshot. It's no longer interesting. + */ + LWLockRelease(SerializableXactHashLock); + dlist_delete_thoroughly(&finishedSxact->finishedLink); + ReleaseOneSerializableXact(finishedSxact, false, false); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough + && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough) + { + /* + * Any active transactions that took their snapshot before this + * transaction committed are read-only, so we can clear part of + * its state. + */ + LWLockRelease(SerializableXactHashLock); + + if (SxactIsReadOnly(finishedSxact)) + { + /* A read-only transaction can be removed entirely */ + dlist_delete_thoroughly(&(finishedSxact->finishedLink)); + ReleaseOneSerializableXact(finishedSxact, false, false); + } + else + { + /* + * A read-write transaction can only be partially cleared. We + * need to keep the SERIALIZABLEXACT but can release the + * SIREAD locks and conflicts in. 
+ */ + ReleaseOneSerializableXact(finishedSxact, true, false); + } + + PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo; + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + else + { + /* Still interesting. */ + break; + } + } + LWLockRelease(SerializableXactHashLock); + + /* + * Loop through predicate locks on dummy transaction for summarized data. + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + dlist_foreach_modify(iter, &OldCommittedSxact->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, xactLink, iter.cur); + bool canDoPartialCleanup; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + Assert(predlock->commitSeqNo != 0); + Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo); + canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough); + LWLockRelease(SerializableXactHashLock); + + /* + * If this lock originally belonged to an old enough transaction, we + * can release it. + */ + if (canDoPartialCleanup) + { + PREDICATELOCKTAG tag; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + + tag = predlock->tag; + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + dlist_delete(&(predlock->targetLink)); + dlist_delete(&(predlock->xactLink)); + + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + RemoveTargetIfNoLongerUsed(target, targettaghash); + + LWLockRelease(partitionLock); + } + } + + LWLockRelease(SerializablePredicateListLock); + LWLockRelease(SerializableFinishedListLock); +} + +/* + * This is the normal way to delete anything from any of the predicate + * locking hash tables. Given a transaction which we know can be deleted: + * delete all predicate locks held by that transaction and any predicate + * lock targets which are now unreferenced by a lock; delete all conflicts + * for the transaction; delete all xid values for the transaction; then + * delete the transaction. + * + * When the partial flag is set, we can release all predicate locks and + * in-conflict information -- we've established that there are no longer + * any overlapping read write transactions for which this transaction could + * matter -- but keep the transaction entry itself and any outConflicts. + * + * When the summarize flag is set, we've run short of room for sxact data + * and must summarize to the SLRU. Predicate locks are transferred to a + * dummy "old" transaction, with duplicate locks on a single target + * collapsing to a single lock with the "latest" commitSeqNo from among + * the conflicting locks.. 
+ */ +static void +ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial, + bool summarize) +{ + SERIALIZABLEXIDTAG sxidtag; + dlist_mutable_iter iter; + + Assert(sxact != NULL); + Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact)); + Assert(partial || !SxactIsOnFinishedList(sxact)); + Assert(LWLockHeldByMe(SerializableFinishedListLock)); + + /* + * First release all the predicate locks held by this xact (or transfer + * them to OldCommittedSxact if summarize is true) + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE); + dlist_foreach_modify(iter, &sxact->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, xactLink, iter.cur); + PREDICATELOCKTAG tag; + PREDICATELOCKTARGET *target; + PREDICATELOCKTARGETTAG targettag; + uint32 targettaghash; + LWLock *partitionLock; + + tag = predlock->tag; + target = tag.myTarget; + targettag = target->tag; + targettaghash = PredicateLockTargetTagHashCode(&targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + dlist_delete(&predlock->targetLink); + + hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_REMOVE, NULL); + if (summarize) + { + bool found; + + /* Fold into dummy transaction list. */ + tag.myXact = OldCommittedSxact; + predlock = hash_search_with_hash_value(PredicateLockHash, &tag, + PredicateLockHashCodeFromTargetHashCode(&tag, + targettaghash), + HASH_ENTER_NULL, &found); + if (!predlock) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"), + errhint("You might need to increase %s.", "max_pred_locks_per_transaction"))); + if (found) + { + Assert(predlock->commitSeqNo != 0); + Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo); + if (predlock->commitSeqNo < sxact->commitSeqNo) + predlock->commitSeqNo = sxact->commitSeqNo; + } + else + { + dlist_push_tail(&target->predicateLocks, + &predlock->targetLink); + dlist_push_tail(&OldCommittedSxact->predicateLocks, + &predlock->xactLink); + predlock->commitSeqNo = sxact->commitSeqNo; + } + } + else + RemoveTargetIfNoLongerUsed(target, targettaghash); + + LWLockRelease(partitionLock); + } + + /* + * Rather than retail removal, just re-init the head after we've run + * through the list. + */ + dlist_init(&sxact->predicateLocks); + + if (IsInParallelMode()) + LWLockRelease(&sxact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); + + sxidtag.xid = sxact->topXid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* Release all outConflicts (unless 'partial' is true) */ + if (!partial) + { + dlist_foreach_modify(iter, &sxact->outConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, outLink, iter.cur); + + if (summarize) + conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + ReleaseRWConflict(conflict); + } + } + + /* Release all inConflicts. */ + dlist_foreach_modify(iter, &sxact->inConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, inLink, iter.cur); + + if (summarize) + conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + ReleaseRWConflict(conflict); + } + + /* Finally, get rid of the xid and the record of the transaction itself. 
*/ + if (!partial) + { + if (sxidtag.xid != InvalidTransactionId) + hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL); + ReleasePredXact(sxact); + } + + LWLockRelease(SerializableXactHashLock); +} + +/* + * Tests whether the given top level transaction is concurrent with + * (overlaps) our current transaction. + * + * We need to identify the top level transaction for SSI, anyway, so pass + * that to this function to save the overhead of checking the snapshot's + * subxip array. + */ +static bool +XidIsConcurrent(TransactionId xid) +{ + Snapshot snap; + + Assert(TransactionIdIsValid(xid)); + Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny())); + + snap = GetTransactionSnapshot(); + + if (TransactionIdPrecedes(xid, snap->xmin)) + return false; + + if (TransactionIdFollowsOrEquals(xid, snap->xmax)) + return true; + + return pg_lfind32(xid, snap->xip, snap->xcnt); +} + +bool +CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot) +{ + if (!SerializationNeededForRead(relation, snapshot)) + return false; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."), + errhint("The transaction might succeed if retried."))); + } + + return true; +} + +/* + * CheckForSerializableConflictOut + * A table AM is reading a tuple that has been modified. If it determines + * that the tuple version it is reading is not visible to us, it should + * pass in the top level xid of the transaction that created it. + * Otherwise, if it determines that it is visible to us but it has been + * deleted or there is a newer version available due to an update, it + * should pass in the top level xid of the modifying transaction. + * + * This function will check for overlap with our own transaction. If the given + * xid is also serializable and the transactions overlap (i.e., they cannot see + * each other's writes), then we have a conflict out. + */ +void +CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot) +{ + SERIALIZABLEXIDTAG sxidtag; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + + if (!SerializationNeededForRead(relation, snapshot)) + return; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + { + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."), + errhint("The transaction might succeed if retried."))); + } + Assert(TransactionIdIsValid(xid)); + + if (TransactionIdEquals(xid, GetTopTransactionIdIfAny())) + return; + + /* + * Find sxact or summarized info for the top level xid. + */ + sxidtag.xid = xid; + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + if (!sxid) + { + /* + * Transaction not found in "normal" SSI structures. Check whether it + * got pushed out to SLRU storage for "old committed" transactions. 
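+ * Roughly speaking, the SLRU maps such an xid to the earliest commitSeqNo
+ * of any transaction it had a rw-conflict out to (InvalidSerCommitSeqNo if
+ * it had none); a return value of zero below means the xid is not covered
+ * by the SLRU at all.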
+ */ + SerCommitSeqNo conflictCommitSeqNo; + + conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid); + if (conflictCommitSeqNo != 0) + { + if (conflictCommitSeqNo != InvalidSerCommitSeqNo + && (!SxactIsReadOnly(MySerializableXact) + || conflictCommitSeqNo + <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid), + errhint("The transaction might succeed if retried."))); + + if (SxactHasSummaryConflictIn(MySerializableXact) + || !dlist_is_empty(&MySerializableXact->inConflicts)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid), + errhint("The transaction might succeed if retried."))); + + MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + } + + /* It's not serializable or otherwise not important. */ + LWLockRelease(SerializableXactHashLock); + return; + } + sxact = sxid->myXact; + Assert(TransactionIdEquals(sxact->topXid, xid)); + if (sxact == MySerializableXact || SxactIsDoomed(sxact)) + { + /* Can't conflict with ourself or a transaction that will roll back. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * We have a conflict out to a transaction which has a conflict out to a + * summarized transaction. That summarized transaction must have + * committed first, and we can't tell when it committed in relation to our + * snapshot acquisition, so something needs to be canceled. + */ + if (SxactHasSummaryConflictOut(sxact)) + { + if (!SxactIsPrepared(sxact)) + { + sxact->flags |= SXACT_FLAG_DOOMED; + LWLockRelease(SerializableXactHashLock); + return; + } + else + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to old pivot."), + errhint("The transaction might succeed if retried."))); + } + } + + /* + * If this is a read-only transaction and the writing transaction has + * committed, and it doesn't have a rw-conflict to a transaction which + * committed before it, no conflict. + */ + if (SxactIsReadOnly(MySerializableXact) + && SxactIsCommitted(sxact) + && !SxactHasSummaryConflictOut(sxact) + && (!SxactHasConflictOut(sxact) + || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit)) + { + /* Read-only transaction will appear to run first. No conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + if (!XidIsConcurrent(xid)) + { + /* This write was already in our snapshot; no conflict. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + if (RWConflictExists(MySerializableXact, sxact)) + { + /* We don't want duplicate conflict records in the list. */ + LWLockRelease(SerializableXactHashLock); + return; + } + + /* + * Flag the conflict. But first, if this conflict creates a dangerous + * structure, ereport an error. + */ + FlagRWConflict(MySerializableXact, sxact); + LWLockRelease(SerializableXactHashLock); +} + +/* + * Check a particular target for rw-dependency conflict in. 
A subroutine of + * CheckForSerializableConflictIn(). + */ +static void +CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag) +{ + uint32 targettaghash; + LWLock *partitionLock; + PREDICATELOCKTARGET *target; + PREDICATELOCK *mypredlock = NULL; + PREDICATELOCKTAG mypredlocktag; + dlist_mutable_iter iter; + + Assert(MySerializableXact != InvalidSerializableXact); + + /* + * The same hash and LW lock apply to the lock target and the lock itself. + */ + targettaghash = PredicateLockTargetTagHashCode(targettag); + partitionLock = PredicateLockHashPartitionLock(targettaghash); + LWLockAcquire(partitionLock, LW_SHARED); + target = (PREDICATELOCKTARGET *) + hash_search_with_hash_value(PredicateLockTargetHash, + targettag, targettaghash, + HASH_FIND, NULL); + if (!target) + { + /* Nothing has this target locked; we're done here. */ + LWLockRelease(partitionLock); + return; + } + + /* + * Each lock for an overlapping transaction represents a conflict: a + * rw-dependency in to this transaction. + */ + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + + dlist_foreach_modify(iter, &target->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, targetLink, iter.cur); + SERIALIZABLEXACT *sxact = predlock->tag.myXact; + + if (sxact == MySerializableXact) + { + /* + * If we're getting a write lock on a tuple, we don't need a + * predicate (SIREAD) lock on the same tuple. We can safely remove + * our SIREAD lock, but we'll defer doing so until after the loop + * because that requires upgrading to an exclusive partition lock. + * + * We can't use this optimization within a subtransaction because + * the subtransaction could roll back, and we would be left + * without any lock at the top level. + */ + if (!IsSubTransaction() + && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag)) + { + mypredlock = predlock; + mypredlocktag = predlock->tag; + } + } + else if (!SxactIsDoomed(sxact) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && !RWConflictExists(sxact, MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Re-check after getting exclusive lock because the other + * transaction may have flagged a conflict. + */ + if (!SxactIsDoomed(sxact) + && (!SxactIsCommitted(sxact) + || TransactionIdPrecedes(GetTransactionSnapshot()->xmin, + sxact->finishedBefore)) + && !RWConflictExists(sxact, MySerializableXact)) + { + FlagRWConflict(sxact, MySerializableXact); + } + + LWLockRelease(SerializableXactHashLock); + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + } + } + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + + /* + * If we found one of our own SIREAD locks to remove, remove it now. + * + * At this point our transaction already has a RowExclusiveLock on the + * relation, so we are OK to drop the predicate lock on the tuple, if + * found, without fearing that another write against the tuple will occur + * before the MVCC information makes it to the buffer. 
+ */ + if (mypredlock != NULL) + { + uint32 predlockhashcode; + PREDICATELOCK *rmpredlock; + + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + if (IsInParallelMode()) + LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Remove the predicate lock from shared memory, if it wasn't removed + * while the locks were released. One way that could happen is from + * autovacuum cleaning up an index. + */ + predlockhashcode = PredicateLockHashCodeFromTargetHashCode + (&mypredlocktag, targettaghash); + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &mypredlocktag, + predlockhashcode, + HASH_FIND, NULL); + if (rmpredlock != NULL) + { + Assert(rmpredlock == mypredlock); + + dlist_delete(&(mypredlock->targetLink)); + dlist_delete(&(mypredlock->xactLink)); + + rmpredlock = (PREDICATELOCK *) + hash_search_with_hash_value(PredicateLockHash, + &mypredlocktag, + predlockhashcode, + HASH_REMOVE, NULL); + Assert(rmpredlock == mypredlock); + + RemoveTargetIfNoLongerUsed(target, targettaghash); + } + + LWLockRelease(SerializableXactHashLock); + LWLockRelease(partitionLock); + if (IsInParallelMode()) + LWLockRelease(&MySerializableXact->perXactPredicateListLock); + LWLockRelease(SerializablePredicateListLock); + + if (rmpredlock != NULL) + { + /* + * Remove entry in local lock table if it exists. It's OK if it + * doesn't exist; that means the lock was transferred to a new + * target by a different backend. + */ + hash_search_with_hash_value(LocalPredicateLockHash, + targettag, targettaghash, + HASH_REMOVE, NULL); + + DecrementParentLocks(targettag); + } + } +} + +/* + * CheckForSerializableConflictIn + * We are writing the given tuple. If that indicates a rw-conflict + * in from another serializable transaction, take appropriate action. + * + * Skip checking for any granularity for which a parameter is missing. + * + * A tuple update or delete is in conflict if we have a predicate lock + * against the relation or page in which the tuple exists, or against the + * tuple itself. + */ +void +CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno) +{ + PREDICATELOCKTARGETTAG targettag; + + if (!SerializationNeededForWrite(relation)) + return; + + /* Check if someone else has already decided that we need to die */ + if (SxactIsDoomed(MySerializableXact)) + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."), + errhint("The transaction might succeed if retried."))); + + /* + * We're doing a write which might cause rw-conflicts now or later. + * Memorize that fact. + */ + MyXactDidWrite = true; + + /* + * It is important that we check for locks from the finest granularity to + * the coarsest granularity, so that granularity promotion doesn't cause + * us to miss a lock. The new (coarser) lock will be acquired before the + * old (finer) locks are released. + * + * It is not possible to take and hold a lock across the checks for all + * granularities because each target could be in a separate partition. 
+ */ + if (tid != NULL) + { + SET_PREDICATELOCKTARGETTAG_TUPLE(targettag, + relation->rd_locator.dbOid, + relation->rd_id, + ItemPointerGetBlockNumber(tid), + ItemPointerGetOffsetNumber(tid)); + CheckTargetForConflictsIn(&targettag); + } + + if (blkno != InvalidBlockNumber) + { + SET_PREDICATELOCKTARGETTAG_PAGE(targettag, + relation->rd_locator.dbOid, + relation->rd_id, + blkno); + CheckTargetForConflictsIn(&targettag); + } + + SET_PREDICATELOCKTARGETTAG_RELATION(targettag, + relation->rd_locator.dbOid, + relation->rd_id); + CheckTargetForConflictsIn(&targettag); +} + +/* + * CheckTableForSerializableConflictIn + * The entire table is going through a DDL-style logical mass delete + * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from + * another serializable transaction, take appropriate action. + * + * While these operations do not operate entirely within the bounds of + * snapshot isolation, they can occur inside a serializable transaction, and + * will logically occur after any reads which saw rows which were destroyed + * by these operations, so we do what we can to serialize properly under + * SSI. + * + * The relation passed in must be a heap relation. Any predicate lock of any + * granularity on the heap will cause a rw-conflict in to this transaction. + * Predicate locks on indexes do not matter because they only exist to guard + * against conflicting inserts into the index, and this is a mass *delete*. + * When a table is truncated or dropped, the index will also be truncated + * or dropped, and we'll deal with locks on the index when that happens. + * + * Dropping or truncating a table also needs to drop any existing predicate + * locks on heap tuples or pages, because they're about to go away. This + * should be done before altering the predicate locks because the transaction + * could be rolled back because of a conflict, in which case the lock changes + * are not needed. (At the moment, we don't actually bother to drop the + * existing locks on a dropped or truncated table. That might + * lead to some false positives, but it doesn't seem worth the trouble.) + */ +void +CheckTableForSerializableConflictIn(Relation relation) +{ + HASH_SEQ_STATUS seqstat; + PREDICATELOCKTARGET *target; + Oid dbId; + Oid heapId; + int i; + + /* + * Bail out quickly if there are no serializable transactions running. + * It's safe to check this without taking locks because the caller is + * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which + * would matter here can be acquired while that is held. + */ + if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)) + return; + + if (!SerializationNeededForWrite(relation)) + return; + + /* + * We're doing a write which might cause rw-conflicts now or later. + * Memorize that fact. + */ + MyXactDidWrite = true; + + Assert(relation->rd_index == NULL); /* not an index relation */ + + dbId = relation->rd_locator.dbOid; + heapId = relation->rd_id; + + LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE); + for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++) + LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED); + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* Scan through target list */ + hash_seq_init(&seqstat, PredicateLockTargetHash); + + while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat))) + { + dlist_mutable_iter iter; + + /* + * Check whether this is a target which needs attention.
+ */ + if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId) + continue; /* wrong relation id */ + if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId) + continue; /* wrong database id */ + + /* + * Loop through locks for this target and flag conflicts. + */ + dlist_foreach_modify(iter, &target->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, targetLink, iter.cur); + + if (predlock->tag.myXact != MySerializableXact + && !RWConflictExists(predlock->tag.myXact, MySerializableXact)) + { + FlagRWConflict(predlock->tag.myXact, MySerializableXact); + } + } + } + + /* Release locks in reverse order */ + LWLockRelease(SerializableXactHashLock); + for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--) + LWLockRelease(PredicateLockHashPartitionLockByIndex(i)); + LWLockRelease(SerializablePredicateListLock); +} + + +/* + * Flag a rw-dependency between two serializable transactions. + * + * The caller is responsible for ensuring that we have a LW lock on + * the transaction hash table. + */ +static void +FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer) +{ + Assert(reader != writer); + + /* First, see if this conflict causes failure. */ + OnConflict_CheckForSerializationFailure(reader, writer); + + /* Actually do the conflict flagging. */ + if (reader == OldCommittedSxact) + writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + else if (writer == OldCommittedSxact) + reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + else + SetRWConflict(reader, writer); +} + +/*---------------------------------------------------------------------------- + * We are about to add a RW-edge to the dependency graph - check that we don't + * introduce a dangerous structure by doing so, and abort one of the + * transactions if so. + * + * A serialization failure can only occur if there is a dangerous structure + * in the dependency graph: + * + * Tin ------> Tpivot ------> Tout + * rw rw + * + * Furthermore, Tout must commit first. + * + * One more optimization is that if Tin is declared READ ONLY (or commits + * without writing), we can only have a problem if Tout committed before Tin + * acquired its snapshot. + *---------------------------------------------------------------------------- + */ +static void +OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader, + SERIALIZABLEXACT *writer) +{ + bool failure; + + Assert(LWLockHeldByMe(SerializableXactHashLock)); + + failure = false; + + /*------------------------------------------------------------------------ + * Check for already-committed writer with rw-conflict out flagged + * (conflict-flag on W means that T2 committed before W): + * + * R ------> W ------> T2 + * rw rw + * + * That is a dangerous structure, so we must abort. 
(Since the writer + * has already committed, we must be the reader) + *------------------------------------------------------------------------ + */ + if (SxactIsCommitted(writer) + && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer))) + failure = true; + + /*------------------------------------------------------------------------ + * Check whether the writer has become a pivot with an out-conflict + * committed transaction (T2), and T2 committed first: + * + * R ------> W ------> T2 + * rw rw + * + * Because T2 must've committed first, there is no anomaly if: + * - the reader committed before T2 + * - the writer committed before T2 + * - the reader is a READ ONLY transaction and the reader was concurrent + * with T2 (= reader acquired its snapshot before T2 committed) + * + * We also handle the case that T2 is prepared but not yet committed + * here. In that case T2 has already checked for conflicts, so if it + * commits first, making the above conflict real, it's too late for it + * to abort. + *------------------------------------------------------------------------ + */ + if (!failure && SxactHasSummaryConflictOut(writer)) + failure = true; + else if (!failure) + { + dlist_iter iter; + + dlist_foreach(iter, &writer->outConflicts) + { + RWConflict conflict = + dlist_container(RWConflictData, outLink, iter.cur); + SERIALIZABLEXACT *t2 = conflict->sxactIn; + + if (SxactIsPrepared(t2) + && (!SxactIsCommitted(reader) + || t2->prepareSeqNo <= reader->commitSeqNo) + && (!SxactIsCommitted(writer) + || t2->prepareSeqNo <= writer->commitSeqNo) + && (!SxactIsReadOnly(reader) + || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot)) + { + failure = true; + break; + } + } + } + + /*------------------------------------------------------------------------ + * Check whether the reader has become a pivot with a writer + * that's committed (or prepared): + * + * T0 ------> R ------> W + * rw rw + * + * Because W must've committed first for an anomaly to occur, there is no + * anomaly if: + * - T0 committed before the writer + * - T0 is READ ONLY, and overlaps the writer + *------------------------------------------------------------------------ + */ + if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader)) + { + if (SxactHasSummaryConflictIn(reader)) + { + failure = true; + } + else + { + dlist_iter iter; + + /* + * The unconstify is needed as we have no const version of + * dlist_foreach(). + */ + dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->inConflicts) + { + const RWConflict conflict = + dlist_container(RWConflictData, inLink, iter.cur); + const SERIALIZABLEXACT *t0 = conflict->sxactOut; + + if (!SxactIsDoomed(t0) + && (!SxactIsCommitted(t0) + || t0->commitSeqNo >= writer->prepareSeqNo) + && (!SxactIsReadOnly(t0) + || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo)) + { + failure = true; + break; + } + } + } + } + + if (failure) + { + /* + * We have to kill a transaction to avoid a possible anomaly from + * occurring. If the writer is us, we can just ereport() to cause a + * transaction abort. Otherwise we flag the writer for termination, + * causing it to abort when it tries to commit. However, if the writer + * is a prepared transaction, already prepared, we can't abort it + * anymore, so we have to kill the reader instead. 
+ */ + if (MySerializableXact == writer) + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during write."), + errhint("The transaction might succeed if retried."))); + } + else if (SxactIsPrepared(writer)) + { + LWLockRelease(SerializableXactHashLock); + + /* if we're not the writer, we have to be the reader */ + Assert(MySerializableXact == reader); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid), + errhint("The transaction might succeed if retried."))); + } + writer->flags |= SXACT_FLAG_DOOMED; + } +} + +/* + * PreCommit_CheckForSerializationFailure + * Check for dangerous structures in a serializable transaction + * at commit. + * + * We're checking for a dangerous structure as each conflict is recorded. + * The only way we could have a problem at commit is if this is the "out" + * side of a pivot, and neither the "in" side nor the pivot has yet + * committed. + * + * If a dangerous structure is found, the pivot (the near conflict) is + * marked for death, because rolling back another transaction might mean + * that we fail without ever making progress. This transaction is + * committing writes, so letting it commit ensures progress. If we + * canceled the far conflict, it might immediately fail again on retry. + */ +void +PreCommit_CheckForSerializationFailure(void) +{ + dlist_iter near_iter; + + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(IsolationIsSerializable()); + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + + /* + * Check if someone else has already decided that we need to die. Since + * we set our own DOOMED flag when partially releasing, ignore in that + * case. + */ + if (SxactIsDoomed(MySerializableXact) && + !SxactIsPartiallyReleased(MySerializableXact)) + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."), + errhint("The transaction might succeed if retried."))); + } + + dlist_foreach(near_iter, &MySerializableXact->inConflicts) + { + RWConflict nearConflict = + dlist_container(RWConflictData, inLink, near_iter.cur); + + if (!SxactIsCommitted(nearConflict->sxactOut) + && !SxactIsDoomed(nearConflict->sxactOut)) + { + dlist_iter far_iter; + + dlist_foreach(far_iter, &nearConflict->sxactOut->inConflicts) + { + RWConflict farConflict = + dlist_container(RWConflictData, inLink, far_iter.cur); + + if (farConflict->sxactOut == MySerializableXact + || (!SxactIsCommitted(farConflict->sxactOut) + && !SxactIsReadOnly(farConflict->sxactOut) + && !SxactIsDoomed(farConflict->sxactOut))) + { + /* + * Normally, we kill the pivot transaction to make sure we + * make progress if the failing transaction is retried. + * However, we can't kill it if it's already prepared, so + * in that case we commit suicide instead. 
+ */ + if (SxactIsPrepared(nearConflict->sxactOut)) + { + LWLockRelease(SerializableXactHashLock); + ereport(ERROR, + (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), + errmsg("could not serialize access due to read/write dependencies among transactions"), + errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."), + errhint("The transaction might succeed if retried."))); + } + nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED; + break; + } + } + } + } + + MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo); + MySerializableXact->flags |= SXACT_FLAG_PREPARED; + + LWLockRelease(SerializableXactHashLock); +} + +/*------------------------------------------------------------------------*/ + +/* + * Two-phase commit support + */ + +/* + * AtPrepare_PredicateLocks + * Do the preparatory work for a PREPARE: make 2PC state file + * records for all predicate locks currently held. + */ +void +AtPrepare_PredicateLocks(void) +{ + SERIALIZABLEXACT *sxact; + TwoPhasePredicateRecord record; + TwoPhasePredicateXactRecord *xactRecord; + TwoPhasePredicateLockRecord *lockRecord; + dlist_iter iter; + + sxact = MySerializableXact; + xactRecord = &(record.data.xactRecord); + lockRecord = &(record.data.lockRecord); + + if (MySerializableXact == InvalidSerializableXact) + return; + + /* Generate an xact record for our SERIALIZABLEXACT */ + record.type = TWOPHASEPREDICATERECORD_XACT; + xactRecord->xmin = MySerializableXact->xmin; + xactRecord->flags = MySerializableXact->flags; + + /* + * Note that we don't include our lists of conflicts in and out in the + * statefile, because new conflicts can be added even after the + * transaction prepares. We'll just make a conservative assumption during + * recovery instead. + */ + + RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0, + &record, sizeof(record)); + + /* + * Generate a lock record for each lock. + * + * To do this, we need to walk the predicate lock list in our sxact rather + * than using the local predicate lock table because the latter is not + * guaranteed to be accurate. + */ + LWLockAcquire(SerializablePredicateListLock, LW_SHARED); + + /* + * No need to take sxact->perXactPredicateListLock in parallel mode + * because there cannot be any parallel workers running while we are + * preparing a transaction. + */ + Assert(!IsParallelWorker() && !ParallelContextActive()); + + dlist_foreach(iter, &sxact->predicateLocks) + { + PREDICATELOCK *predlock = + dlist_container(PREDICATELOCK, xactLink, iter.cur); + + record.type = TWOPHASEPREDICATERECORD_LOCK; + lockRecord->target = predlock->tag.myTarget->tag; + + RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0, + &record, sizeof(record)); + } + + LWLockRelease(SerializablePredicateListLock); +} + +/* + * PostPrepare_PredicateLocks + * Clean up after successful PREPARE. Unlike the non-predicate + * lock manager, we do not need to transfer locks to a dummy + * PGPROC because our SERIALIZABLEXACT will stay around + * anyway. We only need to clean up our local state.
+ */ +void +PostPrepare_PredicateLocks(TransactionId xid) +{ + if (MySerializableXact == InvalidSerializableXact) + return; + + Assert(SxactIsPrepared(MySerializableXact)); + + MySerializableXact->pid = 0; + MySerializableXact->pgprocno = INVALID_PGPROCNO; + + hash_destroy(LocalPredicateLockHash); + LocalPredicateLockHash = NULL; + + MySerializableXact = InvalidSerializableXact; + MyXactDidWrite = false; +} + +/* + * PredicateLockTwoPhaseFinish + * Release a prepared transaction's predicate locks once it + * commits or aborts. + */ +void +PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit) +{ + SERIALIZABLEXID *sxid; + SERIALIZABLEXIDTAG sxidtag; + + sxidtag.xid = xid; + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + LWLockRelease(SerializableXactHashLock); + + /* xid will not be found if it wasn't a serializable transaction */ + if (sxid == NULL) + return; + + /* Release its locks */ + MySerializableXact = sxid->myXact; + MyXactDidWrite = true; /* conservatively assume that we wrote + * something */ + ReleasePredicateLocks(isCommit, false); +} + +/* + * Re-acquire a predicate lock belonging to a transaction that was prepared. + */ +void +predicatelock_twophase_recover(TransactionId xid, uint16 info, + void *recdata, uint32 len) +{ + TwoPhasePredicateRecord *record; + + Assert(len == sizeof(TwoPhasePredicateRecord)); + + record = (TwoPhasePredicateRecord *) recdata; + + Assert((record->type == TWOPHASEPREDICATERECORD_XACT) || + (record->type == TWOPHASEPREDICATERECORD_LOCK)); + + if (record->type == TWOPHASEPREDICATERECORD_XACT) + { + /* Per-transaction record. Set up a SERIALIZABLEXACT. */ + TwoPhasePredicateXactRecord *xactRecord; + SERIALIZABLEXACT *sxact; + SERIALIZABLEXID *sxid; + SERIALIZABLEXIDTAG sxidtag; + bool found; + + xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord; + + LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE); + sxact = CreatePredXact(); + if (!sxact) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + + /* vxid for a prepared xact is InvalidBackendId/xid; no pid */ + sxact->vxid.backendId = InvalidBackendId; + sxact->vxid.localTransactionId = (LocalTransactionId) xid; + sxact->pid = 0; + sxact->pgprocno = INVALID_PGPROCNO; + + /* a prepared xact hasn't committed yet */ + sxact->prepareSeqNo = RecoverySerCommitSeqNo; + sxact->commitSeqNo = InvalidSerCommitSeqNo; + sxact->finishedBefore = InvalidTransactionId; + + sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo; + + /* + * Don't need to track this; no transactions running at the time the + * recovered xact started are still active, except possibly other + * prepared xacts and we don't care whether those are RO_SAFE or not. + */ + dlist_init(&(sxact->possibleUnsafeConflicts)); + + dlist_init(&(sxact->predicateLocks)); + dlist_node_init(&sxact->finishedLink); + + sxact->topXid = xid; + sxact->xmin = xactRecord->xmin; + sxact->flags = xactRecord->flags; + Assert(SxactIsPrepared(sxact)); + if (!SxactIsReadOnly(sxact)) + { + ++(PredXact->WritableSxactCount); + Assert(PredXact->WritableSxactCount <= + (MaxBackends + max_prepared_xacts)); + } + + /* + * We don't know whether the transaction had any conflicts or not, so + * we'll conservatively assume that it had both a conflict in and a + * conflict out, and represent that with the summary conflict flags. 
+ */ + dlist_init(&(sxact->outConflicts)); + dlist_init(&(sxact->inConflicts)); + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN; + sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT; + + /* Register the transaction's xid */ + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash, + &sxidtag, + HASH_ENTER, &found); + Assert(sxid != NULL); + Assert(!found); + sxid->myXact = (SERIALIZABLEXACT *) sxact; + + /* + * Update global xmin. Note that this is a special case compared to + * registering a normal transaction, because the global xmin might go + * backwards. That's OK, because until recovery is over we're not + * going to complete any transactions or create any non-prepared + * transactions, so there's no danger of throwing away. + */ + if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) || + (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin))) + { + PredXact->SxactGlobalXmin = sxact->xmin; + PredXact->SxactGlobalXminCount = 1; + SerialSetActiveSerXmin(sxact->xmin); + } + else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin)) + { + Assert(PredXact->SxactGlobalXminCount > 0); + PredXact->SxactGlobalXminCount++; + } + + LWLockRelease(SerializableXactHashLock); + } + else if (record->type == TWOPHASEPREDICATERECORD_LOCK) + { + /* Lock record. Recreate the PREDICATELOCK */ + TwoPhasePredicateLockRecord *lockRecord; + SERIALIZABLEXID *sxid; + SERIALIZABLEXACT *sxact; + SERIALIZABLEXIDTAG sxidtag; + uint32 targettaghash; + + lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord; + targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target); + + LWLockAcquire(SerializableXactHashLock, LW_SHARED); + sxidtag.xid = xid; + sxid = (SERIALIZABLEXID *) + hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL); + LWLockRelease(SerializableXactHashLock); + + Assert(sxid != NULL); + sxact = sxid->myXact; + Assert(sxact != InvalidSerializableXact); + + CreatePredicateLock(&lockRecord->target, targettaghash, sxact); + } +} + +/* + * Prepare to share the current SERIALIZABLEXACT with parallel workers. + * Return a handle object that can be used by AttachSerializableXact() in a + * parallel worker. + */ +SerializableXactHandle +ShareSerializableXact(void) +{ + return MySerializableXact; +} + +/* + * Allow parallel workers to import the leader's SERIALIZABLEXACT. + */ +void +AttachSerializableXact(SerializableXactHandle handle) +{ + + Assert(MySerializableXact == InvalidSerializableXact); + + MySerializableXact = (SERIALIZABLEXACT *) handle; + if (MySerializableXact != InvalidSerializableXact) + CreateLocalPredicateLockHash(); +} diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c new file mode 100644 index 0000000..e9e445b --- /dev/null +++ b/src/backend/storage/lmgr/proc.c @@ -0,0 +1,1897 @@ +/*------------------------------------------------------------------------- + * + * proc.c + * routines to manage per-process shared memory data structure + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/proc.c + * + *------------------------------------------------------------------------- + */ +/* + * Interface (a): + * ProcSleep(), ProcWakeup(), + * + * Waiting for a lock causes the backend to be put to sleep. Whoever releases + * the lock wakes the process up again (and gives it an error code so it knows + * whether it was awoken on an error condition). 
+ * + * Interface (b): + * + * ProcReleaseLocks -- frees the locks associated with current transaction + * + * ProcKill -- destroys the shared memory state (and locks) + * associated with the process. + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/time.h> + +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xlogutils.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "replication/slot.h" +#include "replication/syncrep.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/lmgr.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/spin.h" +#include "storage/standby.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* GUC variables */ +int DeadlockTimeout = 1000; +int StatementTimeout = 0; +int LockTimeout = 0; +int IdleInTransactionSessionTimeout = 0; +int IdleSessionTimeout = 0; +bool log_lock_waits = false; + +/* Pointer to this process's PGPROC struct, if any */ +PGPROC *MyProc = NULL; + +/* + * This spinlock protects the freelist of recycled PGPROC structures. + * We cannot use an LWLock because the LWLock manager depends on already + * having a PGPROC and a wait semaphore! But these structures are touched + * relatively infrequently (only at backend startup or shutdown) and not for + * very long, so a spinlock is okay. + */ +NON_EXEC_STATIC slock_t *ProcStructLock = NULL; + +/* Pointers to shared-memory structures */ +PROC_HDR *ProcGlobal = NULL; +NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL; +PGPROC *PreparedXactProcs = NULL; + +/* If we are waiting for a lock, this points to the associated LOCALLOCK */ +static LOCALLOCK *lockAwaited = NULL; + +static DeadLockState deadlock_state = DS_NOT_YET_CHECKED; + +/* Is a deadlock check pending? */ +static volatile sig_atomic_t got_deadlock_timeout; + +static void RemoveProcFromArray(int code, Datum arg); +static void ProcKill(int code, Datum arg); +static void AuxiliaryProcKill(int code, Datum arg); +static void CheckDeadLock(void); + + +/* + * Report shared-memory space needed by InitProcGlobal. + */ +Size +ProcGlobalShmemSize(void) +{ + Size size = 0; + Size TotalProcs = + add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts)); + + /* ProcGlobal */ + size = add_size(size, sizeof(PROC_HDR)); + size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC))); + size = add_size(size, sizeof(slock_t)); + + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates))); + size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags))); + + return size; +} + +/* + * Report number of semaphores needed by InitProcGlobal. + */ +int +ProcGlobalSemas(void) +{ + /* + * We need a sema per backend (including autovacuum), plus one for each + * auxiliary process. + */ + return MaxBackends + NUM_AUXILIARY_PROCS; +} + +/* + * InitProcGlobal - + * Initialize the global process table during postmaster or standalone + * backend startup. + * + * We also create all the per-process semaphores we will need to support + * the requested number of backends.
We used to allocate semaphores + * only when backends were actually started up, but that is bad because + * it lets Postgres fail under load --- a lot of Unix systems are + * (mis)configured with small limits on the number of semaphores, and + * running out when trying to start another backend is a common failure. + * So, now we grab enough semaphores to support the desired max number + * of backends immediately at initialization --- if the sysadmin has set + * MaxConnections, max_worker_processes, max_wal_senders, or + * autovacuum_max_workers higher than his kernel will support, he'll + * find out sooner rather than later. + * + * Another reason for creating semaphores here is that the semaphore + * implementation typically requires us to create semaphores in the + * postmaster, not in backends. + * + * Note: this is NOT called by individual backends under a postmaster, + * not even in the EXEC_BACKEND case. The ProcGlobal and AuxiliaryProcs + * pointers must be propagated specially for EXEC_BACKEND operation. + */ +void +InitProcGlobal(void) +{ + PGPROC *procs; + int i, + j; + bool found; + uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts; + + /* Create the ProcGlobal shared structure */ + ProcGlobal = (PROC_HDR *) + ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found); + Assert(!found); + + /* + * Initialize the data structures. + */ + ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY; + dlist_init(&ProcGlobal->freeProcs); + dlist_init(&ProcGlobal->autovacFreeProcs); + dlist_init(&ProcGlobal->bgworkerFreeProcs); + dlist_init(&ProcGlobal->walsenderFreeProcs); + ProcGlobal->startupBufferPinWaitBufId = -1; + ProcGlobal->walwriterLatch = NULL; + ProcGlobal->checkpointerLatch = NULL; + pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO); + pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO); + + /* + * Create and initialize all the PGPROC structures we'll need. There are + * five separate consumers: (1) normal backends, (2) autovacuum workers + * and the autovacuum launcher, (3) background workers, (4) auxiliary + * processes, and (5) prepared transactions. Each PGPROC structure is + * dedicated to exactly one of these purposes, and they do not move + * between groups. + */ + procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC)); + MemSet(procs, 0, TotalProcs * sizeof(PGPROC)); + ProcGlobal->allProcs = procs; + /* XXX allProcCount isn't really all of them; it excludes prepared xacts */ + ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS; + + /* + * Allocate arrays mirroring PGPROC fields in a dense manner. See + * PROC_HDR. + * + * XXX: It might make sense to increase padding for these arrays, given + * how hotly they are accessed. + */ + ProcGlobal->xids = + (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids)); + MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids)); + ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates)); + MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates)); + ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags)); + MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags)); + + for (i = 0; i < TotalProcs; i++) + { + PGPROC *proc = &procs[i]; + + /* Common initialization for all PGPROCs, regardless of type. */ + + /* + * Set up per-PGPROC semaphore, latch, and fpInfoLock. 
Prepared xact + * dummy PGPROCs don't need these though - they're never associated + * with a real process + */ + if (i < MaxBackends + NUM_AUXILIARY_PROCS) + { + proc->sem = PGSemaphoreCreate(); + InitSharedLatch(&(proc->procLatch)); + LWLockInitialize(&(proc->fpInfoLock), LWTRANCHE_LOCK_FASTPATH); + } + proc->pgprocno = i; + + /* + * Newly created PGPROCs for normal backends, autovacuum and bgworkers + * must be queued up on the appropriate free list. Because there can + * only ever be a small, fixed number of auxiliary processes, no free + * list is used in that case; InitAuxiliaryProcess() instead uses a + * linear search. PGPROCs for prepared transactions are added to a + * free list by TwoPhaseShmemInit(). + */ + if (i < MaxConnections) + { + /* PGPROC for normal backend, add to freeProcs list */ + dlist_push_head(&ProcGlobal->freeProcs, &proc->links); + proc->procgloballist = &ProcGlobal->freeProcs; + } + else if (i < MaxConnections + autovacuum_max_workers + 1) + { + /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */ + dlist_push_head(&ProcGlobal->autovacFreeProcs, &proc->links); + proc->procgloballist = &ProcGlobal->autovacFreeProcs; + } + else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes) + { + /* PGPROC for bgworker, add to bgworkerFreeProcs list */ + dlist_push_head(&ProcGlobal->bgworkerFreeProcs, &proc->links); + proc->procgloballist = &ProcGlobal->bgworkerFreeProcs; + } + else if (i < MaxBackends) + { + /* PGPROC for walsender, add to walsenderFreeProcs list */ + dlist_push_head(&ProcGlobal->walsenderFreeProcs, &proc->links); + proc->procgloballist = &ProcGlobal->walsenderFreeProcs; + } + + /* Initialize myProcLocks[] shared memory queues. */ + for (j = 0; j < NUM_LOCK_PARTITIONS; j++) + dlist_init(&(proc->myProcLocks[j])); + + /* Initialize lockGroupMembers list. */ + dlist_init(&proc->lockGroupMembers); + + /* + * Initialize the atomic variables, otherwise, it won't be safe to + * access them for backends that aren't currently in use. + */ + pg_atomic_init_u32(&(proc->procArrayGroupNext), INVALID_PGPROCNO); + pg_atomic_init_u32(&(proc->clogGroupNext), INVALID_PGPROCNO); + pg_atomic_init_u64(&(proc->waitStart), 0); + } + + /* + * Save pointers to the blocks of PGPROC structures reserved for auxiliary + * processes and prepared transactions. + */ + AuxiliaryProcs = &procs[MaxBackends]; + PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS]; + + /* Create ProcStructLock spinlock, too */ + ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t)); + SpinLockInit(ProcStructLock); +} + +/* + * InitProcess -- initialize a per-process data structure for this backend + */ +void +InitProcess(void) +{ + dlist_head *procgloballist; + + /* + * ProcGlobal should be set up already (if we are a backend, we inherit + * this by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + if (ProcGlobal == NULL) + elog(PANIC, "proc header uninitialized"); + + if (MyProc != NULL) + elog(ERROR, "you already exist"); + + /* Decide which list should supply our PGPROC. */ + if (IsAnyAutoVacuumProcess()) + procgloballist = &ProcGlobal->autovacFreeProcs; + else if (IsBackgroundWorker) + procgloballist = &ProcGlobal->bgworkerFreeProcs; + else if (am_walsender) + procgloballist = &ProcGlobal->walsenderFreeProcs; + else + procgloballist = &ProcGlobal->freeProcs; + + /* + * Try to get a proc struct from the appropriate free list. If this + * fails, we must be out of PGPROC structures (not to mention semaphores). 
+ * + * While we are holding the ProcStructLock, also copy the current shared + * estimate of spins_per_delay to local storage. + */ + SpinLockAcquire(ProcStructLock); + + set_spins_per_delay(ProcGlobal->spins_per_delay); + + if (!dlist_is_empty(procgloballist)) + { + MyProc = (PGPROC *) dlist_pop_head_node(procgloballist); + SpinLockRelease(ProcStructLock); + } + else + { + /* + * If we reach here, all the PGPROCs are in use. This is one of the + * possible places to detect "too many backends", so give the standard + * error message. XXX do we need to give a different failure message + * in the autovacuum case? + */ + SpinLockRelease(ProcStructLock); + if (am_walsender) + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)", + max_wal_senders))); + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + + /* + * Cross-check that the PGPROC is of the type we expect; if this were not + * the case, it would get returned to the wrong list. + */ + Assert(MyProc->procgloballist == procgloballist); + + /* + * Now that we have a PGPROC, mark ourselves as an active postmaster + * child; this is so that the postmaster can detect it if we exit without + * cleaning up. (XXX autovac launcher currently doesn't participate in + * this; it probably should.) + */ + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + MarkPostmasterChildActive(); + + /* + * Initialize all fields of MyProc, except for those previously + * initialized by InitProcGlobal. + */ + dlist_node_init(&MyProc->links); + MyProc->waitStatus = PROC_WAIT_STATUS_OK; + MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + MyProc->xid = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; + MyProc->pid = MyProcPid; + /* backendId, databaseId and roleId will be filled in later */ + MyProc->backendId = InvalidBackendId; + MyProc->databaseId = InvalidOid; + MyProc->roleId = InvalidOid; + MyProc->tempNamespaceId = InvalidOid; + MyProc->isBackgroundWorker = IsBackgroundWorker; + MyProc->delayChkptFlags = 0; + MyProc->statusFlags = 0; + /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */ + if (IsAutoVacuumWorkerProcess()) + MyProc->statusFlags |= PROC_IS_AUTOVACUUM; + MyProc->lwWaiting = LW_WS_NOT_WAITING; + MyProc->lwWaitMode = 0; + MyProc->waitLock = NULL; + MyProc->waitProcLock = NULL; + pg_atomic_write_u64(&MyProc->waitStart, 0); +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); + } +#endif + MyProc->recoveryConflictPending = false; + + /* Initialize fields for sync rep */ + MyProc->waitLSN = 0; + MyProc->syncRepState = SYNC_REP_NOT_WAITING; + dlist_node_init(&MyProc->syncRepLinks); + + /* Initialize fields for group XID clearing. */ + MyProc->procArrayGroupMember = false; + MyProc->procArrayGroupMemberXid = InvalidTransactionId; + Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO); + + /* Check that group locking fields are in a proper initial state. */ + Assert(MyProc->lockGroupLeader == NULL); + Assert(dlist_is_empty(&MyProc->lockGroupMembers)); + + /* Initialize wait event information. */ + MyProc->wait_event_info = 0; + + /* Initialize fields for group transaction status update. 
*/ + MyProc->clogGroupMember = false; + MyProc->clogGroupMemberXid = InvalidTransactionId; + MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS; + MyProc->clogGroupMemberPage = -1; + MyProc->clogGroupMemberLsn = InvalidXLogRecPtr; + Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO); + + /* + * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch + * on it. That allows us to repoint the process latch, which so far + * points to process local one, to the shared one. + */ + OwnLatch(&MyProc->procLatch); + SwitchToSharedLatch(); + + /* now that we have a proc, report wait events to shared memory */ + pgstat_set_wait_event_storage(&MyProc->wait_event_info); + + /* + * We might be reusing a semaphore that belonged to a failed process. So + * be careful and reinitialize its value here. (This is not strictly + * necessary anymore, but seems like a good idea for cleanliness.) + */ + PGSemaphoreReset(MyProc->sem); + + /* + * Arrange to clean up at backend exit. + */ + on_shmem_exit(ProcKill, 0); + + /* + * Now that we have a PGPROC, we could try to acquire locks, so initialize + * local state needed for LWLocks, and the deadlock checker. + */ + InitLWLockAccess(); + InitDeadLockChecking(); +} + +/* + * InitProcessPhase2 -- make MyProc visible in the shared ProcArray. + * + * This is separate from InitProcess because we can't acquire LWLocks until + * we've created a PGPROC, but in the EXEC_BACKEND case ProcArrayAdd won't + * work until after we've done CreateSharedMemoryAndSemaphores. + */ +void +InitProcessPhase2(void) +{ + Assert(MyProc != NULL); + + /* + * Add our PGPROC to the PGPROC array in shared memory. + */ + ProcArrayAdd(MyProc); + + /* + * Arrange to clean that up at backend exit. + */ + on_shmem_exit(RemoveProcFromArray, 0); +} + +/* + * InitAuxiliaryProcess -- create a per-auxiliary-process data structure + * + * This is called by bgwriter and similar processes so that they will have a + * MyProc value that's real enough to let them wait for LWLocks. The PGPROC + * and sema that are assigned are one of the extra ones created during + * InitProcGlobal. + * + * Auxiliary processes are presently not expected to wait for real (lockmgr) + * locks, so we need not set up the deadlock checker. They are never added + * to the ProcArray or the sinval messaging mechanism, either. They also + * don't get a VXID assigned, since this is only useful when we actually + * hold lockmgr locks. + * + * Startup process however uses locks but never waits for them in the + * normal backend sense. Startup process also takes part in sinval messaging + * as a sendOnly process, so never reads messages from sinval queue. So + * Startup process does have a VXID and does show up in pg_locks. + */ +void +InitAuxiliaryProcess(void) +{ + PGPROC *auxproc; + int proctype; + + /* + * ProcGlobal should be set up already (if we are a backend, we inherit + * this by fork() or EXEC_BACKEND mechanism from the postmaster). + */ + if (ProcGlobal == NULL || AuxiliaryProcs == NULL) + elog(PANIC, "proc header uninitialized"); + + if (MyProc != NULL) + elog(ERROR, "you already exist"); + + /* + * We use the ProcStructLock to protect assignment and releasing of + * AuxiliaryProcs entries. + * + * While we are holding the ProcStructLock, also copy the current shared + * estimate of spins_per_delay to local storage. + */ + SpinLockAcquire(ProcStructLock); + + set_spins_per_delay(ProcGlobal->spins_per_delay); + + /* + * Find a free auxproc ... *big* trouble if there isn't one ... 
+ */ + for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++) + { + auxproc = &AuxiliaryProcs[proctype]; + if (auxproc->pid == 0) + break; + } + if (proctype >= NUM_AUXILIARY_PROCS) + { + SpinLockRelease(ProcStructLock); + elog(FATAL, "all AuxiliaryProcs are in use"); + } + + /* Mark auxiliary proc as in use by me */ + /* use volatile pointer to prevent code rearrangement */ + ((volatile PGPROC *) auxproc)->pid = MyProcPid; + + MyProc = auxproc; + + SpinLockRelease(ProcStructLock); + + /* + * Initialize all fields of MyProc, except for those previously + * initialized by InitProcGlobal. + */ + dlist_node_init(&MyProc->links); + MyProc->waitStatus = PROC_WAIT_STATUS_OK; + MyProc->lxid = InvalidLocalTransactionId; + MyProc->fpVXIDLock = false; + MyProc->fpLocalTransactionId = InvalidLocalTransactionId; + MyProc->xid = InvalidTransactionId; + MyProc->xmin = InvalidTransactionId; + MyProc->backendId = InvalidBackendId; + MyProc->databaseId = InvalidOid; + MyProc->roleId = InvalidOid; + MyProc->tempNamespaceId = InvalidOid; + MyProc->isBackgroundWorker = IsBackgroundWorker; + MyProc->delayChkptFlags = 0; + MyProc->statusFlags = 0; + MyProc->lwWaiting = LW_WS_NOT_WAITING; + MyProc->lwWaitMode = 0; + MyProc->waitLock = NULL; + MyProc->waitProcLock = NULL; + pg_atomic_write_u64(&MyProc->waitStart, 0); +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); + } +#endif + + /* + * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch + * on it. That allows us to repoint the process latch, which so far + * points to process local one, to the shared one. + */ + OwnLatch(&MyProc->procLatch); + SwitchToSharedLatch(); + + /* now that we have a proc, report wait events to shared memory */ + pgstat_set_wait_event_storage(&MyProc->wait_event_info); + + /* Check that group locking fields are in a proper initial state. */ + Assert(MyProc->lockGroupLeader == NULL); + Assert(dlist_is_empty(&MyProc->lockGroupMembers)); + + /* + * We might be reusing a semaphore that belonged to a failed process. So + * be careful and reinitialize its value here. (This is not strictly + * necessary anymore, but seems like a good idea for cleanliness.) + */ + PGSemaphoreReset(MyProc->sem); + + /* + * Arrange to clean up at process exit. + */ + on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype)); +} + +/* + * Used from bufmgr to share the value of the buffer that Startup waits on, + * or to reset the value to "not waiting" (-1). This allows processing + * of recovery conflicts for buffer pins. Set is made before backends look + * at this value, so locking not required, especially since the set is + * an atomic integer set operation. + */ +void +SetStartupBufferPinWaitBufId(int bufid) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + + procglobal->startupBufferPinWaitBufId = bufid; +} + +/* + * Used by backends when they receive a request to check for buffer pin waits. + */ +int +GetStartupBufferPinWaitBufId(void) +{ + /* use volatile pointer to prevent code rearrangement */ + volatile PROC_HDR *procglobal = ProcGlobal; + + return procglobal->startupBufferPinWaitBufId; +} + +/* + * Check whether there are at least N free PGPROC objects. If false is + * returned, *nfree will be set to the number of free PGPROC objects. + * Otherwise, *nfree will be set to n. 
+ * + * Note: this is designed on the assumption that N will generally be small. + */ +bool +HaveNFreeProcs(int n, int *nfree) +{ + dlist_iter iter; + + Assert(n > 0); + Assert(nfree); + + SpinLockAcquire(ProcStructLock); + + *nfree = 0; + dlist_foreach(iter, &ProcGlobal->freeProcs) + { + (*nfree)++; + if (*nfree == n) + break; + } + + SpinLockRelease(ProcStructLock); + + return (*nfree == n); +} + +/* + * Check if the current process is awaiting a lock. + */ +bool +IsWaitingForLock(void) +{ + if (lockAwaited == NULL) + return false; + + return true; +} + +/* + * Cancel any pending wait for lock, when aborting a transaction, and revert + * any strong lock count acquisition for a lock being acquired. + * + * (Normally, this would only happen if we accept a cancel/die + * interrupt while waiting; but an ereport(ERROR) before or during the lock + * wait is within the realm of possibility, too.) + */ +void +LockErrorCleanup(void) +{ + LWLock *partitionLock; + DisableTimeoutParams timeouts[2]; + + HOLD_INTERRUPTS(); + + AbortStrongLockAcquire(); + + /* Nothing to do if we weren't waiting for a lock */ + if (lockAwaited == NULL) + { + RESUME_INTERRUPTS(); + return; + } + + /* + * Turn off the deadlock and lock timeout timers, if they are still + * running (see ProcSleep). Note we must preserve the LOCK_TIMEOUT + * indicator flag, since this function is executed before + * ProcessInterrupts when responding to SIGINT; else we'd lose the + * knowledge that the SIGINT came from a lock timeout and not an external + * source. + */ + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].keep_indicator = false; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].keep_indicator = true; + disable_timeouts(timeouts, 2); + + /* Unlink myself from the wait queue, if on it (might not be anymore!) */ + partitionLock = LockHashPartitionLock(lockAwaited->hashcode); + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + if (!dlist_node_is_detached(&MyProc->links)) + { + /* We could not have been granted the lock yet */ + RemoveFromWaitQueue(MyProc, lockAwaited->hashcode); + } + else + { + /* + * Somebody kicked us off the lock queue already. Perhaps they + * granted us the lock, or perhaps they detected a deadlock. If they + * did grant us the lock, we'd better remember it in our local lock + * table. + */ + if (MyProc->waitStatus == PROC_WAIT_STATUS_OK) + GrantAwaitedLock(); + } + + lockAwaited = NULL; + + LWLockRelease(partitionLock); + + RESUME_INTERRUPTS(); +} + + +/* + * ProcReleaseLocks() -- release locks associated with current transaction + * at main transaction commit or abort + * + * At main transaction commit, we release standard locks except session locks. + * At main transaction abort, we release all locks including session locks. + * + * Advisory locks are released only if they are transaction-level; + * session-level holds remain, whether this is a commit or not. + * + * At subtransaction commit, we don't release any locks (so this func is not + * needed at all); we will defer the releasing to the parent transaction. + * At subtransaction abort, we release all locks held by the subtransaction; + * this is implemented by retail releasing of the locks under control of + * the ResourceOwner mechanism. 
+ */ +void +ProcReleaseLocks(bool isCommit) +{ + if (!MyProc) + return; + /* If waiting, get off wait queue (should only be needed after error) */ + LockErrorCleanup(); + /* Release standard locks, including session-level if aborting */ + LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit); + /* Release transaction-level advisory locks */ + LockReleaseAll(USER_LOCKMETHOD, false); +} + + +/* + * RemoveProcFromArray() -- Remove this process from the shared ProcArray. + */ +static void +RemoveProcFromArray(int code, Datum arg) +{ + Assert(MyProc != NULL); + ProcArrayRemove(MyProc, InvalidTransactionId); +} + +/* + * ProcKill() -- Destroy the per-proc data structure for + * this process. Release any of its held LW locks. + */ +static void +ProcKill(int code, Datum arg) +{ + PGPROC *proc; + dlist_head *procgloballist; + + Assert(MyProc != NULL); + + /* not safe if forked by system(), etc. */ + if (MyProc->pid != (int) getpid()) + elog(PANIC, "ProcKill() called in child process"); + + /* Make sure we're out of the sync rep lists */ + SyncRepCleanupAtProcExit(); + +#ifdef USE_ASSERT_CHECKING + { + int i; + + /* Last process should have released all locks. */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + Assert(dlist_is_empty(&(MyProc->myProcLocks[i]))); + } +#endif + + /* + * Release any LW locks I am holding. There really shouldn't be any, but + * it's cheap to check again before we cut the knees off the LWLock + * facility by releasing our PGPROC ... + */ + LWLockReleaseAll(); + + /* Cancel any pending condition variable sleep, too */ + ConditionVariableCancelSleep(); + + /* + * Detach from any lock group of which we are a member. If the leader + * exits before all other group members, its PGPROC will remain allocated + * until the last group process exits; that process must return the + * leader's PGPROC to the appropriate list. + */ + if (MyProc->lockGroupLeader != NULL) + { + PGPROC *leader = MyProc->lockGroupLeader; + LWLock *leader_lwlock = LockHashPartitionLockByProc(leader); + + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + Assert(!dlist_is_empty(&leader->lockGroupMembers)); + dlist_delete(&MyProc->lockGroupLink); + if (dlist_is_empty(&leader->lockGroupMembers)) + { + leader->lockGroupLeader = NULL; + if (leader != MyProc) + { + procgloballist = leader->procgloballist; + + /* Leader exited first; return its PGPROC. */ + SpinLockAcquire(ProcStructLock); + dlist_push_head(procgloballist, &leader->links); + SpinLockRelease(ProcStructLock); + } + } + else if (leader != MyProc) + MyProc->lockGroupLeader = NULL; + LWLockRelease(leader_lwlock); + } + + /* + * Reset MyLatch to the process local one. This is so that signal + * handlers et al can continue using the latch after the shared latch + * isn't ours anymore. + * + * Similarly, stop reporting wait events to MyProc->wait_event_info. + * + * After that clear MyProc and disown the shared latch. + */ + SwitchBackToLocalLatch(); + pgstat_reset_wait_event_storage(); + + proc = MyProc; + MyProc = NULL; + DisownLatch(&proc->procLatch); + + procgloballist = proc->procgloballist; + SpinLockAcquire(ProcStructLock); + + /* + * If we're still a member of a locking group, that means we're a leader + * which has somehow exited before its children. The last remaining child + * will release our PGPROC. Otherwise, release it now. + */ + if (proc->lockGroupLeader == NULL) + { + /* Since lockGroupLeader is NULL, lockGroupMembers should be empty. 
*/ + Assert(dlist_is_empty(&proc->lockGroupMembers)); + + /* Return PGPROC structure (and semaphore) to appropriate freelist */ + dlist_push_tail(procgloballist, &proc->links); + } + + /* Update shared estimate of spins_per_delay */ + ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); + + SpinLockRelease(ProcStructLock); + + /* + * This process is no longer present in shared memory in any meaningful + * way, so tell the postmaster we've cleaned up acceptably well. (XXX + * autovac launcher should be included here someday) + */ + if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess()) + MarkPostmasterChildInactive(); + + /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */ + if (AutovacuumLauncherPid != 0) + kill(AutovacuumLauncherPid, SIGUSR2); +} + +/* + * AuxiliaryProcKill() -- Cut-down version of ProcKill for auxiliary + * processes (bgwriter, etc). The PGPROC and sema are not released, only + * marked as not-in-use. + */ +static void +AuxiliaryProcKill(int code, Datum arg) +{ + int proctype = DatumGetInt32(arg); + PGPROC *auxproc PG_USED_FOR_ASSERTS_ONLY; + PGPROC *proc; + + Assert(proctype >= 0 && proctype < NUM_AUXILIARY_PROCS); + + /* not safe if forked by system(), etc. */ + if (MyProc->pid != (int) getpid()) + elog(PANIC, "AuxiliaryProcKill() called in child process"); + + auxproc = &AuxiliaryProcs[proctype]; + + Assert(MyProc == auxproc); + + /* Release any LW locks I am holding (see notes above) */ + LWLockReleaseAll(); + + /* Cancel any pending condition variable sleep, too */ + ConditionVariableCancelSleep(); + + /* look at the equivalent ProcKill() code for comments */ + SwitchBackToLocalLatch(); + pgstat_reset_wait_event_storage(); + + proc = MyProc; + MyProc = NULL; + DisownLatch(&proc->procLatch); + + SpinLockAcquire(ProcStructLock); + + /* Mark auxiliary proc no longer in use */ + proc->pid = 0; + + /* Update shared estimate of spins_per_delay */ + ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay); + + SpinLockRelease(ProcStructLock); +} + +/* + * AuxiliaryPidGetProc -- get PGPROC for an auxiliary process + * given its PID + * + * Returns NULL if not found. + */ +PGPROC * +AuxiliaryPidGetProc(int pid) +{ + PGPROC *result = NULL; + int index; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + for (index = 0; index < NUM_AUXILIARY_PROCS; index++) + { + PGPROC *proc = &AuxiliaryProcs[index]; + + if (proc->pid == pid) + { + result = proc; + break; + } + } + return result; +} + + +/* + * ProcSleep -- put a process to sleep on the specified lock + * + * Caller must have set MyProc->heldLocks to reflect locks already held + * on the lockable object by this process (under all XIDs). + * + * The lock table's partition lock must be held at entry, and will be held + * at exit. + * + * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock). + * + * ASSUME: that no one will fiddle with the queue until after + * we release the partition lock. + * + * NOTES: The process queue is now a priority queue for locking. 
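The contract described in this header is that the caller inspects the returned ProcWaitStatus and turns PROC_WAIT_STATUS_ERROR into a deadlock error. A compressed sketch of that calling pattern; the wrapper name is invented, and the real caller in lock.c also does locallock bookkeeping that is omitted here:

    /* Hypothetical wrapper: sleep on the lock, report a deadlock on failure. */
    static void
    wait_for_lock(LOCALLOCK *locallock, LockMethod lockMethodTable)
    {
        if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK)
            DeadLockReport();   /* ereport(ERROR) describing the deadlock */
    }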
+ */ +ProcWaitStatus +ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) +{ + LOCKMODE lockmode = locallock->tag.mode; + LOCK *lock = locallock->lock; + PROCLOCK *proclock = locallock->proclock; + uint32 hashcode = locallock->hashcode; + LWLock *partitionLock = LockHashPartitionLock(hashcode); + dclist_head *waitQueue = &lock->waitProcs; + PGPROC *insert_before = NULL; + LOCKMASK myHeldLocks = MyProc->heldLocks; + TimestampTz standbyWaitStart = 0; + bool early_deadlock = false; + bool allow_autovacuum_cancel = true; + bool logged_recovery_conflict = false; + ProcWaitStatus myWaitStatus; + PGPROC *leader = MyProc->lockGroupLeader; + + /* + * If group locking is in use, locks held by members of my locking group + * need to be included in myHeldLocks. This is not required for relation + * extension lock which conflict among group members. However, including + * them in myHeldLocks will give group members the priority to get those + * locks as compared to other backends which are also trying to acquire + * those locks. OTOH, we can avoid giving priority to group members for + * that kind of locks, but there doesn't appear to be a clear advantage of + * the same. + */ + if (leader != NULL) + { + dlist_iter iter; + + dlist_foreach(iter, &lock->procLocks) + { + PROCLOCK *otherproclock; + + otherproclock = dlist_container(PROCLOCK, lockLink, iter.cur); + + if (otherproclock->groupLeader == leader) + myHeldLocks |= otherproclock->holdMask; + } + } + + /* + * Determine where to add myself in the wait queue. + * + * Normally I should go at the end of the queue. However, if I already + * hold locks that conflict with the request of any previous waiter, put + * myself in the queue just in front of the first such waiter. This is not + * a necessary step, since deadlock detection would move me to before that + * waiter anyway; but it's relatively cheap to detect such a conflict + * immediately, and avoid delaying till deadlock timeout. + * + * Special case: if I find I should go in front of some waiter, check to + * see if I conflict with already-held locks or the requests before that + * waiter. If not, then just grant myself the requested lock immediately. + * This is the same as the test for immediate grant in LockAcquire, except + * we are only considering the part of the wait queue before my insertion + * point. + */ + if (myHeldLocks != 0 && !dclist_is_empty(waitQueue)) + { + LOCKMASK aheadRequests = 0; + dlist_iter iter; + + dclist_foreach(iter, waitQueue) + { + PGPROC *proc = dlist_container(PGPROC, links, iter.cur); + + /* + * If we're part of the same locking group as this waiter, its + * locks neither conflict with ours nor contribute to + * aheadRequests. + */ + if (leader != NULL && leader == proc->lockGroupLeader) + continue; + + /* Must he wait for me? */ + if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks) + { + /* Must I wait for him ? */ + if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks) + { + /* + * Yes, so we have a deadlock. Easiest way to clean up + * correctly is to call RemoveFromWaitQueue(), but we + * can't do that until we are *on* the wait queue. So, set + * a flag to check below, and break out of loop. Also, + * record deadlock info for later message. + */ + RememberSimpleDeadLock(MyProc, lockmode, lock, proc); + early_deadlock = true; + break; + } + /* I must go before this waiter. Check special case. 
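The queue-jumping decision above is pure bit arithmetic: every lock mode is one bit, conflictTab[mode] is the set of modes it conflicts with, and aheadRequests accumulates the modes requested by earlier waiters. A tiny self-contained illustration of that arithmetic with made-up mode numbers rather than the real lock-method tables:

    #include <stdio.h>

    #define LOCKBIT_ON(m)   (1 << (m))

    int
    main(void)
    {
        /* toy table: mode 1 ("shared") conflicts with 2; mode 2 with 1 and 2 */
        int         conflictTab[3] = {0, LOCKBIT_ON(2), LOCKBIT_ON(1) | LOCKBIT_ON(2)};
        int         myHeldLocks = LOCKBIT_ON(1);   /* I already hold "shared" */
        int         waiterMode = 2;                /* earlier waiter wants "exclusive" */

        if (conflictTab[waiterMode] & myHeldLocks)
            printf("waiter must wait for me: queue myself ahead of it\n");
        return 0;
    }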
*/ + if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 && + !LockCheckConflicts(lockMethodTable, lockmode, lock, + proclock)) + { + /* Skip the wait and just grant myself the lock. */ + GrantLock(lock, proclock, lockmode); + GrantAwaitedLock(); + return PROC_WAIT_STATUS_OK; + } + + /* Put myself into wait queue before conflicting process */ + insert_before = proc; + break; + } + /* Nope, so advance to next waiter */ + aheadRequests |= LOCKBIT_ON(proc->waitLockMode); + } + } + + /* + * Insert self into queue, at the position determined above. + */ + if (insert_before) + dclist_insert_before(waitQueue, &insert_before->links, &MyProc->links); + else + dclist_push_tail(waitQueue, &MyProc->links); + + lock->waitMask |= LOCKBIT_ON(lockmode); + + /* Set up wait information in PGPROC object, too */ + MyProc->waitLock = lock; + MyProc->waitProcLock = proclock; + MyProc->waitLockMode = lockmode; + + MyProc->waitStatus = PROC_WAIT_STATUS_WAITING; + + /* + * If we detected deadlock, give up without waiting. This must agree with + * CheckDeadLock's recovery code. + */ + if (early_deadlock) + { + RemoveFromWaitQueue(MyProc, hashcode); + return PROC_WAIT_STATUS_ERROR; + } + + /* mark that we are waiting for a lock */ + lockAwaited = locallock; + + /* + * Release the lock table's partition lock. + * + * NOTE: this may also cause us to exit critical-section state, possibly + * allowing a cancel/die interrupt to be accepted. This is OK because we + * have recorded the fact that we are waiting for a lock, and so + * LockErrorCleanup will clean up if cancel/die happens. + */ + LWLockRelease(partitionLock); + + /* + * Also, now that we will successfully clean up after an ereport, it's + * safe to check to see if there's a buffer pin deadlock against the + * Startup process. Of course, that's only necessary if we're doing Hot + * Standby and are not the Startup process ourselves. + */ + if (RecoveryInProgress() && !InRecovery) + CheckRecoveryConflictDeadlock(); + + /* Reset deadlock_state before enabling the timeout handler */ + deadlock_state = DS_NOT_YET_CHECKED; + got_deadlock_timeout = false; + + /* + * Set timer so we can wake up after awhile and check for a deadlock. If a + * deadlock is detected, the handler sets MyProc->waitStatus = + * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure + * rather than success. + * + * By delaying the check until we've waited for a bit, we can avoid + * running the rather expensive deadlock-check code in most cases. + * + * If LockTimeout is set, also enable the timeout for that. We can save a + * few cycles by enabling both timeout sources in one call. + * + * If InHotStandby we set lock waits slightly later for clarity with other + * code. + */ + if (!InHotStandby) + { + if (LockTimeout > 0) + { + EnableTimeoutParams timeouts[2]; + + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].type = TMPARAM_AFTER; + timeouts[0].delay_ms = DeadlockTimeout; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].type = TMPARAM_AFTER; + timeouts[1].delay_ms = LockTimeout; + enable_timeouts(timeouts, 2); + } + else + enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout); + + /* + * Use the current time obtained for the deadlock timeout timer as + * waitStart (i.e., the time when this process started waiting for the + * lock). Since getting the current time newly can cause overhead, we + * reuse the already-obtained time to avoid that overhead. 
+ * + * Note that waitStart is updated without holding the lock table's + * partition lock, to avoid the overhead by additional lock + * acquisition. This can cause "waitstart" in pg_locks to become NULL + * for a very short period of time after the wait started even though + * "granted" is false. This is OK in practice because we can assume + * that users are likely to look at "waitstart" when waiting for the + * lock for a long time. + */ + pg_atomic_write_u64(&MyProc->waitStart, + get_timeout_start_time(DEADLOCK_TIMEOUT)); + } + else if (log_recovery_conflict_waits) + { + /* + * Set the wait start timestamp if logging is enabled and in hot + * standby. + */ + standbyWaitStart = GetCurrentTimestamp(); + } + + /* + * If somebody wakes us between LWLockRelease and WaitLatch, the latch + * will not wait. But a set latch does not necessarily mean that the lock + * is free now, as there are many other sources for latch sets than + * somebody releasing the lock. + * + * We process interrupts whenever the latch has been set, so cancel/die + * interrupts are processed quickly. This means we must not mind losing + * control to a cancel/die interrupt here. We don't, because we have no + * shared-state-change work to do after being granted the lock (the + * grantor did it all). We do have to worry about canceling the deadlock + * timeout and updating the locallock table, but if we lose control to an + * error, LockErrorCleanup will fix that up. + */ + do + { + if (InHotStandby) + { + bool maybe_log_conflict = + (standbyWaitStart != 0 && !logged_recovery_conflict); + + /* Set a timer and wait for that or for the lock to be granted */ + ResolveRecoveryConflictWithLock(locallock->tag.lock, + maybe_log_conflict); + + /* + * Emit the log message if the startup process is waiting longer + * than deadlock_timeout for recovery conflict on lock. + */ + if (maybe_log_conflict) + { + TimestampTz now = GetCurrentTimestamp(); + + if (TimestampDifferenceExceeds(standbyWaitStart, now, + DeadlockTimeout)) + { + VirtualTransactionId *vxids; + int cnt; + + vxids = GetLockConflicts(&locallock->tag.lock, + AccessExclusiveLock, &cnt); + + /* + * Log the recovery conflict and the list of PIDs of + * backends holding the conflicting lock. Note that we do + * logging even if there are no such backends right now + * because the startup process here has already waited + * longer than deadlock_timeout. + */ + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + standbyWaitStart, now, + cnt > 0 ? vxids : NULL, true); + logged_recovery_conflict = true; + } + } + } + else + { + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + PG_WAIT_LOCK | locallock->tag.lock.locktag_type); + ResetLatch(MyLatch); + /* check for deadlocks first, as that's probably log-worthy */ + if (got_deadlock_timeout) + { + CheckDeadLock(); + got_deadlock_timeout = false; + } + CHECK_FOR_INTERRUPTS(); + } + + /* + * waitStatus could change from PROC_WAIT_STATUS_WAITING to something + * else asynchronously. Read it just once per loop to prevent + * surprising behavior (such as missing log messages). + */ + myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus); + + /* + * If we are not deadlocked, but are waiting on an autovacuum-induced + * task, send a signal to interrupt it. 
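The wait loop here is the backend's standard latch idiom: a set latch only means "check your condition again", so the wait always sits inside a loop that re-tests the real condition and processes interrupts. A minimal sketch of that idiom in isolation, where work_is_done() is an invented stand-in for whatever condition is being waited on:

    /* Hypothetical latch wait loop. */
    for (;;)
    {
        if (work_is_done())
            break;

        (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
                         PG_WAIT_EXTENSION);
        ResetLatch(MyLatch);
        CHECK_FOR_INTERRUPTS();
    }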
+ */ + if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel) + { + PGPROC *autovac = GetBlockingAutoVacuumPgproc(); + uint8 statusFlags; + uint8 lockmethod_copy; + LOCKTAG locktag_copy; + + /* + * Grab info we need, then release lock immediately. Note this + * coding means that there is a tiny chance that the process + * terminates its current transaction and starts a different one + * before we have a change to send the signal; the worst possible + * consequence is that a for-wraparound vacuum is cancelled. But + * that could happen in any case unless we were to do kill() with + * the lock held, which is much more undesirable. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff]; + lockmethod_copy = lock->tag.locktag_lockmethodid; + locktag_copy = lock->tag; + LWLockRelease(ProcArrayLock); + + /* + * Only do it if the worker is not working to protect against Xid + * wraparound. + */ + if ((statusFlags & PROC_IS_AUTOVACUUM) && + !(statusFlags & PROC_VACUUM_FOR_WRAPAROUND)) + { + int pid = autovac->pid; + + /* report the case, if configured to do so */ + if (message_level_is_interesting(DEBUG1)) + { + StringInfoData locktagbuf; + StringInfoData logbuf; /* errdetail for server log */ + + initStringInfo(&locktagbuf); + initStringInfo(&logbuf); + DescribeLockTag(&locktagbuf, &locktag_copy); + appendStringInfo(&logbuf, + "Process %d waits for %s on %s.", + MyProcPid, + GetLockmodeName(lockmethod_copy, lockmode), + locktagbuf.data); + + ereport(DEBUG1, + (errmsg_internal("sending cancel to blocking autovacuum PID %d", + pid), + errdetail_log("%s", logbuf.data))); + + pfree(locktagbuf.data); + pfree(logbuf.data); + } + + /* send the autovacuum worker Back to Old Kent Road */ + if (kill(pid, SIGINT) < 0) + { + /* + * There's a race condition here: once we release the + * ProcArrayLock, it's possible for the autovac worker to + * close up shop and exit before we can do the kill(). + * Therefore, we do not whinge about no-such-process. + * Other errors such as EPERM could conceivably happen if + * the kernel recycles the PID fast enough, but such cases + * seem improbable enough that it's probably best to issue + * a warning if we see some other errno. + */ + if (errno != ESRCH) + ereport(WARNING, + (errmsg("could not send signal to process %d: %m", + pid))); + } + } + + /* prevent signal from being sent again more than once */ + allow_autovacuum_cancel = false; + } + + /* + * If awoken after the deadlock check interrupt has run, and + * log_lock_waits is on, then report about the wait. + */ + if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED) + { + StringInfoData buf, + lock_waiters_sbuf, + lock_holders_sbuf; + const char *modename; + long secs; + int usecs; + long msecs; + dlist_iter proc_iter; + PROCLOCK *curproclock; + bool first_holder = true, + first_waiter = true; + int lockHoldersNum = 0; + + initStringInfo(&buf); + initStringInfo(&lock_waiters_sbuf); + initStringInfo(&lock_holders_sbuf); + + DescribeLockTag(&buf, &locallock->tag.lock); + modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid, + lockmode); + TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT), + GetCurrentTimestamp(), + &secs, &usecs); + msecs = secs * 1000 + usecs / 1000; + usecs = usecs % 1000; + + /* + * we loop over the lock's procLocks to gather a list of all + * holders and waiters. Thus we will be able to provide more + * detailed information for lock debugging purposes. 
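The holder and waiter lists assembled next are built with the backend's StringInfo facility. A small sketch of the same comma-separation pattern in isolation; the pids array and npids count are invented:

    StringInfoData buf;
    bool        first = true;

    initStringInfo(&buf);
    for (int i = 0; i < npids; i++)
    {
        if (first)
            appendStringInfo(&buf, "%d", pids[i]);
        else
            appendStringInfo(&buf, ", %d", pids[i]);
        first = false;
    }
    /* ... emit buf.data in a log message ... */
    pfree(buf.data);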
+ * + * lock->procLocks contains all processes which hold or wait for + * this lock. + */ + + LWLockAcquire(partitionLock, LW_SHARED); + + dlist_foreach(proc_iter, &lock->procLocks) + { + curproclock = + dlist_container(PROCLOCK, lockLink, proc_iter.cur); + + /* + * we are a waiter if myProc->waitProcLock == curproclock; we + * are a holder if it is NULL or something different + */ + if (curproclock->tag.myProc->waitProcLock == curproclock) + { + if (first_waiter) + { + appendStringInfo(&lock_waiters_sbuf, "%d", + curproclock->tag.myProc->pid); + first_waiter = false; + } + else + appendStringInfo(&lock_waiters_sbuf, ", %d", + curproclock->tag.myProc->pid); + } + else + { + if (first_holder) + { + appendStringInfo(&lock_holders_sbuf, "%d", + curproclock->tag.myProc->pid); + first_holder = false; + } + else + appendStringInfo(&lock_holders_sbuf, ", %d", + curproclock->tag.myProc->pid); + + lockHoldersNum++; + } + } + + LWLockRelease(partitionLock); + + if (deadlock_state == DS_SOFT_DEADLOCK) + ereport(LOG, + (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + else if (deadlock_state == DS_HARD_DEADLOCK) + { + /* + * This message is a bit redundant with the error that will be + * reported subsequently, but in some cases the error report + * might not make it to the log (eg, if it's caught by an + * exception handler), and we want to ensure all long-wait + * events get logged. + */ + ereport(LOG, + (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + } + + if (myWaitStatus == PROC_WAIT_STATUS_WAITING) + ereport(LOG, + (errmsg("process %d still waiting for %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + else if (myWaitStatus == PROC_WAIT_STATUS_OK) + ereport(LOG, + (errmsg("process %d acquired %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs))); + else + { + Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR); + + /* + * Currently, the deadlock checker always kicks its own + * process, which means that we'll only see + * PROC_WAIT_STATUS_ERROR when deadlock_state == + * DS_HARD_DEADLOCK, and there's no need to print redundant + * messages. But for completeness and future-proofing, print + * a message if it looks like someone else kicked us off the + * lock. + */ + if (deadlock_state != DS_HARD_DEADLOCK) + ereport(LOG, + (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms", + MyProcPid, modename, buf.data, msecs, usecs), + (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.", + "Processes holding the lock: %s. Wait queue: %s.", + lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data)))); + } + + /* + * At this point we might still need to wait for the lock. Reset + * state so we don't print the above messages again. 
+ */ + deadlock_state = DS_NO_DEADLOCK; + + pfree(buf.data); + pfree(lock_holders_sbuf.data); + pfree(lock_waiters_sbuf.data); + } + } while (myWaitStatus == PROC_WAIT_STATUS_WAITING); + + /* + * Disable the timers, if they are still running. As in LockErrorCleanup, + * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has + * already caused QueryCancelPending to become set, we want the cancel to + * be reported as a lock timeout, not a user cancel. + */ + if (!InHotStandby) + { + if (LockTimeout > 0) + { + DisableTimeoutParams timeouts[2]; + + timeouts[0].id = DEADLOCK_TIMEOUT; + timeouts[0].keep_indicator = false; + timeouts[1].id = LOCK_TIMEOUT; + timeouts[1].keep_indicator = true; + disable_timeouts(timeouts, 2); + } + else + disable_timeout(DEADLOCK_TIMEOUT, false); + } + + /* + * Emit the log message if recovery conflict on lock was resolved but the + * startup process waited longer than deadlock_timeout for it. + */ + if (InHotStandby && logged_recovery_conflict) + LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK, + standbyWaitStart, GetCurrentTimestamp(), + NULL, false); + + /* + * Re-acquire the lock table's partition lock. We have to do this to hold + * off cancel/die interrupts before we can mess with lockAwaited (else we + * might have a missed or duplicated locallock update). + */ + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + /* + * We no longer want LockErrorCleanup to do anything. + */ + lockAwaited = NULL; + + /* + * If we got the lock, be sure to remember it in the locallock table. + */ + if (MyProc->waitStatus == PROC_WAIT_STATUS_OK) + GrantAwaitedLock(); + + /* + * We don't have to do anything else, because the awaker did all the + * necessary update of the lock table and MyProc. + */ + return MyProc->waitStatus; +} + + +/* + * ProcWakeup -- wake up a process by setting its latch. + * + * Also remove the process from the wait queue and set its links invalid. + * + * The appropriate lock partition lock must be held by caller. + * + * XXX: presently, this code is only used for the "success" case, and only + * works correctly for that case. To clean up in failure case, would need + * to twiddle the lock's request counts too --- see RemoveFromWaitQueue. + * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK. + */ +void +ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus) +{ + if (dlist_node_is_detached(&proc->links)) + return; + + Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING); + + /* Remove process from wait queue */ + dclist_delete_from_thoroughly(&proc->waitLock->waitProcs, &proc->links); + + /* Clean up process' state and pass it the ok/fail signal */ + proc->waitLock = NULL; + proc->waitProcLock = NULL; + proc->waitStatus = waitStatus; + pg_atomic_write_u64(&MyProc->waitStart, 0); + + /* And awaken it */ + SetLatch(&proc->procLatch); +} + +/* + * ProcLockWakeup -- routine for waking up processes when a lock is + * released (or a prior waiter is aborted). Scan all waiters + * for lock, waken any that are no longer blocked. + * + * The appropriate lock partition lock must be held by caller. 
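The reason LOCK_TIMEOUT's indicator is preserved while DEADLOCK_TIMEOUT's is not, both here and in LockErrorCleanup(), is that the interrupt-processing code later asks which timeout caused the pending cancel. A simplified sketch of that downstream test; the real logic lives in the main interrupt handler and checks more conditions than shown:

    /* Simplified: report a pending cancel as a lock timeout if that's what it was. */
    if (get_timeout_indicator(LOCK_TIMEOUT, true))
        ereport(ERROR,
                (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
                 errmsg("canceling statement due to lock timeout")));
    else
        ereport(ERROR,
                (errcode(ERRCODE_QUERY_CANCELED),
                 errmsg("canceling statement due to user request")));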
+ */ +void +ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock) +{ + dclist_head *waitQueue = &lock->waitProcs; + LOCKMASK aheadRequests = 0; + dlist_mutable_iter miter; + + if (dclist_is_empty(waitQueue)) + return; + + dclist_foreach_modify(miter, waitQueue) + { + PGPROC *proc = dlist_container(PGPROC, links, miter.cur); + LOCKMODE lockmode = proc->waitLockMode; + + /* + * Waken if (a) doesn't conflict with requests of earlier waiters, and + * (b) doesn't conflict with already-held locks. + */ + if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 && + !LockCheckConflicts(lockMethodTable, lockmode, lock, + proc->waitProcLock)) + { + /* OK to waken */ + GrantLock(lock, proc->waitProcLock, lockmode); + /* removes proc from the lock's waiting process queue */ + ProcWakeup(proc, PROC_WAIT_STATUS_OK); + } + else + { + /* + * Lock conflicts: Don't wake, but remember requested mode for + * later checks. + */ + aheadRequests |= LOCKBIT_ON(lockmode); + } + } +} + +/* + * CheckDeadLock + * + * We only get to this routine, if DEADLOCK_TIMEOUT fired while waiting for a + * lock to be released by some other process. Check if there's a deadlock; if + * not, just return. (But signal ProcSleep to log a message, if + * log_lock_waits is true.) If we have a real deadlock, remove ourselves from + * the lock's wait queue and signal an error to ProcSleep. + */ +static void +CheckDeadLock(void) +{ + int i; + + /* + * Acquire exclusive lock on the entire shared lock data structures. Must + * grab LWLocks in partition-number order to avoid LWLock deadlock. + * + * Note that the deadlock check interrupt had better not be enabled + * anywhere that this process itself holds lock partition locks, else this + * will wait forever. Also note that LWLockAcquire creates a critical + * section, so that this routine cannot be interrupted by cancel/die + * interrupts. + */ + for (i = 0; i < NUM_LOCK_PARTITIONS; i++) + LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + + /* + * Check to see if we've been awoken by anyone in the interim. + * + * If we have, we can return and resume our transaction -- happy day. + * Before we are awoken the process releasing the lock grants it to us so + * we know that we don't have to wait anymore. + * + * We check by looking to see if we've been unlinked from the wait queue. + * This is safe because we hold the lock partition lock. + */ + if (MyProc->links.prev == NULL || + MyProc->links.next == NULL) + goto check_done; + +#ifdef LOCK_DEBUG + if (Debug_deadlocks) + DumpAllLocks(); +#endif + + /* Run the deadlock check, and set deadlock_state for use by ProcSleep */ + deadlock_state = DeadLockCheck(MyProc); + + if (deadlock_state == DS_HARD_DEADLOCK) + { + /* + * Oops. We have a deadlock. + * + * Get this process out of wait state. (Note: we could do this more + * efficiently by relying on lockAwaited, but use this coding to + * preserve the flexibility to kill some other transaction than the + * one detecting the deadlock.) + * + * RemoveFromWaitQueue sets MyProc->waitStatus to + * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we + * return from the signal handler. + */ + Assert(MyProc->waitLock != NULL); + RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag))); + + /* + * We're done here. Transaction abort caused by the error that + * ProcSleep will raise will cause any other locks we hold to be + * released, thus allowing other processes to wake up; we don't need + * to do that here. 
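ProcLockWakeup() walks the wait queue with dclist_foreach_modify() precisely because ProcWakeup() deletes the element being visited. A minimal sketch of that delete-while-iterating idiom on an invented list; MyNode, its done flag, and its dlist_node field named links are hypothetical:

    dlist_mutable_iter miter;

    dclist_foreach_modify(miter, &mylist)
    {
        MyNode     *node = dlist_container(MyNode, links, miter.cur);

        if (node->done)
            dclist_delete_from(&mylist, &node->links);
    }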
NOTE: an exception is that releasing locks we + * hold doesn't consider the possibility of waiters that were blocked + * behind us on the lock we just failed to get, and might now be + * wakable because we're not in front of them anymore. However, + * RemoveFromWaitQueue took care of waking up any such processes. + */ + } + + /* + * And release locks. We do this in reverse order for two reasons: (1) + * Anyone else who needs more than one of the locks will be trying to lock + * them in increasing order; we don't want to release the other process + * until it can get all the locks it needs. (2) This avoids O(N^2) + * behavior inside LWLockRelease. + */ +check_done: + for (i = NUM_LOCK_PARTITIONS; --i >= 0;) + LWLockRelease(LockHashPartitionLockByIndex(i)); +} + +/* + * CheckDeadLockAlert - Handle the expiry of deadlock_timeout. + * + * NB: Runs inside a signal handler, be careful. + */ +void +CheckDeadLockAlert(void) +{ + int save_errno = errno; + + got_deadlock_timeout = true; + + /* + * Have to set the latch again, even if handle_sig_alarm already did. Back + * then got_deadlock_timeout wasn't yet set... It's unlikely that this + * ever would be a problem, but setting a set latch again is cheap. + * + * Note that, when this function runs inside procsignal_sigusr1_handler(), + * the handler function sets the latch again after the latch is set here. + */ + SetLatch(MyLatch); + errno = save_errno; +} + +/* + * ProcWaitForSignal - wait for a signal from another backend. + * + * As this uses the generic process latch the caller has to be robust against + * unrelated wakeups: Always check that the desired state has occurred, and + * wait again if not. + */ +void +ProcWaitForSignal(uint32 wait_event_info) +{ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + wait_event_info); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); +} + +/* + * ProcSendSignal - set the latch of a backend identified by pgprocno + */ +void +ProcSendSignal(int pgprocno) +{ + if (pgprocno < 0 || pgprocno >= ProcGlobal->allProcCount) + elog(ERROR, "pgprocno out of range"); + + SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch); +} + +/* + * BecomeLockGroupLeader - designate process as lock group leader + * + * Once this function has returned, other processes can join the lock group + * by calling BecomeLockGroupMember. + */ +void +BecomeLockGroupLeader(void) +{ + LWLock *leader_lwlock; + + /* If we already did it, we don't need to do it again. */ + if (MyProc->lockGroupLeader == MyProc) + return; + + /* We had better not be a follower. */ + Assert(MyProc->lockGroupLeader == NULL); + + /* Create single-member group, containing only ourselves. */ + leader_lwlock = LockHashPartitionLockByProc(MyProc); + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + MyProc->lockGroupLeader = MyProc; + dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink); + LWLockRelease(leader_lwlock); +} + +/* + * BecomeLockGroupMember - designate process as lock group member + * + * This is pretty straightforward except for the possibility that the leader + * whose group we're trying to join might exit before we manage to do so; + * and the PGPROC might get recycled for an unrelated process. To avoid + * that, we require the caller to pass the PID of the intended PGPROC as + * an interlock. Returns true if we successfully join the intended lock + * group, and false if not. 
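These two functions are used in leader/worker pairs: the leader calls BecomeLockGroupLeader() before launching workers and advertises its PGPROC pointer and PID through shared memory; each worker then attempts to join. A compressed sketch of that handshake, where the shared->leader and shared->leader_pid fields are invented for illustration:

    /* Leader, before launching workers: */
    BecomeLockGroupLeader();
    shared->leader = MyProc;
    shared->leader_pid = MyProcPid;

    /* Worker, during startup: */
    if (!BecomeLockGroupMember(shared->leader, shared->leader_pid))
        ereport(ERROR,
                (errmsg("could not join lock group: leader has already exited")));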
+ */ +bool +BecomeLockGroupMember(PGPROC *leader, int pid) +{ + LWLock *leader_lwlock; + bool ok = false; + + /* Group leader can't become member of group */ + Assert(MyProc != leader); + + /* Can't already be a member of a group */ + Assert(MyProc->lockGroupLeader == NULL); + + /* PID must be valid. */ + Assert(pid != 0); + + /* + * Get lock protecting the group fields. Note LockHashPartitionLockByProc + * accesses leader->pgprocno in a PGPROC that might be free. This is safe + * because all PGPROCs' pgprocno fields are set during shared memory + * initialization and never change thereafter; so we will acquire the + * correct lock even if the leader PGPROC is in process of being recycled. + */ + leader_lwlock = LockHashPartitionLockByProc(leader); + LWLockAcquire(leader_lwlock, LW_EXCLUSIVE); + + /* Is this the leader we're looking for? */ + if (leader->pid == pid && leader->lockGroupLeader == leader) + { + /* OK, join the group */ + ok = true; + MyProc->lockGroupLeader = leader; + dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink); + } + LWLockRelease(leader_lwlock); + + return ok; +} diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c new file mode 100644 index 0000000..327ac64 --- /dev/null +++ b/src/backend/storage/lmgr/s_lock.c @@ -0,0 +1,324 @@ +/*------------------------------------------------------------------------- + * + * s_lock.c + * Hardware-dependent implementation of spinlocks. + * + * When waiting for a contended spinlock we loop tightly for awhile, then + * delay using pg_usleep() and try again. Preferably, "awhile" should be a + * small multiple of the maximum time we expect a spinlock to be held. 100 + * iterations seems about right as an initial guess. However, on a + * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario + * it's usually better to spin a bit longer than to call the kernel, so we try + * to adapt the spin loop count depending on whether we seem to be in a + * uniprocessor or multiprocessor. + * + * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd + * be wrong; there are platforms where that can result in a "stuck + * spinlock" failure. This has been seen particularly on Alphas; it seems + * that the first TAS after returning from kernel space will always fail + * on that hardware. + * + * Once we do decide to block, we use randomly increasing pg_usleep() + * delays. The first delay is 1 msec, then the delay randomly increases to + * about one second, after which we reset to 1 msec and start again. The + * idea here is that in the presence of heavy contention we need to + * increase the delay, else the spinlock holder may never get to run and + * release the lock. (Consider situation where spinlock holder has been + * nice'd down in priority by the scheduler --- it will not get scheduled + * until all would-be acquirers are sleeping, so if we always use a 1-msec + * sleep, there is a real possibility of starvation.) But we can't just + * clamp the delay to an upper bound, else it would take a long time to + * make a reasonable number of tries. + * + * We time out and declare error after NUM_DELAYS delays (thus, exactly + * that many tries). With the given settings, this will usually take 2 or + * so minutes. It seems better to fix the total number of tries (and thus + * the probability of unintended failure) than to fix the total time + * spent. 
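The delay policy described above is easy to see in isolation: start at 1 msec, grow by a random factor between 1X and 2X on each sleep, and wrap back to 1 msec once the cap is exceeded. A self-contained toy that prints one possible delay sequence; it uses plain rand() rather than the backend's PRNG:

    #include <stdio.h>
    #include <stdlib.h>

    #define MIN_DELAY_USEC  1000L
    #define MAX_DELAY_USEC  1000000L

    int
    main(void)
    {
        long        cur_delay = MIN_DELAY_USEC;

        for (int i = 0; i < 20; i++)
        {
            printf("sleep %ld usec\n", cur_delay);
            /* increase delay by a random fraction between 1X and 2X */
            cur_delay += (long) (cur_delay * ((double) rand() / RAND_MAX) + 0.5);
            if (cur_delay > MAX_DELAY_USEC)
                cur_delay = MIN_DELAY_USEC;
        }
        return 0;
    }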
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/s_lock.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "common/pg_prng.h" +#include "port/atomics.h" +#include "storage/s_lock.h" +#include "utils/wait_event.h" + +#define MIN_SPINS_PER_DELAY 10 +#define MAX_SPINS_PER_DELAY 1000 +#define NUM_DELAYS 1000 +#define MIN_DELAY_USEC 1000L +#define MAX_DELAY_USEC 1000000L + + +slock_t dummy_spinlock; + +static int spins_per_delay = DEFAULT_SPINS_PER_DELAY; + + +/* + * s_lock_stuck() - complain about a stuck spinlock + */ +static void +s_lock_stuck(const char *file, int line, const char *func) +{ + if (!func) + func = "(unknown)"; +#if defined(S_LOCK_TEST) + fprintf(stderr, + "\nStuck spinlock detected at %s, %s:%d.\n", + func, file, line); + exit(1); +#else + elog(PANIC, "stuck spinlock detected at %s, %s:%d", + func, file, line); +#endif +} + +/* + * s_lock(lock) - platform-independent portion of waiting for a spinlock. + */ +int +s_lock(volatile slock_t *lock, const char *file, int line, const char *func) +{ + SpinDelayStatus delayStatus; + + init_spin_delay(&delayStatus, file, line, func); + + while (TAS_SPIN(lock)) + { + perform_spin_delay(&delayStatus); + } + + finish_spin_delay(&delayStatus); + + return delayStatus.delays; +} + +#ifdef USE_DEFAULT_S_UNLOCK +void +s_unlock(volatile slock_t *lock) +{ +#ifdef TAS_ACTIVE_WORD + /* HP's PA-RISC */ + *TAS_ACTIVE_WORD(lock) = -1; +#else + *lock = 0; +#endif +} +#endif + +/* + * Wait while spinning on a contended spinlock. + */ +void +perform_spin_delay(SpinDelayStatus *status) +{ + /* CPU-specific delay each time through the loop */ + SPIN_DELAY(); + + /* Block the process every spins_per_delay tries */ + if (++(status->spins) >= spins_per_delay) + { + if (++(status->delays) > NUM_DELAYS) + s_lock_stuck(status->file, status->line, status->func); + + if (status->cur_delay == 0) /* first time to delay? */ + status->cur_delay = MIN_DELAY_USEC; + + /* + * Once we start sleeping, the overhead of reporting a wait event is + * justified. Actively spinning easily stands out in profilers, but + * sleeping with an exponential backoff is harder to spot... + * + * We might want to report something more granular at some point, but + * this is better than nothing. + */ + pgstat_report_wait_start(WAIT_EVENT_SPIN_DELAY); + pg_usleep(status->cur_delay); + pgstat_report_wait_end(); + +#if defined(S_LOCK_TEST) + fprintf(stdout, "*"); + fflush(stdout); +#endif + + /* increase delay by a random fraction between 1X and 2X */ + status->cur_delay += (int) (status->cur_delay * + pg_prng_double(&pg_global_prng_state) + 0.5); + /* wrap back to minimum delay when max is exceeded */ + if (status->cur_delay > MAX_DELAY_USEC) + status->cur_delay = MIN_DELAY_USEC; + + status->spins = 0; + } +} + +/* + * After acquiring a spinlock, update estimates about how long to loop. + * + * If we were able to acquire the lock without delaying, it's a good + * indication we are in a multiprocessor. If we had to delay, it's a sign + * (but not a sure thing) that we are in a uniprocessor. Hence, we + * decrement spins_per_delay slowly when we had to delay, and increase it + * rapidly when we didn't. It's expected that spins_per_delay will + * converge to the minimum value on a uniprocessor and to the maximum + * value on a multiprocessor. 
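All of this machinery sits behind the SpinLockAcquire()/SpinLockRelease() macros from spin.h, which is what ordinary backend code uses. A sketch of the expected usage pattern with an invented shared structure; the coding rule is that the critical section stays short and never calls anything that can error out or sleep:

    /* Hypothetical shared structure protected by a spinlock;
     * "shared" is assumed to point at one such struct in shared memory. */
    typedef struct MyShared
    {
        slock_t     mutex;
        int         counter;
    } MyShared;

    /* keep the critical section short and straight-line */
    SpinLockAcquire(&shared->mutex);
    shared->counter++;
    SpinLockRelease(&shared->mutex);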
+ * + * Note: spins_per_delay is local within our current process. We want to + * average these observations across multiple backends, since it's + * relatively rare for this function to even get entered, and so a single + * backend might not live long enough to converge on a good value. That + * is handled by the two routines below. + */ +void +finish_spin_delay(SpinDelayStatus *status) +{ + if (status->cur_delay == 0) + { + /* we never had to delay */ + if (spins_per_delay < MAX_SPINS_PER_DELAY) + spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY); + } + else + { + if (spins_per_delay > MIN_SPINS_PER_DELAY) + spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY); + } +} + +/* + * Set local copy of spins_per_delay during backend startup. + * + * NB: this has to be pretty fast as it is called while holding a spinlock + */ +void +set_spins_per_delay(int shared_spins_per_delay) +{ + spins_per_delay = shared_spins_per_delay; +} + +/* + * Update shared estimate of spins_per_delay during backend exit. + * + * NB: this has to be pretty fast as it is called while holding a spinlock + */ +int +update_spins_per_delay(int shared_spins_per_delay) +{ + /* + * We use an exponential moving average with a relatively slow adaption + * rate, so that noise in any one backend's result won't affect the shared + * value too much. As long as both inputs are within the allowed range, + * the result must be too, so we need not worry about clamping the result. + * + * We deliberately truncate rather than rounding; this is so that single + * adjustments inside a backend can affect the shared estimate (see the + * asymmetric adjustment rules above). + */ + return (shared_spins_per_delay * 15 + spins_per_delay) / 16; +} + + +/*****************************************************************************/ +#if defined(S_LOCK_TEST) + +/* + * test program for verifying a port's spinlock support. 
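The 15/16 moving average in update_spins_per_delay() means any one backend can move the shared estimate only slightly per exit. A toy calculation, outside the backend, showing how repeated reports of the same local value converge on it:

    #include <stdio.h>

    int
    main(void)
    {
        int         shared = 100;       /* current shared estimate */
        int         local = 1000;       /* one backend's converged value */

        for (int i = 1; i <= 10; i++)
        {
            shared = (shared * 15 + local) / 16;    /* same formula as above */
            printf("after report %d: %d\n", i, shared);
        }
        return 0;
    }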
+ */ + +struct test_lock_struct +{ + char pad1; + slock_t lock; + char pad2; +}; + +volatile struct test_lock_struct test_lock; + +int +main() +{ + pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL)); + + test_lock.pad1 = test_lock.pad2 = 0x44; + + S_INIT_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (!S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not initialized\n"); + return 1; + } + + S_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not locked\n"); + return 1; + } + + S_UNLOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (!S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not unlocked\n"); + return 1; + } + + S_LOCK(&test_lock.lock); + + if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44) + { + printf("S_LOCK_TEST: failed, declared datatype is wrong size\n"); + return 1; + } + + if (S_LOCK_FREE(&test_lock.lock)) + { + printf("S_LOCK_TEST: failed, lock not re-locked\n"); + return 1; + } + + printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS); + printf(" exit with a 'stuck spinlock' message\n"); + printf(" if S_LOCK() and TAS() are working.\n"); + fflush(stdout); + + s_lock(&test_lock.lock, __FILE__, __LINE__, __func__); + + printf("S_LOCK_TEST: failed, lock not locked\n"); + return 1; +} + +#endif /* S_LOCK_TEST */ diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c new file mode 100644 index 0000000..6052779 --- /dev/null +++ b/src/backend/storage/lmgr/spin.c @@ -0,0 +1,180 @@ +/*------------------------------------------------------------------------- + * + * spin.c + * Hardware-independent implementation of spinlocks. + * + * + * For machines that have test-and-set (TAS) instructions, s_lock.h/.c + * define the spinlock implementation. This file contains only a stub + * implementation for spinlocks using PGSemaphores. Unless semaphores + * are implemented in a way that doesn't involve a kernel call, this + * is too slow to be very useful :-( + * + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/lmgr/spin.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/pg_sema.h" +#include "storage/shmem.h" +#include "storage/spin.h" + + +#ifndef HAVE_SPINLOCKS + +/* + * No TAS, so spinlocks are implemented as PGSemaphores. + */ + +#ifndef HAVE_ATOMICS +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES) +#else +#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES) +#endif /* HAVE_ATOMICS */ + +PGSemaphore *SpinlockSemaArray; + +#else /* !HAVE_SPINLOCKS */ + +#define NUM_EMULATION_SEMAPHORES 0 + +#endif /* HAVE_SPINLOCKS */ + +/* + * Report the amount of shared memory needed to store semaphores for spinlock + * support. + */ +Size +SpinlockSemaSize(void) +{ + return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore); +} + +/* + * Report number of semaphores needed to support spinlocks. 
+ */ +int +SpinlockSemas(void) +{ + return NUM_EMULATION_SEMAPHORES; +} + +#ifndef HAVE_SPINLOCKS + +/* + * Initialize spinlock emulation. + * + * This must be called after PGReserveSemaphores(). + */ +void +SpinlockSemaInit(void) +{ + PGSemaphore *spinsemas; + int nsemas = SpinlockSemas(); + int i; + + /* + * We must use ShmemAllocUnlocked(), since the spinlock protecting + * ShmemAlloc() obviously can't be ready yet. + */ + spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize()); + for (i = 0; i < nsemas; ++i) + spinsemas[i] = PGSemaphoreCreate(); + SpinlockSemaArray = spinsemas; +} + +/* + * s_lock.h hardware-spinlock emulation using semaphores + * + * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to + * map multiple spinlocks onto one semaphore because no process should ever + * hold more than one at a time. We just need enough semaphores so that we + * aren't adding too much extra contention from that. + * + * There is one exception to the restriction of only holding one spinlock at a + * time, which is that it's ok if emulated atomic operations are nested inside + * spinlocks. To avoid the danger of spinlocks and atomic using the same sema, + * we make sure "normal" spinlocks and atomics backed by spinlocks use + * distinct semaphores (see the nested argument to s_init_lock_sema). + * + * slock_t is just an int for this implementation; it holds the spinlock + * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0 + * is not a valid value, so that testing with this code can help find + * failures to initialize spinlocks. + */ + +static inline void +s_check_valid(int lockndx) +{ + if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES)) + elog(ERROR, "invalid spinlock number: %d", lockndx); +} + +void +s_init_lock_sema(volatile slock_t *lock, bool nested) +{ + static uint32 counter = 0; + uint32 offset; + uint32 sema_total; + uint32 idx; + + if (nested) + { + /* + * To allow nesting atomics inside spinlocked sections, use a + * different spinlock. See comment above. 
+ */ + offset = 1 + NUM_SPINLOCK_SEMAPHORES; + sema_total = NUM_ATOMICS_SEMAPHORES; + } + else + { + offset = 1; + sema_total = NUM_SPINLOCK_SEMAPHORES; + } + + idx = (counter++ % sema_total) + offset; + + /* double check we did things correctly */ + s_check_valid(idx); + + *lock = idx; +} + +void +s_unlock_sema(volatile slock_t *lock) +{ + int lockndx = *lock; + + s_check_valid(lockndx); + + PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]); +} + +bool +s_lock_free_sema(volatile slock_t *lock) +{ + /* We don't currently use S_LOCK_FREE anyway */ + elog(ERROR, "spin.c does not support S_LOCK_FREE()"); + return false; +} + +int +tas_sema(volatile slock_t *lock) +{ + int lockndx = *lock; + + s_check_valid(lockndx); + + /* Note that TAS macros return 0 if *success* */ + return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]); +} + +#endif /* !HAVE_SPINLOCKS */ diff --git a/src/backend/storage/meson.build b/src/backend/storage/meson.build new file mode 100644 index 0000000..6ea9faa --- /dev/null +++ b/src/backend/storage/meson.build @@ -0,0 +1,11 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +subdir('buffer') +subdir('file') +subdir('freespace') +subdir('ipc') +subdir('large_object') +subdir('lmgr') +subdir('page') +subdir('smgr') +subdir('sync') diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile new file mode 100644 index 0000000..da539b1 --- /dev/null +++ b/src/backend/storage/page/Makefile @@ -0,0 +1,23 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/page +# +# IDENTIFICATION +# src/backend/storage/page/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/page +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + bufpage.o \ + checksum.o \ + itemptr.o + +include $(top_srcdir)/src/backend/common.mk + +# Provide special optimization flags for checksum.c +checksum.o: CFLAGS += ${CFLAGS_UNROLL_LOOPS} ${CFLAGS_VECTORIZE} diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README new file mode 100644 index 0000000..e30d7ac --- /dev/null +++ b/src/backend/storage/page/README @@ -0,0 +1,64 @@ +src/backend/storage/page/README + +Checksums +--------- + +Checksums on data pages are designed to detect corruption by the I/O system. +We do not protect buffers against uncorrectable memory errors, since these +have a very low measured incidence according to research on large server farms, +http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed +2010/12/22 on -hackers list. + +Current implementation requires this be enabled system-wide at initdb time, or +by using the pg_checksums tool on an offline cluster. + +The checksum is not valid at all times on a data page!! +The checksum is valid when the page leaves the shared pool and is checked +when it later re-enters the shared pool as a result of I/O. +We set the checksum on a buffer in the shared pool immediately before we +flush the buffer. As a result we implicitly invalidate the page's checksum +when we modify the page for a data change or even a hint. This means that +many or even most pages in shared buffers have invalid page checksums, +so be careful how you interpret the pd_checksum field. + +That means that WAL-logged changes to a page do NOT update the page checksum, +so full page images may not have a valid checksum. 
But those page images have +the WAL CRC covering them and so are verified separately from this +mechanism. WAL replay should not test the checksum of a full-page image. + +The best way to understand this is that WAL CRCs protect records entering the +WAL stream, and data page verification protects blocks entering the shared +buffer pool. They are similar in purpose, yet completely separate. Together +they ensure we are able to detect errors in data re-entering +PostgreSQL-controlled memory. Note also that the WAL checksum is a 32-bit CRC, +whereas the page checksum is only 16-bits. + +Any write of a data block can cause a torn page if the write is unsuccessful. +Full page writes protect us from that, which are stored in WAL. Setting hint +bits when a page is already dirty is OK because a full page write must already +have been written for it since the last checkpoint. Setting hint bits on an +otherwise clean page can allow torn pages; this doesn't normally matter since +they are just hints, but when the page has checksums, then losing a few bits +would cause the checksum to be invalid. So if we have full_page_writes = on +and checksums enabled then we must write a WAL record specifically so that we +record a full page image in WAL. Hint bits updates should be protected using +MarkBufferDirtyHint(), which is responsible for writing the full-page image +when necessary. + +Note that when we write a page checksum we include the hopefully zeroed bytes +that form the hole in the centre of a standard page. Thus, when we read the +block back from storage we implicitly check that the hole is still all zeroes. +We do this to ensure that we spot errors that could have destroyed data even +if they haven't actually done so. Full page images stored in WAL do *not* +check that the hole is all zero; the data in the hole is simply skipped and +re-zeroed if the backup block is reapplied. We do this because a failure in +WAL is a fatal error and prevents further recovery, whereas a checksum failure +on a normal data block is a hard error but not a critical one for the server, +even if it is a very bad thing for the user. + +New WAL records cannot be written during recovery, so hint bits set during +recovery must not dirty the page if the buffer is not already dirty, when +checksums are enabled. Systems in Hot-Standby mode may benefit from hint bits +being set, but with checksums enabled, a page cannot be dirtied after setting a +hint bit (due to the torn page risk). So, it must wait for full-page images +containing the hint bit updates to arrive from the primary. diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c new file mode 100644 index 0000000..9a302dd --- /dev/null +++ b/src/backend/storage/page/bufpage.c @@ -0,0 +1,1549 @@ +/*------------------------------------------------------------------------- + * + * bufpage.c + * POSTGRES standard buffer page code. 
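The hint-bit rule in the README above comes down to which dirtying function a caller uses: a change that is only a hint must go through MarkBufferDirtyHint(), so the buffer manager can emit a full-page image first when checksums (or wal_log_hints) require it. A sketch of a typical hint-bit update; the tuple and buffer variables are stand-ins, and the surrounding visibility logic is omitted:

    /* Hypothetical hint-bit update on a heap tuple header. */
    tuple->t_infomask |= HEAP_XMIN_COMMITTED;
    MarkBufferDirtyHint(buffer, true);  /* true = page has a standard layout */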
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/bufpage.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/itup.h" +#include "access/xlog.h" +#include "pgstat.h" +#include "storage/checksum.h" +#include "utils/memdebug.h" +#include "utils/memutils.h" + + +/* GUC variable */ +bool ignore_checksum_failure = false; + + +/* ---------------------------------------------------------------- + * Page support functions + * ---------------------------------------------------------------- + */ + +/* + * PageInit + * Initializes the contents of a page. + * Note that we don't calculate an initial checksum here; that's not done + * until it's time to write. + */ +void +PageInit(Page page, Size pageSize, Size specialSize) +{ + PageHeader p = (PageHeader) page; + + specialSize = MAXALIGN(specialSize); + + Assert(pageSize == BLCKSZ); + Assert(pageSize > specialSize + SizeOfPageHeaderData); + + /* Make sure all fields of page are zero, as well as unused space */ + MemSet(p, 0, pageSize); + + p->pd_flags = 0; + p->pd_lower = SizeOfPageHeaderData; + p->pd_upper = pageSize - specialSize; + p->pd_special = pageSize - specialSize; + PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION); + /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */ +} + + +/* + * PageIsVerifiedExtended + * Check that the page header and checksum (if any) appear valid. + * + * This is called when a page has just been read in from disk. The idea is + * to cheaply detect trashed pages before we go nuts following bogus line + * pointers, testing invalid transaction identifiers, etc. + * + * It turns out to be necessary to allow zeroed pages here too. Even though + * this routine is *not* called when deliberately adding a page to a relation, + * there are scenarios in which a zeroed page might be found in a table. + * (Example: a backend extends a relation, then crashes before it can write + * any WAL entry about the new page. The kernel will already have the + * zeroed page in the file, and it will stay that way after restart.) So we + * allow zeroed pages here, and are careful that the page access macros + * treat such a page as empty and without free space. Eventually, VACUUM + * will clean up such a page and make it usable. + * + * If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of + * a checksum failure. + * + * If flag PIV_REPORT_STAT is set, a checksum failure is reported directly + * to pgstat. + */ +bool +PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags) +{ + PageHeader p = (PageHeader) page; + size_t *pagebytes; + int i; + bool checksum_failure = false; + bool header_sane = false; + bool all_zeroes = false; + uint16 checksum = 0; + + /* + * Don't verify page data unless the page passes basic non-zero test + */ + if (!PageIsNew(page)) + { + if (DataChecksumsEnabled()) + { + checksum = pg_checksum_page((char *) page, blkno); + + if (checksum != p->pd_checksum) + checksum_failure = true; + } + + /* + * The following checks don't prove the header is correct, only that + * it looks sane enough to allow into the buffer pool. Later usage of + * the block can still reveal problems, which is why we offer the + * checksum option. 
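A caller that has just read a block in from disk typically passes both flags, so a checksum failure is logged and counted even though the caller only sees a boolean. A sketch of that call; the bufBlock and blockNum variables and the error wording are illustrative, not the exact buffer-manager code:

    /* Hypothetical read path: verify a page just read from storage. */
    if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
                                PIV_LOG_WARNING | PIV_REPORT_STAT))
        ereport(ERROR,
                (errcode(ERRCODE_DATA_CORRUPTED),
                 errmsg("invalid page in block %u", blockNum)));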
+ */ + if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 && + p->pd_lower <= p->pd_upper && + p->pd_upper <= p->pd_special && + p->pd_special <= BLCKSZ && + p->pd_special == MAXALIGN(p->pd_special)) + header_sane = true; + + if (header_sane && !checksum_failure) + return true; + } + + /* Check all-zeroes case */ + all_zeroes = true; + pagebytes = (size_t *) page; + for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++) + { + if (pagebytes[i] != 0) + { + all_zeroes = false; + break; + } + } + + if (all_zeroes) + return true; + + /* + * Throw a WARNING if the checksum fails, but only after we've checked for + * the all-zeroes case. + */ + if (checksum_failure) + { + if ((flags & PIV_LOG_WARNING) != 0) + ereport(WARNING, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("page verification failed, calculated checksum %u but expected %u", + checksum, p->pd_checksum))); + + if ((flags & PIV_REPORT_STAT) != 0) + pgstat_report_checksum_failure(); + + if (header_sane && ignore_checksum_failure) + return true; + } + + return false; +} + + +/* + * PageAddItemExtended + * + * Add an item to a page. Return value is the offset at which it was + * inserted, or InvalidOffsetNumber if the item is not inserted for any + * reason. A WARNING is issued indicating the reason for the refusal. + * + * offsetNumber must be either InvalidOffsetNumber to specify finding a + * free line pointer, or a value between FirstOffsetNumber and one past + * the last existing item, to specify using that particular line pointer. + * + * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store + * the item at the specified offsetNumber, which must be either a + * currently-unused line pointer, or one past the last existing item. + * + * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert + * the item at the specified offsetNumber, moving existing items later + * in the array to make room. + * + * If offsetNumber is not valid, then assign a slot by finding the first + * one that is both unused and deallocated. + * + * If flag PAI_IS_HEAP is set, we enforce that there can't be more than + * MaxHeapTuplesPerPage line pointers on the page. + * + * !!! EREPORT(ERROR) IS DISALLOWED HERE !!! + */ +OffsetNumber +PageAddItemExtended(Page page, + Item item, + Size size, + OffsetNumber offsetNumber, + int flags) +{ + PageHeader phdr = (PageHeader) page; + Size alignedSize; + int lower; + int upper; + ItemId itemId; + OffsetNumber limit; + bool needshuffle = false; + + /* + * Be wary about corrupted page pointers + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ) + ereport(PANIC, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + /* + * Select offsetNumber to place the new item at + */ + limit = OffsetNumberNext(PageGetMaxOffsetNumber(page)); + + /* was offsetNumber passed in? 
*/ + if (OffsetNumberIsValid(offsetNumber)) + { + /* yes, check it */ + if ((flags & PAI_OVERWRITE) != 0) + { + if (offsetNumber < limit) + { + itemId = PageGetItemId(page, offsetNumber); + if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId)) + { + elog(WARNING, "will not overwrite a used ItemId"); + return InvalidOffsetNumber; + } + } + } + else + { + if (offsetNumber < limit) + needshuffle = true; /* need to move existing linp's */ + } + } + else + { + /* offsetNumber was not passed in, so find a free slot */ + /* if no free slot, we'll put it at limit (1st open slot) */ + if (PageHasFreeLinePointers(page)) + { + /* + * Scan line pointer array to locate a "recyclable" (unused) + * ItemId. + * + * Always use earlier items first. PageTruncateLinePointerArray + * can only truncate unused items when they appear as a contiguous + * group at the end of the line pointer array. + */ + for (offsetNumber = FirstOffsetNumber; + offsetNumber < limit; /* limit is maxoff+1 */ + offsetNumber++) + { + itemId = PageGetItemId(page, offsetNumber); + + /* + * We check for no storage as well, just to be paranoid; + * unused items should never have storage. Assert() that the + * invariant is respected too. + */ + Assert(ItemIdIsUsed(itemId) || !ItemIdHasStorage(itemId)); + + if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId)) + break; + } + if (offsetNumber >= limit) + { + /* the hint is wrong, so reset it */ + PageClearHasFreeLinePointers(page); + } + } + else + { + /* don't bother searching if hint says there's no free slot */ + offsetNumber = limit; + } + } + + /* Reject placing items beyond the first unused line pointer */ + if (offsetNumber > limit) + { + elog(WARNING, "specified item offset is too large"); + return InvalidOffsetNumber; + } + + /* Reject placing items beyond heap boundary, if heap */ + if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage) + { + elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page"); + return InvalidOffsetNumber; + } + + /* + * Compute new lower and upper pointers for page, see if it'll fit. + * + * Note: do arithmetic as signed ints, to avoid mistakes if, say, + * alignedSize > pd_upper. + */ + if (offsetNumber == limit || needshuffle) + lower = phdr->pd_lower + sizeof(ItemIdData); + else + lower = phdr->pd_lower; + + alignedSize = MAXALIGN(size); + + upper = (int) phdr->pd_upper - (int) alignedSize; + + if (lower > upper) + return InvalidOffsetNumber; + + /* + * OK to insert the item. First, shuffle the existing pointers if needed. + */ + itemId = PageGetItemId(page, offsetNumber); + + if (needshuffle) + memmove(itemId + 1, itemId, + (limit - offsetNumber) * sizeof(ItemIdData)); + + /* set the line pointer */ + ItemIdSetNormal(itemId, upper, size); + + /* + * Items normally contain no uninitialized bytes. Core bufpage consumers + * conform, but this is not a necessary coding rule; a new index AM could + * opt to depart from it. However, data type input functions and other + * C-language functions that synthesize datums should initialize all + * bytes; datumIsEqual() relies on this. Testing here, along with the + * similar check in printtup(), helps to catch such mistakes. + * + * Values of the "name" type retrieved via index-only scans may contain + * uninitialized bytes; see comment in btrescan(). Valgrind will report + * this as an error, but it is safe to ignore. 
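Aside (not part of the patch): the recyclable-slot search above is a plain left-to-right scan over the line pointer array, guarded by the PD_HAS_FREE_LINES hint so that pages with no unused slots are not rescanned every time. A minimal sketch of the same idea over a toy array of "used" flags (hypothetical names, not the real ItemId API):

#include <stdbool.h>

/*
 * Return the index of the first unused slot, or nslots if none.  If the
 * hint claimed a free slot but none was found, clear the hint so future
 * callers can skip the scan -- same idea as PageClearHasFreeLinePointers().
 * Preferring the earliest slot keeps trailing slots free for truncation.
 */
static int
toy_find_free_slot(const bool *slot_used, int nslots, bool *has_free_hint)
{
    if (!*has_free_hint)
        return nslots;              /* don't bother searching */

    for (int i = 0; i < nslots; i++)
    {
        if (!slot_used[i])
            return i;               /* earliest unused slot wins */
    }

    *has_free_hint = false;         /* hint was stale; fix it */
    return nslots;
}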
+ */ + VALGRIND_CHECK_MEM_IS_DEFINED(item, size); + + /* copy the item's data onto the page */ + memcpy((char *) page + upper, item, size); + + /* adjust page header */ + phdr->pd_lower = (LocationIndex) lower; + phdr->pd_upper = (LocationIndex) upper; + + return offsetNumber; +} + + +/* + * PageGetTempPage + * Get a temporary page in local memory for special processing. + * The returned page is not initialized at all; caller must do that. + */ +Page +PageGetTempPage(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + return temp; +} + +/* + * PageGetTempPageCopy + * Get a temporary page in local memory for special processing. + * The page is initialized by copying the contents of the given page. + */ +Page +PageGetTempPageCopy(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + memcpy(temp, page, pageSize); + + return temp; +} + +/* + * PageGetTempPageCopySpecial + * Get a temporary page in local memory for special processing. + * The page is PageInit'd with the same special-space size as the + * given page, and the special space is copied from the given page. + */ +Page +PageGetTempPageCopySpecial(Page page) +{ + Size pageSize; + Page temp; + + pageSize = PageGetPageSize(page); + temp = (Page) palloc(pageSize); + + PageInit(temp, pageSize, PageGetSpecialSize(page)); + memcpy(PageGetSpecialPointer(temp), + PageGetSpecialPointer(page), + PageGetSpecialSize(page)); + + return temp; +} + +/* + * PageRestoreTempPage + * Copy temporary page back to permanent page after special processing + * and release the temporary page. + */ +void +PageRestoreTempPage(Page tempPage, Page oldPage) +{ + Size pageSize; + + pageSize = PageGetPageSize(tempPage); + memcpy((char *) oldPage, (char *) tempPage, pageSize); + + pfree(tempPage); +} + +/* + * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete + */ +typedef struct itemIdCompactData +{ + uint16 offsetindex; /* linp array index */ + int16 itemoff; /* page offset of item data */ + uint16 alignedlen; /* MAXALIGN(item data len) */ +} itemIdCompactData; +typedef itemIdCompactData *itemIdCompact; + +/* + * After removing or marking some line pointers unused, move the tuples to + * remove the gaps caused by the removed items and reorder them back into + * reverse line pointer order in the page. + * + * This function can often be fairly hot, so it pays to take some measures to + * make it as optimal as possible. + * + * Callers may pass 'presorted' as true if the 'itemidbase' array is sorted in + * descending order of itemoff. When this is true we can just memmove() + * tuples towards the end of the page. This is quite a common case as it's + * the order that tuples are initially inserted into pages. When we call this + * function to defragment the tuples in the page then any new line pointers + * added to the page will keep that presorted order, so hitting this case is + * still very common for tables that are commonly updated. + * + * When the 'itemidbase' array is not presorted then we're unable to just + * memmove() tuples around freely. Doing so could cause us to overwrite the + * memory belonging to a tuple we've not moved yet. In this case, we copy all + * the tuples that need to be moved into a temporary buffer. We can then + * simply memcpy() out of that temp buffer back into the page at the correct + * location. 
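Aside (not part of the patch): the fit test in PageAddItemExtended() is plain arithmetic -- a new item costs MAXALIGN(size) bytes of tuple space plus, unless it reuses an existing line pointer, one ItemIdData worth of pointer space -- and it is done in signed ints so an oversized item cannot wrap around. A standalone sketch with toy constants (an assumed 8-byte MAXALIGN and 4-byte line pointers):

#include <stdbool.h>
#include <stdio.h>

#define TOY_MAXALIGN(x)   (((x) + 7) & ~7)   /* assume 8-byte MAXALIGN */
#define TOY_ITEMID_SIZE   4                  /* stand-in for sizeof(ItemIdData) */

/*
 * True if an item of 'size' bytes fits between pd_lower and pd_upper;
 * mirrors the signed-int fit test in PageAddItemExtended().
 * 'needs_new_linp' is true when a new line pointer must be allocated.
 */
static bool
toy_item_fits(int pd_lower, int pd_upper, int size, bool needs_new_linp)
{
    int lower = pd_lower + (needs_new_linp ? TOY_ITEMID_SIZE : 0);
    int upper = pd_upper - TOY_MAXALIGN(size);   /* may go negative */

    return lower <= upper;
}

int
main(void)
{
    /* a 20-byte item really consumes 24 bytes plus a 4-byte line pointer */
    printf("fits: %d\n", toy_item_fits(64, 96, 20, true));   /* 68 <= 72 -> 1 */
    printf("fits: %d\n", toy_item_fits(64, 80, 20, true));   /* 68 <= 56 -> 0 */
    return 0;
}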
Tuples are copied back into the page in the same order as the + * 'itemidbase' array, so we end up reordering the tuples back into reverse + * line pointer order. This will increase the chances of hitting the + * presorted case the next time around. + * + * Callers must ensure that nitems is > 0 + */ +static void +compactify_tuples(itemIdCompact itemidbase, int nitems, Page page, bool presorted) +{ + PageHeader phdr = (PageHeader) page; + Offset upper; + Offset copy_tail; + Offset copy_head; + itemIdCompact itemidptr; + int i; + + /* Code within will not work correctly if nitems == 0 */ + Assert(nitems > 0); + + if (presorted) + { + +#ifdef USE_ASSERT_CHECKING + { + /* + * Verify we've not gotten any new callers that are incorrectly + * passing a true presorted value. + */ + Offset lastoff = phdr->pd_special; + + for (i = 0; i < nitems; i++) + { + itemidptr = &itemidbase[i]; + + Assert(lastoff > itemidptr->itemoff); + + lastoff = itemidptr->itemoff; + } + } +#endif /* USE_ASSERT_CHECKING */ + + /* + * 'itemidbase' is already in the optimal order, i.e, lower item + * pointers have a higher offset. This allows us to memmove() the + * tuples up to the end of the page without having to worry about + * overwriting other tuples that have not been moved yet. + * + * There's a good chance that there are tuples already right at the + * end of the page that we can simply skip over because they're + * already in the correct location within the page. We'll do that + * first... + */ + upper = phdr->pd_special; + i = 0; + do + { + itemidptr = &itemidbase[i]; + if (upper != itemidptr->itemoff + itemidptr->alignedlen) + break; + upper -= itemidptr->alignedlen; + + i++; + } while (i < nitems); + + /* + * Now that we've found the first tuple that needs to be moved, we can + * do the tuple compactification. We try and make the least number of + * memmove() calls and only call memmove() when there's a gap. When + * we see a gap we just move all tuples after the gap up until the + * point of the last move operation. + */ + copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; + for (; i < nitems; i++) + { + ItemId lp; + + itemidptr = &itemidbase[i]; + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + + if (copy_head != itemidptr->itemoff + itemidptr->alignedlen) + { + memmove((char *) page + upper, + page + copy_head, + copy_tail - copy_head); + + /* + * We've now moved all tuples already seen, but not the + * current tuple, so we set the copy_tail to the end of this + * tuple so it can be moved in another iteration of the loop. + */ + copy_tail = itemidptr->itemoff + itemidptr->alignedlen; + } + /* shift the target offset down by the length of this tuple */ + upper -= itemidptr->alignedlen; + /* point the copy_head to the start of this tuple */ + copy_head = itemidptr->itemoff; + + /* update the line pointer to reference the new offset */ + lp->lp_off = upper; + } + + /* move the remaining tuples. */ + memmove((char *) page + upper, + page + copy_head, + copy_tail - copy_head); + } + else + { + PGAlignedBlock scratch; + char *scratchptr = scratch.data; + + /* + * Non-presorted case: The tuples in the itemidbase array may be in + * any order. So, in order to move these to the end of the page we + * must make a temp copy of each tuple that needs to be moved before + * we copy them back into the page at the new offset. 
+ * + * If a large percentage of tuples have been pruned (>75%) then we'll + * copy these into the temp buffer tuple-by-tuple, otherwise, we'll + * just do a single memcpy() for all tuples that need to be moved. + * When so many tuples have been removed there's likely to be a lot of + * gaps and it's unlikely that many non-movable tuples remain at the + * end of the page. + */ + if (nitems < PageGetMaxOffsetNumber(page) / 4) + { + i = 0; + do + { + itemidptr = &itemidbase[i]; + memcpy(scratchptr + itemidptr->itemoff, page + itemidptr->itemoff, + itemidptr->alignedlen); + i++; + } while (i < nitems); + + /* Set things up for the compactification code below */ + i = 0; + itemidptr = &itemidbase[0]; + upper = phdr->pd_special; + } + else + { + upper = phdr->pd_special; + + /* + * Many tuples are likely to already be in the correct location. + * There's no need to copy these into the temp buffer. Instead + * we'll just skip forward in the itemidbase array to the position + * that we do need to move tuples from so that the code below just + * leaves these ones alone. + */ + i = 0; + do + { + itemidptr = &itemidbase[i]; + if (upper != itemidptr->itemoff + itemidptr->alignedlen) + break; + upper -= itemidptr->alignedlen; + + i++; + } while (i < nitems); + + /* Copy all tuples that need to be moved into the temp buffer */ + memcpy(scratchptr + phdr->pd_upper, + page + phdr->pd_upper, + upper - phdr->pd_upper); + } + + /* + * Do the tuple compactification. itemidptr is already pointing to + * the first tuple that we're going to move. Here we collapse the + * memcpy calls for adjacent tuples into a single call. This is done + * by delaying the memcpy call until we find a gap that needs to be + * closed. + */ + copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen; + for (; i < nitems; i++) + { + ItemId lp; + + itemidptr = &itemidbase[i]; + lp = PageGetItemId(page, itemidptr->offsetindex + 1); + + /* copy pending tuples when we detect a gap */ + if (copy_head != itemidptr->itemoff + itemidptr->alignedlen) + { + memcpy((char *) page + upper, + scratchptr + copy_head, + copy_tail - copy_head); + + /* + * We've now copied all tuples already seen, but not the + * current tuple, so we set the copy_tail to the end of this + * tuple. + */ + copy_tail = itemidptr->itemoff + itemidptr->alignedlen; + } + /* shift the target offset down by the length of this tuple */ + upper -= itemidptr->alignedlen; + /* point the copy_head to the start of this tuple */ + copy_head = itemidptr->itemoff; + + /* update the line pointer to reference the new offset */ + lp->lp_off = upper; + } + + /* Copy the remaining chunk */ + memcpy((char *) page + upper, + scratchptr + copy_head, + copy_tail - copy_head); + } + + phdr->pd_upper = upper; +} + +/* + * PageRepairFragmentation + * + * Frees fragmented space on a heap page following pruning. + * + * This routine is usable for heap pages only, but see PageIndexMultiDelete. + * + * This routine removes unused line pointers from the end of the line pointer + * array. This is possible when dead heap-only tuples get removed by pruning, + * especially when there were HOT chains with several tuples each beforehand. + * + * Caller had better have a full cleanup lock on page's buffer. As a side + * effect the page's PD_HAS_FREE_LINES hint bit will be set or unset as + * needed. Caller might also need to account for a reduction in the length of + * the line pointer array following array truncation. 
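Aside (not part of the patch): the presorted branch of compactify_tuples() is the interesting one -- because item offsets decrease as line pointer indexes grow, tuples can be slid toward the special space with memmove() without clobbering anything not yet moved, and runs of already-adjacent tuples are coalesced into a single move. The following self-contained sketch of just that branch works on a toy bookkeeping array and updates the array in place instead of real line pointers:

#include <stdint.h>
#include <string.h>

/* Toy per-tuple bookkeeping, mirroring itemIdCompactData. */
typedef struct ToyItem
{
    uint16_t itemoff;      /* current offset of the tuple data in the page */
    uint16_t alignedlen;   /* MAXALIGN'd length of the tuple */
} ToyItem;

/*
 * Presorted case only: items[] is ordered by descending itemoff.  Skip the
 * tuples already packed against page_end, then batch contiguous runs into
 * single memmove() calls, flushing whenever a gap is found -- the same
 * structure as the presorted branch of compactify_tuples().  Returns the
 * new pd_upper; items[i].itemoff is rewritten to each tuple's new offset.
 */
static uint16_t
toy_compact_presorted(char *page, uint16_t page_end, ToyItem *items, int nitems)
{
    uint16_t upper = page_end;
    uint16_t copy_head, copy_tail;
    int i;

    for (i = 0; i < nitems; i++)
    {
        if (upper != items[i].itemoff + items[i].alignedlen)
            break;                           /* first tuple that must move */
        upper -= items[i].alignedlen;        /* already in place; skip it */
    }
    if (i == nitems)
        return upper;                        /* nothing to move at all */

    copy_tail = copy_head = (uint16_t) (items[i].itemoff + items[i].alignedlen);
    for (; i < nitems; i++)
    {
        if (copy_head != items[i].itemoff + items[i].alignedlen)
        {
            /* hit a gap: flush everything gathered so far in one move */
            memmove(page + upper, page + copy_head, copy_tail - copy_head);
            copy_tail = (uint16_t) (items[i].itemoff + items[i].alignedlen);
        }
        upper -= items[i].alignedlen;        /* target offset for this tuple */
        copy_head = items[i].itemoff;        /* pending run now starts here */
        items[i].itemoff = upper;            /* record the new location */
    }
    memmove(page + upper, page + copy_head, copy_tail - copy_head);
    return upper;
}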
+ */ +void +PageRepairFragmentation(Page page) +{ + Offset pd_lower = ((PageHeader) page)->pd_lower; + Offset pd_upper = ((PageHeader) page)->pd_upper; + Offset pd_special = ((PageHeader) page)->pd_special; + Offset last_offset; + itemIdCompactData itemidbase[MaxHeapTuplesPerPage]; + itemIdCompact itemidptr; + ItemId lp; + int nline, + nstorage, + nunused; + OffsetNumber finalusedlp = InvalidOffsetNumber; + int i; + Size totallen; + bool presorted = true; /* For now */ + + /* + * It's worth the trouble to be more paranoid here than in most places, + * because we are about to reshuffle data in (what is usually) a shared + * disk buffer. If we aren't careful then corrupted pointers, lengths, + * etc could cause us to clobber adjacent disk buffers, spreading the data + * loss further. So, check everything. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Run through the line pointer array and collect data about live items. + */ + nline = PageGetMaxOffsetNumber(page); + itemidptr = itemidbase; + nunused = totallen = 0; + last_offset = pd_special; + for (i = FirstOffsetNumber; i <= nline; i++) + { + lp = PageGetItemId(page, i); + if (ItemIdIsUsed(lp)) + { + if (ItemIdHasStorage(lp)) + { + itemidptr->offsetindex = i - 1; + itemidptr->itemoff = ItemIdGetOffset(lp); + + if (last_offset > itemidptr->itemoff) + last_offset = itemidptr->itemoff; + else + presorted = false; + + if (unlikely(itemidptr->itemoff < (int) pd_upper || + itemidptr->itemoff >= (int) pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: %u", + itemidptr->itemoff))); + itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp)); + totallen += itemidptr->alignedlen; + itemidptr++; + } + + finalusedlp = i; /* Could be the final non-LP_UNUSED item */ + } + else + { + /* Unused entries should have lp_len = 0, but make sure */ + Assert(!ItemIdHasStorage(lp)); + ItemIdSetUnused(lp); + nunused++; + } + } + + nstorage = itemidptr - itemidbase; + if (nstorage == 0) + { + /* Page is completely empty, so just reset it quickly */ + ((PageHeader) page)->pd_upper = pd_special; + } + else + { + /* Need to compact the page the hard way */ + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + compactify_tuples(itemidbase, nstorage, page, presorted); + } + + if (finalusedlp != nline) + { + /* The last line pointer is not the last used line pointer */ + int nunusedend = nline - finalusedlp; + + Assert(nunused >= nunusedend && nunusedend > 0); + + /* remove trailing unused line pointers from the count */ + nunused -= nunusedend; + /* truncate the line pointer array */ + ((PageHeader) page)->pd_lower -= (sizeof(ItemIdData) * nunusedend); + } + + /* Set hint bit for PageAddItemExtended */ + if (nunused > 0) + PageSetHasFreeLinePointers(page); + else + PageClearHasFreeLinePointers(page); +} + +/* + * PageTruncateLinePointerArray + * + * Removes unused line pointers at the end of the line pointer array. + * + * This routine is usable for heap pages only. It is called by VACUUM during + * its second pass over the heap. 
We expect at least one LP_UNUSED line + * pointer on the page (if VACUUM didn't have an LP_DEAD item on the page that + * it just set to LP_UNUSED then it should not call here). + * + * We avoid truncating the line pointer array to 0 items, if necessary by + * leaving behind a single remaining LP_UNUSED item. This is a little + * arbitrary, but it seems like a good idea to avoid leaving a PageIsEmpty() + * page behind. + * + * Caller can have either an exclusive lock or a full cleanup lock on page's + * buffer. The page's PD_HAS_FREE_LINES hint bit will be set or unset based + * on whether or not we leave behind any remaining LP_UNUSED items. + */ +void +PageTruncateLinePointerArray(Page page) +{ + PageHeader phdr = (PageHeader) page; + bool countdone = false, + sethint = false; + int nunusedend = 0; + + /* Scan line pointer array back-to-front */ + for (int i = PageGetMaxOffsetNumber(page); i >= FirstOffsetNumber; i--) + { + ItemId lp = PageGetItemId(page, i); + + if (!countdone && i > FirstOffsetNumber) + { + /* + * Still determining which line pointers from the end of the array + * will be truncated away. Either count another line pointer as + * safe to truncate, or notice that it's not safe to truncate + * additional line pointers (stop counting line pointers). + */ + if (!ItemIdIsUsed(lp)) + nunusedend++; + else + countdone = true; + } + else + { + /* + * Once we've stopped counting we still need to figure out if + * there are any remaining LP_UNUSED line pointers somewhere more + * towards the front of the array. + */ + if (!ItemIdIsUsed(lp)) + { + /* + * This is an unused line pointer that we won't be truncating + * away -- so there is at least one. Set hint on page. + */ + sethint = true; + break; + } + } + } + + if (nunusedend > 0) + { + phdr->pd_lower -= sizeof(ItemIdData) * nunusedend; + +#ifdef CLOBBER_FREED_MEMORY + memset((char *) page + phdr->pd_lower, 0x7F, + sizeof(ItemIdData) * nunusedend); +#endif + } + else + Assert(sethint); + + /* Set hint bit for PageAddItemExtended */ + if (sethint) + PageSetHasFreeLinePointers(page); + else + PageClearHasFreeLinePointers(page); +} + +/* + * PageGetFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. + */ +Size +PageGetFreeSpace(Page page) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < (int) sizeof(ItemIdData)) + return 0; + space -= sizeof(ItemIdData); + + return (Size) space; +} + +/* + * PageGetFreeSpaceForMultipleTuples + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for multiple new line pointers. + * + * Note: this should usually only be used on index pages. Use + * PageGetHeapFreeSpace on heap pages. + */ +Size +PageGetFreeSpaceForMultipleTuples(Page page, int ntups) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. 
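Aside (not part of the patch): both PageRepairFragmentation() and PageTruncateLinePointerArray() shrink the array the same way -- count unused entries at the tail and pull pd_lower back by that many ItemIdData slots, always keeping at least the first slot. A toy sketch of that counting and adjustment over an array of "used" flags:

#include <stdbool.h>
#include <stdint.h>

#define TOY_ITEMID_SIZE 4u   /* stand-in for sizeof(ItemIdData) */

/*
 * Count trailing unused slots (never counting slot 0, so at least one line
 * pointer always survives) and return the adjusted pd_lower, as the tail of
 * PageRepairFragmentation() and PageTruncateLinePointerArray() do.
 * slot_used[i] corresponds to line pointer offset i + 1.
 */
static uint16_t
toy_truncate_linp_array(const bool *slot_used, int nslots, uint16_t pd_lower)
{
    int nunusedend = 0;

    for (int i = nslots - 1; i > 0; i--)
    {
        if (slot_used[i])
            break;
        nunusedend++;
    }
    return (uint16_t) (pd_lower - TOY_ITEMID_SIZE * nunusedend);
}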
+ */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < (int) (ntups * sizeof(ItemIdData))) + return 0; + space -= ntups * sizeof(ItemIdData); + + return (Size) space; +} + +/* + * PageGetExactFreeSpace + * Returns the size of the free (allocatable) space on a page, + * without any consideration for adding/removing line pointers. + */ +Size +PageGetExactFreeSpace(Page page) +{ + int space; + + /* + * Use signed arithmetic here so that we behave sensibly if pd_lower > + * pd_upper. + */ + space = (int) ((PageHeader) page)->pd_upper - + (int) ((PageHeader) page)->pd_lower; + + if (space < 0) + return 0; + + return (Size) space; +} + + +/* + * PageGetHeapFreeSpace + * Returns the size of the free (allocatable) space on a page, + * reduced by the space needed for a new line pointer. + * + * The difference between this and PageGetFreeSpace is that this will return + * zero if there are already MaxHeapTuplesPerPage line pointers in the page + * and none are free. We use this to enforce that no more than + * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although + * no more tuples than that could fit anyway, in the presence of redirected + * or dead line pointers it'd be possible to have too many line pointers. + * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit + * on the number of line pointers, we make this extra check.) + */ +Size +PageGetHeapFreeSpace(Page page) +{ + Size space; + + space = PageGetFreeSpace(page); + if (space > 0) + { + OffsetNumber offnum, + nline; + + /* + * Are there already MaxHeapTuplesPerPage line pointers in the page? + */ + nline = PageGetMaxOffsetNumber(page); + if (nline >= MaxHeapTuplesPerPage) + { + if (PageHasFreeLinePointers(page)) + { + /* + * Since this is just a hint, we must confirm that there is + * indeed a free line pointer + */ + for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) + { + ItemId lp = PageGetItemId(page, offnum); + + if (!ItemIdIsUsed(lp)) + break; + } + + if (offnum > nline) + { + /* + * The hint is wrong, but we can't clear it here since we + * don't have the ability to mark the page dirty. + */ + space = 0; + } + } + else + { + /* + * Although the hint might be wrong, PageAddItem will believe + * it anyway, so we must believe it too. + */ + space = 0; + } + } + } + return space; +} + + +/* + * PageIndexTupleDelete + * + * This routine does the work of removing a tuple from an index page. + * + * Unlike heap pages, we compact out the line pointer for the removed tuple. + */ +void +PageIndexTupleDelete(Page page, OffsetNumber offnum) +{ + PageHeader phdr = (PageHeader) page; + char *addr; + ItemId tup; + Size size; + unsigned offset; + int nbytes; + int offidx; + int nline; + + /* + * As with PageRepairFragmentation, paranoia seems justified. 
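Aside (not part of the patch): the free-space functions above share one pattern -- compute pd_upper - pd_lower in signed arithmetic, so that a corrupted pd_lower > pd_upper yields zero rather than a huge unsigned value, and subtract the line-pointer overhead the caller will need. A compact sketch covering the three flavors:

#include <stddef.h>

#define TOY_ITEMID_SIZE 4   /* stand-in for sizeof(ItemIdData) */

/*
 * reserved_linps = 0 behaves like PageGetExactFreeSpace(), 1 like
 * PageGetFreeSpace(), and n like PageGetFreeSpaceForMultipleTuples().
 * The MaxHeapTuplesPerPage cap of PageGetHeapFreeSpace() is not modeled.
 */
static size_t
toy_free_space(unsigned pd_lower, unsigned pd_upper, int reserved_linps)
{
    int space = (int) pd_upper - (int) pd_lower;   /* signed on purpose */

    if (space < reserved_linps * TOY_ITEMID_SIZE)
        return 0;
    return (size_t) (space - reserved_linps * TOY_ITEMID_SIZE);
}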
+ */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + nline = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > nline) + elog(ERROR, "invalid index offnum: %u", offnum); + + /* change offset number to offset index */ + offidx = offnum - 1; + + tup = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tup)); + size = ItemIdGetLength(tup); + offset = ItemIdGetOffset(tup); + + if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + /* Amount of space to actually be deleted */ + size = MAXALIGN(size); + + /* + * First, we want to get rid of the pd_linp entry for the index tuple. We + * copy all subsequent linp's back one slot in the array. We don't use + * PageGetItemId, because we are manipulating the _array_, not individual + * linp's. + */ + nbytes = phdr->pd_lower - + ((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr); + + if (nbytes > 0) + memmove((char *) &(phdr->pd_linp[offidx]), + (char *) &(phdr->pd_linp[offidx + 1]), + nbytes); + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that space in + * the middle of the page is left free. If we've just deleted the tuple + * at the beginning of tuple space, then there's no need to do the copy. + */ + + /* beginning of tuple space */ + addr = (char *) page + phdr->pd_upper; + + if (offset > phdr->pd_upper) + memmove(addr + size, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointers */ + phdr->pd_upper += size; + phdr->pd_lower -= sizeof(ItemIdData); + + /* + * Finally, we need to adjust the linp entries that remain. + * + * Anything that used to be before the deleted tuple's data was moved + * forward by the size of the deleted tuple. + */ + if (!PageIsEmpty(page)) + { + int i; + + nline--; /* there's one less than when we started */ + for (i = 1; i <= nline; i++) + { + ItemId ii = PageGetItemId(page, i); + + Assert(ItemIdHasStorage(ii)); + if (ItemIdGetOffset(ii) <= offset) + ii->lp_off += size; + } + } +} + + +/* + * PageIndexMultiDelete + * + * This routine handles the case of deleting multiple tuples from an + * index page at once. It is considerably faster than a loop around + * PageIndexTupleDelete ... however, the caller *must* supply the array + * of item numbers to be deleted in item number order! 
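Aside (not part of the patch): the subtle part of PageIndexTupleDelete() is the bookkeeping after its two memmove()s -- every surviving tuple whose data sat at a lower offset than the deleted one has just been slid toward the special space, so its stored offset must grow by the deleted tuple's MAXALIGN'd size. A toy sketch of that final adjustment pass over an array of offsets (a simplified stand-in for walking ItemIds):

#include <stdint.h>

/*
 * After removing a tuple of removed_size bytes whose data started at
 * removed_off, bump the offsets of all tuples that used to live at or
 * below it, mirroring the last loop of PageIndexTupleDelete().
 */
static void
toy_adjust_offsets_after_delete(uint16_t *tuple_off, int ntuples,
                                uint16_t removed_off, uint16_t removed_size)
{
    for (int i = 0; i < ntuples; i++)
    {
        if (tuple_off[i] <= removed_off)
            tuple_off[i] = (uint16_t) (tuple_off[i] + removed_size);
    }
}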
+ */ +void +PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems) +{ + PageHeader phdr = (PageHeader) page; + Offset pd_lower = phdr->pd_lower; + Offset pd_upper = phdr->pd_upper; + Offset pd_special = phdr->pd_special; + Offset last_offset; + itemIdCompactData itemidbase[MaxIndexTuplesPerPage]; + ItemIdData newitemids[MaxIndexTuplesPerPage]; + itemIdCompact itemidptr; + ItemId lp; + int nline, + nused; + Size totallen; + Size size; + unsigned offset; + int nextitm; + OffsetNumber offnum; + bool presorted = true; /* For now */ + + Assert(nitems <= MaxIndexTuplesPerPage); + + /* + * If there aren't very many items to delete, then retail + * PageIndexTupleDelete is the best way. Delete the items in reverse + * order so we don't have to think about adjusting item numbers for + * previous deletions. + * + * TODO: tune the magic number here + */ + if (nitems <= 2) + { + while (--nitems >= 0) + PageIndexTupleDelete(page, itemnos[nitems]); + return; + } + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (pd_lower < SizeOfPageHeaderData || + pd_lower > pd_upper || + pd_upper > pd_special || + pd_special > BLCKSZ || + pd_special != MAXALIGN(pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + pd_lower, pd_upper, pd_special))); + + /* + * Scan the line pointer array and build a list of just the ones we are + * going to keep. Notice we do not modify the page yet, since we are + * still validity-checking. + */ + nline = PageGetMaxOffsetNumber(page); + itemidptr = itemidbase; + totallen = 0; + nused = 0; + nextitm = 0; + last_offset = pd_special; + for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum)) + { + lp = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(lp)); + size = ItemIdGetLength(lp); + offset = ItemIdGetOffset(lp); + if (offset < pd_upper || + (offset + size) > pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + if (nextitm < nitems && offnum == itemnos[nextitm]) + { + /* skip item to be deleted */ + nextitm++; + } + else + { + itemidptr->offsetindex = nused; /* where it will go */ + itemidptr->itemoff = offset; + + if (last_offset > itemidptr->itemoff) + last_offset = itemidptr->itemoff; + else + presorted = false; + + itemidptr->alignedlen = MAXALIGN(size); + totallen += itemidptr->alignedlen; + newitemids[nused] = *lp; + itemidptr++; + nused++; + } + } + + /* this will catch invalid or out-of-order itemnos[] */ + if (nextitm != nitems) + elog(ERROR, "incorrect index offsets supplied"); + + if (totallen > (Size) (pd_special - pd_lower)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted item lengths: total %u, available space %u", + (unsigned int) totallen, pd_special - pd_lower))); + + /* + * Looks good. Overwrite the line pointers with the copy, from which we've + * removed all the unused items. 
+ */ + memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData)); + phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData); + + /* and compactify the tuple data */ + if (nused > 0) + compactify_tuples(itemidbase, nused, page, presorted); + else + phdr->pd_upper = pd_special; +} + + +/* + * PageIndexTupleDeleteNoCompact + * + * Remove the specified tuple from an index page, but set its line pointer + * to "unused" instead of compacting it out, except that it can be removed + * if it's the last line pointer on the page. + * + * This is used for index AMs that require that existing TIDs of live tuples + * remain unchanged, and are willing to allow unused line pointers instead. + */ +void +PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum) +{ + PageHeader phdr = (PageHeader) page; + char *addr; + ItemId tup; + Size size; + unsigned offset; + int nline; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + nline = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > nline) + elog(ERROR, "invalid index offnum: %u", offnum); + + tup = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tup)); + size = ItemIdGetLength(tup); + offset = ItemIdGetOffset(tup); + + if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) size))); + + /* Amount of space to actually be deleted */ + size = MAXALIGN(size); + + /* + * Either set the line pointer to "unused", or zap it if it's the last + * one. (Note: it's possible that the next-to-last one(s) are already + * unused, but we do not trouble to try to compact them out if so.) + */ + if ((int) offnum < nline) + ItemIdSetUnused(tup); + else + { + phdr->pd_lower -= sizeof(ItemIdData); + nline--; /* there's one less than when we started */ + } + + /* + * Now move everything between the old upper bound (beginning of tuple + * space) and the beginning of the deleted tuple forward, so that space in + * the middle of the page is left free. If we've just deleted the tuple + * at the beginning of tuple space, then there's no need to do the copy. + */ + + /* beginning of tuple space */ + addr = (char *) page + phdr->pd_upper; + + if (offset > phdr->pd_upper) + memmove(addr + size, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointer */ + phdr->pd_upper += size; + + /* + * Finally, we need to adjust the linp entries that remain. + * + * Anything that used to be before the deleted tuple's data was moved + * forward by the size of the deleted tuple. + */ + if (!PageIsEmpty(page)) + { + int i; + + for (i = 1; i <= nline; i++) + { + ItemId ii = PageGetItemId(page, i); + + if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset) + ii->lp_off += size; + } + } +} + + +/* + * PageIndexTupleOverwrite + * + * Replace a specified tuple on an index page. + * + * The new tuple is placed exactly where the old one had been, shifting + * other tuples' data up or down as needed to keep the page compacted. 
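Aside (not part of the patch): PageIndexMultiDelete() relies on itemnos[] being sorted, which lets it walk the line pointer array and the delete list in lockstep -- one pass builds the compacted keep-list, then a single memcpy() rewrites pd_linp. A rough sketch of that merge-style filter over toy arrays (line pointers treated as opaque 4-byte words; validity checks omitted):

#include <stdint.h>

/*
 * Copy the line pointers NOT listed in delete_offnums[] (1-based, ascending)
 * into keep[], returning how many were kept.  Mirrors the scan loop of
 * PageIndexMultiDelete(); the caller would also verify that every entry of
 * delete_offnums[] was consumed.
 */
static int
toy_filter_linps(const uint32_t *linps, int nline,
                 const uint16_t *delete_offnums, int ndelete,
                 uint32_t *keep)
{
    int nused = 0;
    int nextitm = 0;

    for (int offnum = 1; offnum <= nline; offnum++)
    {
        if (nextitm < ndelete && offnum == delete_offnums[nextitm])
            nextitm++;                      /* skip: this one is being deleted */
        else
            keep[nused++] = linps[offnum - 1];
    }
    return nused;
}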
+ * This is better than deleting and reinserting the tuple, because it + * avoids any data shifting when the tuple size doesn't change; and + * even when it does, we avoid moving the line pointers around. + * This could be used by an index AM that doesn't want to unset the + * LP_DEAD bit when it happens to be set. It could conceivably also be + * used by an index AM that cares about the physical order of tuples as + * well as their logical/ItemId order. + * + * If there's insufficient space for the new tuple, return false. Other + * errors represent data-corruption problems, so we just elog. + */ +bool +PageIndexTupleOverwrite(Page page, OffsetNumber offnum, + Item newtup, Size newsize) +{ + PageHeader phdr = (PageHeader) page; + ItemId tupid; + int oldsize; + unsigned offset; + Size alignednewsize; + int size_diff; + int itemcount; + + /* + * As with PageRepairFragmentation, paranoia seems justified. + */ + if (phdr->pd_lower < SizeOfPageHeaderData || + phdr->pd_lower > phdr->pd_upper || + phdr->pd_upper > phdr->pd_special || + phdr->pd_special > BLCKSZ || + phdr->pd_special != MAXALIGN(phdr->pd_special)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u", + phdr->pd_lower, phdr->pd_upper, phdr->pd_special))); + + itemcount = PageGetMaxOffsetNumber(page); + if ((int) offnum <= 0 || (int) offnum > itemcount) + elog(ERROR, "invalid index offnum: %u", offnum); + + tupid = PageGetItemId(page, offnum); + Assert(ItemIdHasStorage(tupid)); + oldsize = ItemIdGetLength(tupid); + offset = ItemIdGetOffset(tupid); + + if (offset < phdr->pd_upper || (offset + oldsize) > phdr->pd_special || + offset != MAXALIGN(offset)) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("corrupted line pointer: offset = %u, size = %u", + offset, (unsigned int) oldsize))); + + /* + * Determine actual change in space requirement, check for page overflow. + */ + oldsize = MAXALIGN(oldsize); + alignednewsize = MAXALIGN(newsize); + if (alignednewsize > oldsize + (phdr->pd_upper - phdr->pd_lower)) + return false; + + /* + * Relocate existing data and update line pointers, unless the new tuple + * is the same size as the old (after alignment), in which case there's + * nothing to do. Notice that what we have to relocate is data before the + * target tuple, not data after, so it's convenient to express size_diff + * as the amount by which the tuple's size is decreasing, making it the + * delta to add to pd_upper and affected line pointers. + */ + size_diff = oldsize - (int) alignednewsize; + if (size_diff != 0) + { + char *addr = (char *) page + phdr->pd_upper; + int i; + + /* relocate all tuple data before the target tuple */ + memmove(addr + size_diff, addr, offset - phdr->pd_upper); + + /* adjust free space boundary pointer */ + phdr->pd_upper += size_diff; + + /* adjust affected line pointers too */ + for (i = FirstOffsetNumber; i <= itemcount; i++) + { + ItemId ii = PageGetItemId(page, i); + + /* Allow items without storage; currently only BRIN needs that */ + if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset) + ii->lp_off += size_diff; + } + } + + /* Update the item's tuple length without changing its lp_flags field */ + tupid->lp_off = offset + size_diff; + tupid->lp_len = newsize; + + /* Copy new tuple data onto page */ + memcpy(PageGetItem(page, tupid), newtup, newsize); + + return true; +} + + +/* + * Set checksum for a page in shared buffers. 
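Aside (not part of the patch): in PageIndexTupleOverwrite(), size_diff is expressed as the amount by which the tuple shrinks, so the same delta can be added to pd_upper, to the affected line pointers, and to the target tuple's own offset. A small sketch of just that arithmetic on toy fields (hypothetical struct; results are only meaningful when the fit test passes):

#include <stdbool.h>
#include <stdint.h>

#define TOY_MAXALIGN(x) (((x) + 7u) & ~7u)   /* assume 8-byte MAXALIGN */

typedef struct ToyOverwrite
{
    bool     fits;        /* false means "return false, page unchanged" */
    uint16_t pd_upper;    /* new free-space boundary, if fits */
    uint16_t tuple_off;   /* new offset of the overwritten tuple, if fits */
} ToyOverwrite;

/* Same space check and delta arithmetic as PageIndexTupleOverwrite(). */
static ToyOverwrite
toy_overwrite(uint16_t pd_lower, uint16_t pd_upper,
              uint16_t tuple_off, unsigned oldsize, unsigned newsize)
{
    ToyOverwrite r;
    int size_diff = (int) TOY_MAXALIGN(oldsize) - (int) TOY_MAXALIGN(newsize);

    r.fits = TOY_MAXALIGN(newsize) <=
             TOY_MAXALIGN(oldsize) + (unsigned) (pd_upper - pd_lower);
    r.pd_upper = (uint16_t) (pd_upper + size_diff);    /* positive diff = tuple shrank */
    r.tuple_off = (uint16_t) (tuple_off + size_diff);
    return r;
}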
+ * + * If checksums are disabled, or if the page is not initialized, just return + * the input. Otherwise, we must make a copy of the page before calculating + * the checksum, to prevent concurrent modifications (e.g. setting hint bits) + * from making the final checksum invalid. It doesn't matter if we include or + * exclude hints during the copy, as long as we write a valid page and + * associated checksum. + * + * Returns a pointer to the block-sized data that needs to be written. Uses + * statically-allocated memory, so the caller must immediately write the + * returned page and not refer to it again. + */ +char * +PageSetChecksumCopy(Page page, BlockNumber blkno) +{ + static char *pageCopy = NULL; + + /* If we don't need a checksum, just return the passed-in data */ + if (PageIsNew(page) || !DataChecksumsEnabled()) + return (char *) page; + + /* + * We allocate the copy space once and use it over on each subsequent + * call. The point of palloc'ing here, rather than having a static char + * array, is first to ensure adequate alignment for the checksumming code + * and second to avoid wasting space in processes that never call this. + */ + if (pageCopy == NULL) + pageCopy = MemoryContextAllocAligned(TopMemoryContext, + BLCKSZ, + PG_IO_ALIGN_SIZE, + 0); + + memcpy(pageCopy, (char *) page, BLCKSZ); + ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno); + return pageCopy; +} + +/* + * Set checksum for a page in private memory. + * + * This must only be used when we know that no other process can be modifying + * the page buffer. + */ +void +PageSetChecksumInplace(Page page, BlockNumber blkno) +{ + /* If we don't need a checksum, just return */ + if (PageIsNew(page) || !DataChecksumsEnabled()) + return; + + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno); +} diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c new file mode 100644 index 0000000..81fd519 --- /dev/null +++ b/src/backend/storage/page/checksum.c @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * + * checksum.c + * Checksum implementation for data pages. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/page/checksum.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/checksum.h" +/* + * The actual code is in storage/checksum_impl.h. This is done so that + * external programs can incorporate the checksum code by #include'ing + * that file from the exported Postgres headers. (Compare our CRC code.) + */ +#include "storage/checksum_impl.h" diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c new file mode 100644 index 0000000..2c25ad5 --- /dev/null +++ b/src/backend/storage/page/itemptr.c @@ -0,0 +1,131 @@ +/*------------------------------------------------------------------------- + * + * itemptr.c + * POSTGRES disk item pointer code. 
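Aside (not part of the patch): the copy-before-checksum trick described above is worth isolating -- hint-bit setters may scribble on the shared buffer while it is being written, so the checksum is computed over, and the write is issued from, a private snapshot. A generic sketch of the pattern, with a placeholder hash standing in for pg_checksum_page() and a plain static buffer standing in for the aligned allocation the real code makes:

#include <stdint.h>
#include <string.h>

#define TOY_BLCKSZ 8192

/* Placeholder only; the real algorithm lives in storage/checksum_impl.h. */
static uint16_t
toy_checksum(const char *page, uint32_t blkno)
{
    uint32_t sum = blkno;

    for (int i = 0; i < TOY_BLCKSZ; i++)
        sum = sum * 31 + (unsigned char) page[i];
    return (uint16_t) (sum ^ (sum >> 16));
}

/*
 * Snapshot a possibly-concurrently-modified page, checksum the snapshot,
 * and hand the snapshot back for writing -- the caller must use it at once
 * and never write the original buffer.  Mirrors the shape of
 * PageSetChecksumCopy(); the real code also stores the checksum into the
 * copy's pd_checksum field.
 */
static char *
toy_checksum_copy(const char *shared_page, uint32_t blkno, uint16_t *checksum_out)
{
    static char copy[TOY_BLCKSZ];   /* reused across calls, like pageCopy */

    memcpy(copy, shared_page, TOY_BLCKSZ);
    *checksum_out = toy_checksum(copy, blkno);
    return copy;
}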
+ * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/page/itemptr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/itemptr.h" + + +/* + * We really want ItemPointerData to be exactly 6 bytes. + */ +StaticAssertDecl(sizeof(ItemPointerData) == 3 * sizeof(uint16), + "ItemPointerData struct is improperly padded"); + +/* + * ItemPointerEquals + * Returns true if both item pointers point to the same item, + * otherwise returns false. + * + * Note: + * Asserts that the disk item pointers are both valid! + */ +bool +ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2) +{ + if (ItemPointerGetBlockNumber(pointer1) == + ItemPointerGetBlockNumber(pointer2) && + ItemPointerGetOffsetNumber(pointer1) == + ItemPointerGetOffsetNumber(pointer2)) + return true; + else + return false; +} + +/* + * ItemPointerCompare + * Generic btree-style comparison for item pointers. + */ +int32 +ItemPointerCompare(ItemPointer arg1, ItemPointer arg2) +{ + /* + * Use ItemPointerGet{Offset,Block}NumberNoCheck to avoid asserting + * ip_posid != 0, which may not be true for a user-supplied TID. + */ + BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1); + BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2); + + if (b1 < b2) + return -1; + else if (b1 > b2) + return 1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) < + ItemPointerGetOffsetNumberNoCheck(arg2)) + return -1; + else if (ItemPointerGetOffsetNumberNoCheck(arg1) > + ItemPointerGetOffsetNumberNoCheck(arg2)) + return 1; + else + return 0; +} + +/* + * ItemPointerInc + * Increment 'pointer' by 1 only paying attention to the ItemPointer's + * type's range limits and not MaxOffsetNumber and FirstOffsetNumber. + * This may result in 'pointer' becoming !OffsetNumberIsValid. + * + * If the pointer is already the maximum possible values permitted by the + * range of the ItemPointer's types, then do nothing. + */ +void +ItemPointerInc(ItemPointer pointer) +{ + BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer); + OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer); + + if (off == PG_UINT16_MAX) + { + if (blk != InvalidBlockNumber) + { + off = 0; + blk++; + } + } + else + off++; + + ItemPointerSet(pointer, blk, off); +} + +/* + * ItemPointerDec + * Decrement 'pointer' by 1 only paying attention to the ItemPointer's + * type's range limits and not MaxOffsetNumber and FirstOffsetNumber. + * This may result in 'pointer' becoming !OffsetNumberIsValid. + * + * If the pointer is already the minimum possible values permitted by the + * range of the ItemPointer's types, then do nothing. This does rely on + * FirstOffsetNumber being 1 rather than 0. 
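Aside (not part of the patch): ItemPointerCompare() simply orders TIDs block-major -- block number first, offset second, with no validity assertions. One way to picture it is packing the pair into a single 48-bit key whose natural integer ordering gives the same result; the sketch below is an illustration of the ordering, not of how the backend actually stores TIDs:

#include <stdint.h>

/* Pack (block, offset) into one integer whose ordering matches
 * ItemPointerCompare(): block number in the high bits, offset in the low 16. */
static uint64_t
toy_tid_key(uint32_t block, uint16_t offset)
{
    return ((uint64_t) block << 16) | offset;
}

/* Same result as ItemPointerCompare(): -1, 0 or +1 in block-major order. */
static int
toy_tid_compare(uint32_t b1, uint16_t o1, uint32_t b2, uint16_t o2)
{
    uint64_t k1 = toy_tid_key(b1, o1);
    uint64_t k2 = toy_tid_key(b2, o2);

    return (k1 > k2) - (k1 < k2);
}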
+ */ +void +ItemPointerDec(ItemPointer pointer) +{ + BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer); + OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer); + + if (off == 0) + { + if (blk != 0) + { + off = PG_UINT16_MAX; + blk--; + } + } + else + off--; + + ItemPointerSet(pointer, blk, off); +} diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build new file mode 100644 index 0000000..2160a37 --- /dev/null +++ b/src/backend/storage/page/meson.build @@ -0,0 +1,7 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'bufpage.c', + 'checksum.c', + 'itemptr.c', +) diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile new file mode 100644 index 0000000..596b564 --- /dev/null +++ b/src/backend/storage/smgr/Makefile @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/smgr +# +# IDENTIFICATION +# src/backend/storage/smgr/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/smgr +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + md.o \ + smgr.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README new file mode 100644 index 0000000..cf3aa56 --- /dev/null +++ b/src/backend/storage/smgr/README @@ -0,0 +1,52 @@ +src/backend/storage/smgr/README + +Storage Managers +================ + +In the original Berkeley Postgres system, there were several storage managers, +of which only the "magnetic disk" manager remains. (At Berkeley there were +also managers for the Sony WORM optical disk jukebox and persistent main +memory, but these were never supported in any externally released Postgres, +nor in any version of PostgreSQL.) The "magnetic disk" manager is itself +seriously misnamed, because actually it supports any kind of device for +which the operating system provides standard filesystem operations; which +these days is pretty much everything of interest. However, we retain the +notion of a storage manager switch in case anyone ever wants to reintroduce +other kinds of storage managers. Removing the switch layer would save +nothing noticeable anyway, since storage-access operations are surely far +more expensive than one extra layer of C function calls. + +In Berkeley Postgres each relation was tagged with the ID of the storage +manager to use for it. This is gone. It would be probably more reasonable +to associate storage managers with tablespaces, should we ever re-introduce +multiple storage managers into the system catalogs. + +The files in this directory, and their contents, are + + smgr.c The storage manager switch dispatch code. The routines in + this file call the appropriate storage manager to do storage + accesses requested by higher-level code. smgr.c also manages + the file handle cache (SMgrRelation table). + + md.c The "magnetic disk" storage manager, which is really just + an interface to the kernel's filesystem operations. + +Note that md.c in turn relies on src/backend/storage/file/fd.c. + + +Relation Forks +============== + +Since 8.4, a single smgr relation can be comprised of multiple physical +files, called relation forks. 
This allows storing additional metadata like +Free Space information in additional forks, which can be grown and truncated +independently of the main data file, while still treating it all as a single +physical relation in system catalogs. + +It is assumed that the main fork, fork number 0 or MAIN_FORKNUM, always +exists. Fork numbers are assigned in src/include/common/relpath.h. +Functions in smgr.c and md.c take an extra fork number argument, in addition +to relfilelocator and block number, to identify which relation fork you want to +access. Since most code wants to access the main fork, a shortcut version of +ReadBuffer that accesses MAIN_FORKNUM is provided in the buffer manager for +convenience. diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c new file mode 100644 index 0000000..fdecbad --- /dev/null +++ b/src/backend/storage/smgr/md.c @@ -0,0 +1,1623 @@ +/*------------------------------------------------------------------------- + * + * md.c + * This code manages relations that reside on magnetic disk. + * + * Or at least, that was what the Berkeley folk had in mind when they named + * this file. In reality, what this code provides is an interface from + * the smgr API to Unix-like filesystem APIs, so it will work with any type + * of device for which the operating system provides filesystem support. + * It doesn't matter whether the bits are on spinning rust or some other + * storage technology. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/smgr/md.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include +#include + +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "pg_trace.h" +#include "pgstat.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/md.h" +#include "storage/relfilelocator.h" +#include "storage/smgr.h" +#include "storage/sync.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" + +/* + * The magnetic disk storage manager keeps track of open file + * descriptors in its own descriptor pool. This is done to make it + * easier to support relations that are larger than the operating + * system's file size limit (often 2GBytes). In order to do that, + * we break relations up into "segment" files that are each shorter than + * the OS file size limit. The segment size is set by the RELSEG_SIZE + * configuration constant in pg_config.h. + * + * On disk, a relation must consist of consecutively numbered segment + * files in the pattern + * -- Zero or more full segments of exactly RELSEG_SIZE blocks each + * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks + * -- Optionally, any number of inactive segments of size 0 blocks. + * The full and partial segments are collectively the "active" segments. + * Inactive segments are those that once contained data but are currently + * not needed because of an mdtruncate() operation. The reason for leaving + * them present at size zero, rather than unlinking them, is that other + * backends and/or the checkpointer might be holding open file references to + * such segments. 
If the relation expands again after mdtruncate(), such + * that a deactivated segment becomes active again, it is important that + * such file references still be valid --- else data might get written + * out to an unlinked old copy of a segment file that will eventually + * disappear. + * + * File descriptors are stored in the per-fork md_seg_fds arrays inside + * SMgrRelation. The length of these arrays is stored in md_num_open_segs. + * Note that a fork's md_num_open_segs having a specific value does not + * necessarily mean the relation doesn't have additional segments; we may + * just not have opened the next segment yet. (We could not have "all + * segments are in the array" as an invariant anyway, since another backend + * could extend the relation while we aren't looking.) We do not have + * entries for inactive segments, however; as soon as we find a partial + * segment, we assume that any subsequent segments are inactive. + * + * The entire MdfdVec array is palloc'd in the MdCxt memory context. + */ + +typedef struct _MdfdVec +{ + File mdfd_vfd; /* fd number in fd.c's pool */ + BlockNumber mdfd_segno; /* segment number, from 0 */ +} MdfdVec; + +static MemoryContext MdCxt; /* context for all MdfdVec objects */ + + +/* Populate a file tag describing an md.c segment file. */ +#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \ +( \ + memset(&(a), 0, sizeof(FileTag)), \ + (a).handler = SYNC_HANDLER_MD, \ + (a).rlocator = (xx_rlocator), \ + (a).forknum = (xx_forknum), \ + (a).segno = (xx_segno) \ +) + + +/*** behavior for mdopen & _mdfd_getseg ***/ +/* ereport if segment not present */ +#define EXTENSION_FAIL (1 << 0) +/* return NULL if segment not present */ +#define EXTENSION_RETURN_NULL (1 << 1) +/* create new segments as needed */ +#define EXTENSION_CREATE (1 << 2) +/* create new segments if needed during recovery */ +#define EXTENSION_CREATE_RECOVERY (1 << 3) +/* + * Allow opening segments which are preceded by segments smaller than + * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks + * mdnblocks() and related functionality henceforth - which currently is ok, + * because this is only required in the checkpointer which never uses + * mdnblocks(). 
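Aside (not part of the patch): the segmentation scheme described above is all simple arithmetic -- block N of a fork lives in segment N / RELSEG_SIZE, at byte offset BLCKSZ * (N % RELSEG_SIZE) within that segment's file, and segments after the first carry a ".segno" suffix on the base path. A self-contained sketch with stand-in constants (the real values come from pg_config.h; 131072 blocks of 8 kB is the common 1 GB default):

#include <stdint.h>
#include <stdio.h>

#define TOY_BLCKSZ      8192u      /* stand-in for BLCKSZ */
#define TOY_RELSEG_SIZE 131072u    /* stand-in for RELSEG_SIZE */

typedef struct ToySegLoc
{
    uint32_t segno;       /* which segment file */
    uint64_t seekpos;     /* byte offset within that file */
} ToySegLoc;

/* Map a fork-relative block number to (segment, in-file offset), as
 * _mdfd_getseg() and mdextend() effectively do. */
static ToySegLoc
toy_block_location(uint32_t blkno)
{
    ToySegLoc loc;

    loc.segno = blkno / TOY_RELSEG_SIZE;
    loc.seekpos = (uint64_t) TOY_BLCKSZ * (blkno % TOY_RELSEG_SIZE);
    return loc;
}

/* Build a segment's file name: the bare path for segment 0, "path.N"
 * otherwise -- the same convention mdunlinkfork() iterates over. */
static void
toy_segment_path(char *buf, size_t buflen, const char *path, uint32_t segno)
{
    if (segno == 0)
        snprintf(buf, buflen, "%s", path);
    else
        snprintf(buf, buflen, "%s.%u", path, segno);
}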
+ */ +#define EXTENSION_DONT_CHECK_SIZE (1 << 4) +/* don't try to open a segment, if not already open */ +#define EXTENSION_DONT_OPEN (1 << 5) + + +/* local routines */ +static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); +static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior); +static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, + MdfdVec *seg); +static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno); +static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno); +static void _fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg); +static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, + BlockNumber segno); +static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, + BlockNumber segno, int oflags); +static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, + BlockNumber blkno, bool skipFsync, int behavior); +static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, + MdfdVec *seg); + +static inline int +_mdfd_open_flags(void) +{ + int flags = O_RDWR | PG_BINARY; + + if (io_direct_flags & IO_DIRECT_DATA) + flags |= PG_O_DIRECT; + + return flags; +} + +/* + * mdinit() -- Initialize private state for magnetic disk storage manager. + */ +void +mdinit(void) +{ + MdCxt = AllocSetContextCreate(TopMemoryContext, + "MdSmgr", + ALLOCSET_DEFAULT_SIZES); +} + +/* + * mdexists() -- Does the physical file exist? + * + * Note: this will return true for lingering files, with pending deletions + */ +bool +mdexists(SMgrRelation reln, ForkNumber forknum) +{ + /* + * Close it first, to ensure that we notice if the fork has been unlinked + * since we opened it. As an optimization, we can skip that in recovery, + * which already closes relations when dropping them. + */ + if (!InRecovery) + mdclose(reln, forknum); + + return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL); +} + +/* + * mdcreate() -- Create a new relation on magnetic disk. + * + * If isRedo is true, it's okay for the relation to exist already. + */ +void +mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + MdfdVec *mdfd; + char *path; + File fd; + + if (isRedo && reln->md_num_open_segs[forknum] > 0) + return; /* created and opened already... */ + + Assert(reln->md_num_open_segs[forknum] == 0); + + /* + * We may be using the target table space for the first time in this + * database, so create a per-database subdirectory if needed. + * + * XXX this is a fairly ugly violation of module layering, but this seems + * to be the best place to put the check. Maybe TablespaceCreateDbspace + * should be here and not in commands/tablespace.c? But that would imply + * importing a lot of stuff that smgr.c oughtn't know, either. 
+ */ + TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + isRedo); + + path = relpath(reln->smgr_rlocator, forknum); + + fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL); + + if (fd < 0) + { + int save_errno = errno; + + if (isRedo) + fd = PathNameOpenFile(path, _mdfd_open_flags()); + if (fd < 0) + { + /* be sure to report the error reported by create, not open */ + errno = save_errno; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not create file \"%s\": %m", path))); + } + } + + pfree(path); + + _fdvec_resize(reln, forknum, 1); + mdfd = &reln->md_seg_fds[forknum][0]; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; + + if (!SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, mdfd); +} + +/* + * mdunlink() -- Unlink a relation. + * + * Note that we're passed a RelFileLocatorBackend --- by the time this is called, + * there won't be an SMgrRelation hashtable entry anymore. + * + * forknum can be a fork number to delete a specific fork, or InvalidForkNumber + * to delete all forks. + * + * For regular relations, we don't unlink the first segment file of the rel, + * but just truncate it to zero length, and record a request to unlink it after + * the next checkpoint. Additional segments can be unlinked immediately, + * however. Leaving the empty file in place prevents that relfilenumber + * from being reused. The scenario this protects us from is: + * 1. We delete a relation (and commit, and actually remove its file). + * 2. We create a new relation, which by chance gets the same relfilenumber as + * the just-deleted one (OIDs must've wrapped around for that to happen). + * 3. We crash before another checkpoint occurs. + * During replay, we would delete the file and then recreate it, which is fine + * if the contents of the file were repopulated by subsequent WAL entries. + * But if we didn't WAL-log insertions, but instead relied on fsyncing the + * file after populating it (as we do at wal_level=minimal), the contents of + * the file would be lost forever. By leaving the empty file until after the + * next checkpoint, we prevent reassignment of the relfilenumber until it's + * safe, because relfilenumber assignment skips over any existing file. + * + * Additional segments, if any, are truncated and then unlinked. The reason + * for truncating is that other backends may still hold open FDs for these at + * the smgr level, so that the kernel can't remove the file yet. We want to + * reclaim the disk space right away despite that. + * + * We do not need to go through this dance for temp relations, though, because + * we never make WAL entries for temp rels, and so a temp rel poses no threat + * to the health of a regular rel that has taken over its relfilenumber. + * The fact that temp rels and regular rels have different file naming + * patterns provides additional safety. Other backends shouldn't have open + * FDs for them, either. + * + * We also don't do it while performing a binary upgrade. There is no reuse + * hazard in that case, since after a crash or even a simple ERROR, the + * upgrade fails and the whole cluster must be recreated from scratch. + * Furthermore, it is important to remove the files from disk immediately, + * because we may be about to reuse the same relfilenumber. + * + * All the above applies only to the relation's main fork; other forks can + * just be removed immediately, since they are not needed to prevent the + * relfilenumber from being recycled. 
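Aside (not part of the patch): mdcreate()'s open sequence is a small pattern worth noting -- try an exclusive create, fall back to opening an existing file only during WAL replay, and make sure the error ultimately reported is the create failure rather than the fallback's. A rough standalone sketch using plain POSIX open(); the real code goes through fd.c's PathNameOpenFile() and virtual file descriptors instead:

#include <errno.h>
#include <fcntl.h>

/*
 * Create 'path' exclusively; if that fails and in_redo is nonzero, accept an
 * already-existing file by opening it instead.  Returns the fd, or -1 with
 * errno set to the original creation error.
 */
static int
toy_create_relation_file(const char *path, int in_redo)
{
    int fd = open(path, O_RDWR | O_CREAT | O_EXCL, 0600);

    if (fd < 0)
    {
        int save_errno = errno;

        if (in_redo)
            fd = open(path, O_RDWR);
        if (fd < 0)
            errno = save_errno;   /* report the create error, not the open error */
    }
    return fd;
}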
Also, we do not carefully + * track whether other forks have been created or not, but just attempt to + * unlink them unconditionally; so we should never complain about ENOENT. + * + * If isRedo is true, it's unsurprising for the relation to be already gone. + * Also, we should remove the file immediately instead of queuing a request + * for later, since during redo there's no possibility of creating a + * conflicting relation. + * + * Note: we currently just never warn about ENOENT at all. We could warn in + * the main-fork, non-isRedo case, but it doesn't seem worth the trouble. + * + * Note: any failure should be reported as WARNING not ERROR, because + * we are usually not in a transaction anymore when this is called. + */ +void +mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) +{ + /* Now do the per-fork work */ + if (forknum == InvalidForkNumber) + { + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + mdunlinkfork(rlocator, forknum, isRedo); + } + else + mdunlinkfork(rlocator, forknum, isRedo); +} + +/* + * Truncate a file to release disk space. + */ +static int +do_truncate(const char *path) +{ + int save_errno; + int ret; + + ret = pg_truncate(path, 0); + + /* Log a warning here to avoid repetition in callers. */ + if (ret < 0 && errno != ENOENT) + { + save_errno = errno; + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", path))); + errno = save_errno; + } + + return ret; +} + +static void +mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) +{ + char *path; + int ret; + int save_errno; + + path = relpath(rlocator, forknum); + + /* + * Truncate and then unlink the first segment, or just register a request + * to unlink it later, as described in the comments for mdunlink(). + */ + if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM || + RelFileLocatorBackendIsTemp(rlocator)) + { + if (!RelFileLocatorBackendIsTemp(rlocator)) + { + /* Prevent other backends' fds from holding on to the disk space */ + ret = do_truncate(path); + + /* Forget any pending sync requests for the first segment */ + save_errno = errno; + register_forget_request(rlocator, forknum, 0 /* first seg */ ); + errno = save_errno; + } + else + ret = 0; + + /* Next unlink the file, unless it was already found to be missing */ + if (ret >= 0 || errno != ENOENT) + { + ret = unlink(path); + if (ret < 0 && errno != ENOENT) + { + save_errno = errno; + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + errno = save_errno; + } + } + } + else + { + /* Prevent other backends' fds from holding on to the disk space */ + ret = do_truncate(path); + + /* Register request to unlink first segment later */ + save_errno = errno; + register_unlink_segment(rlocator, forknum, 0 /* first seg */ ); + errno = save_errno; + } + + /* + * Delete any additional segments. + * + * Note that because we loop until getting ENOENT, we will correctly + * remove all inactive segments as well as active ones. Ideally we'd + * continue the loop until getting exactly that errno, but that risks an + * infinite loop if the problem is directory-wide (for instance, if we + * suddenly can't read the data directory itself). We compromise by + * continuing after a non-ENOENT truncate error, but stopping after any + * unlink error. If there is indeed a directory-wide problem, additional + * unlink attempts wouldn't work anyway. 
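+ *
+ * Illustrative example (hypothetical file names): for a main fork stored
+ * at "base/16384/16385", the loop below probes "base/16384/16385.1",
+ * "base/16384/16385.2", and so on, truncating and then unlinking each
+ * segment in turn.  It stops at the first segment whose truncate (for
+ * non-temp relations) reports ENOENT, or whose unlink fails for any
+ * reason.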
+ */ + if (ret >= 0 || errno != ENOENT) + { + char *segpath = (char *) palloc(strlen(path) + 12); + BlockNumber segno; + + for (segno = 1;; segno++) + { + sprintf(segpath, "%s.%u", path, segno); + + if (!RelFileLocatorBackendIsTemp(rlocator)) + { + /* + * Prevent other backends' fds from holding on to the disk + * space. We're done if we see ENOENT, though. + */ + if (do_truncate(segpath) < 0 && errno == ENOENT) + break; + + /* + * Forget any pending sync requests for this segment before we + * try to unlink. + */ + register_forget_request(rlocator, forknum, segno); + } + + if (unlink(segpath) < 0) + { + /* ENOENT is expected after the last segment... */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", segpath))); + break; + } + } + pfree(segpath); + } + + pfree(path); +} + +/* + * mdextend() -- Add a block to the specified relation. + * + * The semantics are nearly the same as mdwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + off_t seekpos; + int nbytes; + MdfdVec *v; + + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber. (Note that this failure should be unreachable + * because of upstream checks in bufmgr.c.) + */ + if (blocknum == InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space."))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u", + FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ, blocknum), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); +} + +/* + * mdzeroextend() -- Add new zeroed out blocks to the specified relation. + * + * Similar to mdextend(), except the relation can be extended by multiple + * blocks at once and the added blocks will be filled with zeroes. 
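+ *
+ * Worked example (assuming the default BLCKSZ of 8192 and RELSEG_SIZE of
+ * 131072, i.e. 1 GB segment files): zero-extending 20 blocks starting at
+ * block 131060 is split by the loop below into one call for blocks
+ * 131060..131071 (12 blocks, completing segment 0) and one for blocks
+ * 131072..131079 (8 blocks at the start of segment 1).  With the cutoff
+ * of 8 used below, the first piece goes through FileFallocate() and the
+ * second through FileZero().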
+ */ +void +mdzeroextend(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync) +{ + MdfdVec *v; + BlockNumber curblocknum = blocknum; + int remblocks = nblocks; + + Assert(nblocks > 0); + + /* This assert is too expensive to have on normally ... */ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum >= mdnblocks(reln, forknum)); +#endif + + /* + * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any + * more --- we mustn't create a block whose number actually is + * InvalidBlockNumber or larger. + */ + if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot extend file \"%s\" beyond %u blocks", + relpath(reln->smgr_rlocator, forknum), + InvalidBlockNumber))); + + while (remblocks > 0) + { + BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE); + off_t seekpos = (off_t) BLCKSZ * segstartblock; + int numblocks; + + if (segstartblock + remblocks > RELSEG_SIZE) + numblocks = RELSEG_SIZE - segstartblock; + else + numblocks = remblocks; + + v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE); + + Assert(segstartblock < RELSEG_SIZE); + Assert(segstartblock + numblocks <= RELSEG_SIZE); + + /* + * If available and useful, use posix_fallocate() (via + * FileFallocate()) to extend the relation. That's often more + * efficient than using write(), as it commonly won't cause the kernel + * to allocate page cache space for the extended pages. + * + * However, we don't use FileFallocate() for small extensions, as it + * defeats delayed allocation on some filesystems. Not clear where + * that decision should be made though? For now just use a cutoff of + * 8, anything between 4 and 8 worked OK in some local testing. + */ + if (numblocks > 8) + { + int ret; + + ret = FileFallocate(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret != 0) + { + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\" with FileFallocate(): %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + } + else + { + int ret; + + /* + * Even if we don't want to use fallocate, we can still extend a + * bit more efficiently than writing each 8kB block individually. + * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry() + * to avoid multiple writes or needing a zeroed buffer for the + * whole length of the extension. + */ + ret = FileZero(v->mdfd_vfd, + seekpos, (off_t) BLCKSZ * numblocks, + WAIT_EVENT_DATA_FILE_EXTEND); + if (ret < 0) + ereport(ERROR, + errcode_for_file_access(), + errmsg("could not extend file \"%s\": %m", + FilePathName(v->mdfd_vfd)), + errhint("Check free disk space.")); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + remblocks -= numblocks; + curblocknum += numblocks; + } +} + +/* + * mdopenfork() -- Open one fork of the specified relation. + * + * Note we only open the first segment, when there are multiple segments. + * + * If first segment is not present, either ereport or return NULL according + * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL; + * EXTENSION_CREATE means it's OK to extend an existing relation, not to + * invent one out of whole cloth. 
+ */ +static MdfdVec * +mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) +{ + MdfdVec *mdfd; + char *path; + File fd; + + /* No work if already open */ + if (reln->md_num_open_segs[forknum] > 0) + return &reln->md_seg_fds[forknum][0]; + + path = relpath(reln->smgr_rlocator, forknum); + + fd = PathNameOpenFile(path, _mdfd_open_flags()); + + if (fd < 0) + { + if ((behavior & EXTENSION_RETURN_NULL) && + FILE_POSSIBLY_DELETED(errno)) + { + pfree(path); + return NULL; + } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + pfree(path); + + _fdvec_resize(reln, forknum, 1); + mdfd = &reln->md_seg_fds[forknum][0]; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; + + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); + + return mdfd; +} + +/* + * mdopen() -- Initialize newly-opened relation. + */ +void +mdopen(SMgrRelation reln) +{ + /* mark it not open */ + for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++) + reln->md_num_open_segs[forknum] = 0; +} + +/* + * mdclose() -- Close the specified relation, if it isn't closed already. + */ +void +mdclose(SMgrRelation reln, ForkNumber forknum) +{ + int nopensegs = reln->md_num_open_segs[forknum]; + + /* No work if already closed */ + if (nopensegs == 0) + return; + + /* close segments starting from the end */ + while (nopensegs > 0) + { + MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; + + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, nopensegs - 1); + nopensegs--; + } +} + +/* + * mdprefetch() -- Initiate asynchronous read of the specified block of a relation + */ +bool +mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ +#ifdef USE_PREFETCH + off_t seekpos; + MdfdVec *v; + + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + + v = _mdfd_getseg(reln, forknum, blocknum, false, + InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); + if (v == NULL) + return false; + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH); +#endif /* USE_PREFETCH */ + + return true; +} + +/* + * mdread() -- Read the specified block from a relation. + */ +void +mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer) +{ + off_t seekpos; + int nbytes; + MdfdVec *v; + + /* If this build supports direct I/O, the buffer must be I/O aligned. 
*/ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + + TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend); + + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ); + + TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend, + nbytes, + BLCKSZ); + + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + + /* + * Short read: we are at or past EOF, or we read a partial block at + * EOF. Normally this is an error; upper levels should never try to + * read a nonexistent block. However, if zero_damaged_pages is ON or + * we are InRecovery, we should instead return zeroes without + * complaining. This allows, for example, the case of trying to + * update a block that was later truncated away. + */ + if (zero_damaged_pages || InRecovery) + MemSet(buffer, 0, BLCKSZ); + else + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("could not read block %u in file \"%s\": read only %d of %d bytes", + blocknum, FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ))); + } +} + +/* + * mdwrite() -- Write the supplied block at the appropriate location. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use mdextend(). + */ +void +mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + off_t seekpos; + int nbytes; + MdfdVec *v; + + /* If this build supports direct I/O, the buffer must be I/O aligned. */ + if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ) + Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer)); + + /* This assert is too expensive to have on normally ... 
*/ +#ifdef CHECK_WRITE_VS_EXTEND + Assert(blocknum < mdnblocks(reln, forknum)); +#endif + + TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend); + + v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, + EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE); + + TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, + reln->smgr_rlocator.locator.spcOid, + reln->smgr_rlocator.locator.dbOid, + reln->smgr_rlocator.locator.relNumber, + reln->smgr_rlocator.backend, + nbytes, + BLCKSZ); + + if (nbytes != BLCKSZ) + { + if (nbytes < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write block %u in file \"%s\": %m", + blocknum, FilePathName(v->mdfd_vfd)))); + /* short write: complain appropriately */ + ereport(ERROR, + (errcode(ERRCODE_DISK_FULL), + errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes", + blocknum, + FilePathName(v->mdfd_vfd), + nbytes, BLCKSZ), + errhint("Check free disk space."))); + } + + if (!skipFsync && !SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); +} + +/* + * mdwriteback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks) +{ + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + + /* + * Issue flush requests in as few requests as possible; have to split at + * segment boundaries though, since those are actually separate files. + */ + while (nblocks > 0) + { + BlockNumber nflush = nblocks; + off_t seekpos; + MdfdVec *v; + int segnum_start, + segnum_end; + + v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ , + EXTENSION_DONT_OPEN); + + /* + * We might be flushing buffers of already removed relations, that's + * ok, just ignore that case. If the segment file wasn't open already + * (ie from a recent mdwrite()), then we don't want to re-open it, to + * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave + * us with a descriptor to a file that is about to be unlinked. + */ + if (!v) + return; + + /* compute offset inside the current segment */ + segnum_start = blocknum / RELSEG_SIZE; + + /* compute number of desired writes within the current segment */ + segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; + if (segnum_start != segnum_end) + nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(nflush >= 1); + Assert(nflush <= nblocks); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + + FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH); + + nblocks -= nflush; + blocknum += nflush; + } +} + +/* + * mdnblocks() -- Get the number of blocks stored in a relation. + * + * Important side effect: all active segments of the relation are opened + * and added to the md_seg_fds array. If this routine has not been + * called, then only segments up to the last one actually touched + * are present in the array. 
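+ *
+ * Worked example (assuming the default RELSEG_SIZE of 131072 blocks): if
+ * segments 0 and 1 are full and segment 2 holds 500 blocks, the fork's
+ * size is reported as 2 * 131072 + 500 = 262644 blocks.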
+ */ +BlockNumber +mdnblocks(SMgrRelation reln, ForkNumber forknum) +{ + MdfdVec *v; + BlockNumber nblocks; + BlockNumber segno; + + mdopenfork(reln, forknum, EXTENSION_FAIL); + + /* mdopen has opened the first segment */ + Assert(reln->md_num_open_segs[forknum] > 0); + + /* + * Start from the last open segments, to avoid redundant seeks. We have + * previously verified that these segments are exactly RELSEG_SIZE long, + * and it's useless to recheck that each time. + * + * NOTE: this assumption could only be wrong if another backend has + * truncated the relation. We rely on higher code levels to handle that + * scenario by closing and re-opening the md fd, which is handled via + * relcache flush. (Since the checkpointer doesn't participate in + * relcache flush, it could have segment entries for inactive segments; + * that's OK because the checkpointer never needs to compute relation + * size.) + */ + segno = reln->md_num_open_segs[forknum] - 1; + v = &reln->md_seg_fds[forknum][segno]; + + for (;;) + { + nblocks = _mdnblocks(reln, forknum, v); + if (nblocks > ((BlockNumber) RELSEG_SIZE)) + elog(FATAL, "segment too big"); + if (nblocks < ((BlockNumber) RELSEG_SIZE)) + return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks; + + /* + * If segment is exactly RELSEG_SIZE, advance to next one. + */ + segno++; + + /* + * We used to pass O_CREAT here, but that has the disadvantage that it + * might create a segment which has vanished through some operating + * system misadventure. In such a case, creating the segment here + * undermines _mdfd_getseg's attempts to notice and report an error + * upon access to a missing segment. + */ + v = _mdfd_openseg(reln, forknum, segno, 0); + if (v == NULL) + return segno * ((BlockNumber) RELSEG_SIZE); + } +} + +/* + * mdtruncate() -- Truncate relation to specified number of blocks. + */ +void +mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) +{ + BlockNumber curnblk; + BlockNumber priorblocks; + int curopensegs; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that + * truncation loop will get them all! + */ + curnblk = mdnblocks(reln, forknum); + if (nblocks > curnblk) + { + /* Bogus request ... but no complaint if InRecovery */ + if (InRecovery) + return; + ereport(ERROR, + (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now", + relpath(reln->smgr_rlocator, forknum), + nblocks, curnblk))); + } + if (nblocks == curnblk) + return; /* no work */ + + /* + * Truncate segments, starting at the last one. Starting at the end makes + * managing the memory for the fd array easier, should there be errors. + */ + curopensegs = reln->md_num_open_segs[forknum]; + while (curopensegs > 0) + { + MdfdVec *v; + + priorblocks = (curopensegs - 1) * RELSEG_SIZE; + + v = &reln->md_seg_fds[forknum][curopensegs - 1]; + + if (priorblocks > nblocks) + { + /* + * This segment is no longer active. We truncate the file, but do + * not delete it, for reasons explained in the header comments. 
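+ *
+ * Worked example (default RELSEG_SIZE of 131072): truncating a fork of
+ * 300000 blocks down to 140000 handles segment 2 here (priorblocks =
+ * 262144 > 140000), truncating it to zero length and closing it; the
+ * next branch then truncates segment 1 to 140000 - 131072 = 8928 blocks;
+ * segment 0 is left alone and the loop stops.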
+ */ + if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\": %m", + FilePathName(v->mdfd_vfd)))); + + if (!SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + + /* we never drop the 1st segment */ + Assert(v != &reln->md_seg_fds[forknum][0]); + + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, curopensegs - 1); + } + else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) + { + /* + * This is the last segment we want to keep. Truncate the file to + * the right length. NOTE: if nblocks is exactly a multiple K of + * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but + * keep it. This adheres to the invariant given in the header + * comments. + */ + BlockNumber lastsegblocks = nblocks - priorblocks; + + if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not truncate file \"%s\" to %u blocks: %m", + FilePathName(v->mdfd_vfd), + nblocks))); + if (!SmgrIsTemp(reln)) + register_dirty_segment(reln, forknum, v); + } + else + { + /* + * We still need this segment, so nothing to do for this and any + * earlier segment. + */ + break; + } + curopensegs--; + } +} + +/* + * mdimmedsync() -- Immediately sync a relation to stable storage. + * + * Note that only writes already issued are synced; this routine knows + * nothing of dirty buffers that may exist inside the buffer manager. We + * sync active and inactive segments; smgrDoPendingSyncs() relies on this. + * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of + * some segment, then mdtruncate() renders that segment inactive. If we + * crash before the next checkpoint syncs the newly-inactive segment, that + * segment may survive recovery, reintroducing unwanted data into the table. + */ +void +mdimmedsync(SMgrRelation reln, ForkNumber forknum) +{ + int segno; + int min_inactive_seg; + + /* + * NOTE: mdnblocks makes sure we have opened all active segments, so that + * fsync loop will get them all! + */ + mdnblocks(reln, forknum); + + min_inactive_seg = segno = reln->md_num_open_segs[forknum]; + + /* + * Temporarily open inactive segments, then close them after sync. There + * may be some inactive segments left opened after fsync() error, but that + * is harmless. We don't bother to clean them up and take a risk of + * further trouble. The next mdclose() will soon close them. + */ + while (_mdfd_openseg(reln, forknum, segno, 0) != NULL) + segno++; + + while (segno > 0) + { + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + + /* + * fsyncs done through mdimmedsync() should be tracked in a separate + * IOContext than those done through mdsyncfiletag() to differentiate + * between unavoidable client backend fsyncs (e.g. those done during + * index build) and those which ideally would have been done by the + * checkpointer. Since other IO operations bypassing the buffer + * manager could also be tracked in such an IOContext, wait until + * these are also tracked to track immediate fsyncs. 
+ */ + if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(v->mdfd_vfd)))); + + /* Close inactive segments immediately */ + if (segno > min_inactive_seg) + { + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, segno - 1); + } + + segno--; + } +} + +/* + * register_dirty_segment() -- Mark a relation segment as needing fsync + * + * If there is a local pending-ops table, just make an entry in it for + * ProcessSyncRequests to process later. Otherwise, try to pass off the + * fsync request to the checkpointer process. If that fails, just do the + * fsync locally before returning (we hope this will not happen often + * enough to be a performance problem). + */ +static void +register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +{ + FileTag tag; + + INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno); + + /* Temp relations should never be fsync'd */ + Assert(!SmgrIsTemp(reln)); + + if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ )) + { + instr_time io_start; + + ereport(DEBUG1, + (errmsg_internal("could not forward fsync request because request queue is full"))); + + io_start = pgstat_prepare_io_time(); + + if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + FilePathName(seg->mdfd_vfd)))); + + /* + * We have no way of knowing if the current IOContext is + * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this + * point, so count the fsync as being in the IOCONTEXT_NORMAL + * IOContext. This is probably okay, because the number of backend + * fsyncs doesn't say anything about the efficacy of the + * BufferAccessStrategy. And counting both fsyncs done in + * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under + * IOCONTEXT_NORMAL is likely clearer when investigating the number of + * backend fsyncs. 
+ */ + pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, + IOOP_FSYNC, io_start, 1); + } +} + +/* + * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint + */ +static void +register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno) +{ + FileTag tag; + + INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno); + + /* Should never be used with temp relations */ + Assert(!RelFileLocatorBackendIsTemp(rlocator)); + + RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ ); +} + +/* + * register_forget_request() -- forget any fsyncs for a relation fork's segment + */ +static void +register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum, + BlockNumber segno) +{ + FileTag tag; + + INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno); + + RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ ); +} + +/* + * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB + */ +void +ForgetDatabaseSyncRequests(Oid dbid) +{ + FileTag tag; + RelFileLocator rlocator; + + rlocator.dbOid = dbid; + rlocator.spcOid = 0; + rlocator.relNumber = 0; + + INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber); + + RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ ); +} + +/* + * DropRelationFiles -- drop files of all given relations + */ +void +DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo) +{ + SMgrRelation *srels; + int i; + + srels = palloc(sizeof(SMgrRelation) * ndelrels); + for (i = 0; i < ndelrels; i++) + { + SMgrRelation srel = smgropen(delrels[i], InvalidBackendId); + + if (isRedo) + { + ForkNumber fork; + + for (fork = 0; fork <= MAX_FORKNUM; fork++) + XLogDropRelation(delrels[i], fork); + } + srels[i] = srel; + } + + smgrdounlinkall(srels, ndelrels, isRedo); + + for (i = 0; i < ndelrels; i++) + smgrclose(srels[i]); + pfree(srels); +} + + +/* + * _fdvec_resize() -- Resize the fork's open segments array + */ +static void +_fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg) +{ + if (nseg == 0) + { + if (reln->md_num_open_segs[forknum] > 0) + { + pfree(reln->md_seg_fds[forknum]); + reln->md_seg_fds[forknum] = NULL; + } + } + else if (reln->md_num_open_segs[forknum] == 0) + { + reln->md_seg_fds[forknum] = + MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); + } + else + { + /* + * It doesn't seem worthwhile complicating the code to amortize + * repalloc() calls. Those are far faster than PathNameOpenFile() or + * FileClose(), and the memory context internally will sometimes avoid + * doing an actual reallocation. + */ + reln->md_seg_fds[forknum] = + repalloc(reln->md_seg_fds[forknum], + sizeof(MdfdVec) * nseg); + } + + reln->md_num_open_segs[forknum] = nseg; +} + +/* + * Return the filename for the specified segment of the relation. The + * returned string is palloc'd. + */ +static char * +_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno) +{ + char *path, + *fullpath; + + path = relpath(reln->smgr_rlocator, forknum); + + if (segno > 0) + { + fullpath = psprintf("%s.%u", path, segno); + pfree(path); + } + else + fullpath = path; + + return fullpath; +} + +/* + * Open the specified segment of the relation, + * and make a MdfdVec object for it. Returns NULL on failure. 
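+ *
+ * The file opened is the one named by _mdfd_segpath(): the bare relation
+ * path for segment 0, or that path with a ".N" suffix for segment N > 0
+ * (e.g. a hypothetical "base/16384/16385.3" for segment 3).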
+ */ +static MdfdVec * +_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, + int oflags) +{ + MdfdVec *v; + File fd; + char *fullpath; + + fullpath = _mdfd_segpath(reln, forknum, segno); + + /* open the file */ + fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags); + + pfree(fullpath); + + if (fd < 0) + return NULL; + + /* + * Segments are always opened in order from lowest to highest, so we must + * be adding a new one at the end. + */ + Assert(segno == reln->md_num_open_segs[forknum]); + + _fdvec_resize(reln, forknum, segno + 1); + + /* fill the entry */ + v = &reln->md_seg_fds[forknum][segno]; + v->mdfd_vfd = fd; + v->mdfd_segno = segno; + + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); + + /* all done */ + return v; +} + +/* + * _mdfd_getseg() -- Find the segment of the relation holding the + * specified block. + * + * If the segment doesn't exist, we ereport, return NULL, or create the + * segment, according to "behavior". Note: skipFsync is only used in the + * EXTENSION_CREATE case. + */ +static MdfdVec * +_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, + bool skipFsync, int behavior) +{ + MdfdVec *v; + BlockNumber targetseg; + BlockNumber nextsegno; + + /* some way to handle non-existent segments needs to be specified */ + Assert(behavior & + (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL | + EXTENSION_DONT_OPEN)); + + targetseg = blkno / ((BlockNumber) RELSEG_SIZE); + + /* if an existing and opened segment, we're done */ + if (targetseg < reln->md_num_open_segs[forknum]) + { + v = &reln->md_seg_fds[forknum][targetseg]; + return v; + } + + /* The caller only wants the segment if we already had it open. */ + if (behavior & EXTENSION_DONT_OPEN) + return NULL; + + /* + * The target segment is not yet open. Iterate over all the segments + * between the last opened and the target segment. This way missing + * segments either raise an error, or get created (according to + * 'behavior'). Start with either the last opened, or the first segment if + * none was opened before. + */ + if (reln->md_num_open_segs[forknum] > 0) + v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1]; + else + { + v = mdopenfork(reln, forknum, behavior); + if (!v) + return NULL; /* if behavior & EXTENSION_RETURN_NULL */ + } + + for (nextsegno = reln->md_num_open_segs[forknum]; + nextsegno <= targetseg; nextsegno++) + { + BlockNumber nblocks = _mdnblocks(reln, forknum, v); + int flags = 0; + + Assert(nextsegno == v->mdfd_segno + 1); + + if (nblocks > ((BlockNumber) RELSEG_SIZE)) + elog(FATAL, "segment too big"); + + if ((behavior & EXTENSION_CREATE) || + (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY))) + { + /* + * Normally we will create new segments only if authorized by the + * caller (i.e., we are doing mdextend()). But when doing WAL + * recovery, create segments anyway; this allows cases such as + * replaying WAL data that has a write into a high-numbered + * segment of a relation that was later deleted. We want to go + * ahead and create the segments so we can finish out the replay. + * + * We have to maintain the invariant that segments before the last + * active segment are of size RELSEG_SIZE; therefore, if + * extending, pad them out with zeroes if needed. (This only + * matters if in recovery, or if the caller is extending the + * relation discontiguously, but that can happen in hash indexes.) 
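+ *
+ * Illustrative case (default RELSEG_SIZE of 131072): if WAL replay asks
+ * for block 300000 of a relation whose last existing segment is segment
+ * 1 with only 18928 blocks, the code below first zero-pads segment 1 to
+ * a full 131072 blocks by extending its final block (block 262143), and
+ * then opens segment 2 with O_CREAT so the replayed write can proceed.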
+ */ + if (nblocks < ((BlockNumber) RELSEG_SIZE)) + { + char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE, + MCXT_ALLOC_ZERO); + + mdextend(reln, forknum, + nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + zerobuf, skipFsync); + pfree(zerobuf); + } + flags = O_CREAT; + } + else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && + nblocks < ((BlockNumber) RELSEG_SIZE)) + { + /* + * When not extending (or explicitly including truncated + * segments), only open the next segment if the current one is + * exactly RELSEG_SIZE. If not (this branch), either return NULL + * or fail. + */ + if (behavior & EXTENSION_RETURN_NULL) + { + /* + * Some callers discern between reasons for _mdfd_getseg() + * returning NULL based on errno. As there's no failing + * syscall involved in this case, explicitly set errno to + * ENOENT, as that seems the closest interpretation. + */ + errno = ENOENT; + return NULL; + } + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", + _mdfd_segpath(reln, forknum, nextsegno), + blkno, nblocks))); + } + + v = _mdfd_openseg(reln, forknum, nextsegno, flags); + + if (v == NULL) + { + if ((behavior & EXTENSION_RETURN_NULL) && + FILE_POSSIBLY_DELETED(errno)) + return NULL; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" (target block %u): %m", + _mdfd_segpath(reln, forknum, nextsegno), + blkno))); + } + } + + return v; +} + +/* + * Get number of blocks present in a single disk file + */ +static BlockNumber +_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg) +{ + off_t len; + + len = FileSize(seg->mdfd_vfd); + if (len < 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not seek to end of file \"%s\": %m", + FilePathName(seg->mdfd_vfd)))); + /* note that this calculation will ignore any partial block at EOF */ + return (BlockNumber) (len / BLCKSZ); +} + +/* + * Sync a file to disk, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdsyncfiletag(const FileTag *ftag, char *path) +{ + SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId); + File file; + instr_time io_start; + bool need_to_close; + int result, + save_errno; + + /* See if we already have the file open, or need to open it. */ + if (ftag->segno < reln->md_num_open_segs[ftag->forknum]) + { + file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd; + strlcpy(path, FilePathName(file), MAXPGPATH); + need_to_close = false; + } + else + { + char *p; + + p = _mdfd_segpath(reln, ftag->forknum, ftag->segno); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + file = PathNameOpenFile(path, _mdfd_open_flags()); + if (file < 0) + return -1; + need_to_close = true; + } + + io_start = pgstat_prepare_io_time(); + + /* Sync the file. */ + result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC); + save_errno = errno; + + if (need_to_close) + FileClose(file); + + pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL, + IOOP_FSYNC, io_start, 1); + + errno = save_errno; + return result; +} + +/* + * Unlink a file, given a file tag. Write the path into an output + * buffer so the caller can use it in error messages. + * + * Return 0 on success, -1 on failure, with errno set. + */ +int +mdunlinkfiletag(const FileTag *ftag, char *path) +{ + char *p; + + /* Compute the path. 
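+ * Only the main fork's first segment is ever scheduled for deferred
+ * unlink (register_unlink_segment() is called only with segment 0 of
+ * the main fork), so the bare relation path, with no ".N" suffix, is
+ * the right file to remove here.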
*/ + p = relpathperm(ftag->rlocator, MAIN_FORKNUM); + strlcpy(path, p, MAXPGPATH); + pfree(p); + + /* Try to unlink the file. */ + return unlink(path); +} + +/* + * Check if a given candidate request matches a given tag, when processing + * a SYNC_FILTER_REQUEST request. This will be called for all pending + * requests to find out whether to forget them. + */ +bool +mdfiletagmatches(const FileTag *ftag, const FileTag *candidate) +{ + /* + * For now we only use filter requests as a way to drop all scheduled + * callbacks relating to a given database, when dropping the database. + * We'll return true for all candidates that have the same database OID as + * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten. + */ + return ftag->rlocator.dbOid == candidate->rlocator.dbOid; +} diff --git a/src/backend/storage/smgr/meson.build b/src/backend/storage/smgr/meson.build new file mode 100644 index 0000000..e1ba6ed --- /dev/null +++ b/src/backend/storage/smgr/meson.build @@ -0,0 +1,6 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'md.c', + 'smgr.c', +) diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c new file mode 100644 index 0000000..5d0f3d5 --- /dev/null +++ b/src/backend/storage/smgr/smgr.c @@ -0,0 +1,767 @@ +/*------------------------------------------------------------------------- + * + * smgr.c + * public interface routines to storage manager switch. + * + * All file system operations in POSTGRES dispatch through these + * routines. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/smgr/smgr.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xlogutils.h" +#include "lib/ilist.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/md.h" +#include "storage/smgr.h" +#include "utils/hsearch.h" +#include "utils/inval.h" + + +/* + * This struct of function pointers defines the API between smgr.c and + * any individual storage manager module. Note that smgr subfunctions are + * generally expected to report problems via elog(ERROR). An exception is + * that smgr_unlink should use elog(WARNING), rather than erroring out, + * because we normally unlink relations during post-commit/abort cleanup, + * and so it's too late to raise an error. Also, various conditions that + * would normally be errors should be allowed during bootstrap and/or WAL + * recovery --- see comments in md.c for details. 
+ */ +typedef struct f_smgr +{ + void (*smgr_init) (void); /* may be NULL */ + void (*smgr_shutdown) (void); /* may be NULL */ + void (*smgr_open) (SMgrRelation reln); + void (*smgr_close) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_create) (SMgrRelation reln, ForkNumber forknum, + bool isRedo); + bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum, + bool isRedo); + void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks, bool skipFsync); + bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); + void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, void *buffer); + void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, const void *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, BlockNumber nblocks); + BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); + void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, + BlockNumber nblocks); + void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum); +} f_smgr; + +static const f_smgr smgrsw[] = { + /* magnetic disk */ + { + .smgr_init = mdinit, + .smgr_shutdown = NULL, + .smgr_open = mdopen, + .smgr_close = mdclose, + .smgr_create = mdcreate, + .smgr_exists = mdexists, + .smgr_unlink = mdunlink, + .smgr_extend = mdextend, + .smgr_zeroextend = mdzeroextend, + .smgr_prefetch = mdprefetch, + .smgr_read = mdread, + .smgr_write = mdwrite, + .smgr_writeback = mdwriteback, + .smgr_nblocks = mdnblocks, + .smgr_truncate = mdtruncate, + .smgr_immedsync = mdimmedsync, + } +}; + +static const int NSmgr = lengthof(smgrsw); + +/* + * Each backend has a hashtable that stores all extant SMgrRelation objects. + * In addition, "unowned" SMgrRelation objects are chained together in a list. + */ +static HTAB *SMgrRelationHash = NULL; + +static dlist_head unowned_relns; + +/* local function prototypes */ +static void smgrshutdown(int code, Datum arg); + + +/* + * smgrinit(), smgrshutdown() -- Initialize or shut down storage + * managers. + * + * Note: smgrinit is called during backend startup (normal or standalone + * case), *not* during postmaster start. Therefore, any resources created + * here or destroyed in smgrshutdown are backend-local. + */ +void +smgrinit(void) +{ + int i; + + for (i = 0; i < NSmgr; i++) + { + if (smgrsw[i].smgr_init) + smgrsw[i].smgr_init(); + } + + /* register the shutdown proc */ + on_proc_exit(smgrshutdown, 0); +} + +/* + * on_proc_exit hook for smgr cleanup during backend shutdown + */ +static void +smgrshutdown(int code, Datum arg) +{ + int i; + + for (i = 0; i < NSmgr; i++) + { + if (smgrsw[i].smgr_shutdown) + smgrsw[i].smgr_shutdown(); + } +} + +/* + * smgropen() -- Return an SMgrRelation object, creating it if need be. + * + * This does not attempt to actually open the underlying file. 
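+ *
+ * Hypothetical caller-side sketch (not code from this file):
+ *
+ *		reln = smgropen(rlocator, InvalidBackendId);
+ *		nblocks = smgrnblocks(reln, MAIN_FORKNUM);
+ *
+ * smgropen() only creates or finds the hash table entry; the underlying
+ * file is opened lazily, e.g. when smgrnblocks() reaches mdopenfork().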
+ */ +SMgrRelation +smgropen(RelFileLocator rlocator, BackendId backend) +{ + RelFileLocatorBackend brlocator; + SMgrRelation reln; + bool found; + + if (SMgrRelationHash == NULL) + { + /* First time through: initialize the hash table */ + HASHCTL ctl; + + ctl.keysize = sizeof(RelFileLocatorBackend); + ctl.entrysize = sizeof(SMgrRelationData); + SMgrRelationHash = hash_create("smgr relation table", 400, + &ctl, HASH_ELEM | HASH_BLOBS); + dlist_init(&unowned_relns); + } + + /* Look up or create an entry */ + brlocator.locator = rlocator; + brlocator.backend = backend; + reln = (SMgrRelation) hash_search(SMgrRelationHash, + &brlocator, + HASH_ENTER, &found); + + /* Initialize it if not present before */ + if (!found) + { + /* hash_search already filled in the lookup key */ + reln->smgr_owner = NULL; + reln->smgr_targblock = InvalidBlockNumber; + for (int i = 0; i <= MAX_FORKNUM; ++i) + reln->smgr_cached_nblocks[i] = InvalidBlockNumber; + reln->smgr_which = 0; /* we only have md.c at present */ + + /* implementation-specific initialization */ + smgrsw[reln->smgr_which].smgr_open(reln); + + /* it has no owner yet */ + dlist_push_tail(&unowned_relns, &reln->node); + } + + return reln; +} + +/* + * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object + * + * There can be only one owner at a time; this is sufficient since currently + * the only such owners exist in the relcache. + */ +void +smgrsetowner(SMgrRelation *owner, SMgrRelation reln) +{ + /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */ + Assert(owner != NULL); + + /* + * First, unhook any old owner. (Normally there shouldn't be any, but it + * seems possible that this can happen during swap_relation_files() + * depending on the order of processing. It's ok to close the old + * relcache entry early in that case.) + * + * If there isn't an old owner, then the reln should be in the unowned + * list, and we need to remove it. + */ + if (reln->smgr_owner) + *(reln->smgr_owner) = NULL; + else + dlist_delete(&reln->node); + + /* Now establish the ownership relationship. */ + reln->smgr_owner = owner; + *owner = reln; +} + +/* + * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object + * if one exists + */ +void +smgrclearowner(SMgrRelation *owner, SMgrRelation reln) +{ + /* Do nothing if the SMgrRelation object is not owned by the owner */ + if (reln->smgr_owner != owner) + return; + + /* unset the owner's reference */ + *owner = NULL; + + /* unset our reference to the owner */ + reln->smgr_owner = NULL; + + /* add to list of unowned relations */ + dlist_push_tail(&unowned_relns, &reln->node); +} + +/* + * smgrexists() -- Does the underlying file for a fork exist? + */ +bool +smgrexists(SMgrRelation reln, ForkNumber forknum) +{ + return smgrsw[reln->smgr_which].smgr_exists(reln, forknum); +} + +/* + * smgrclose() -- Close and delete an SMgrRelation object. + */ +void +smgrclose(SMgrRelation reln) +{ + SMgrRelation *owner; + ForkNumber forknum; + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + smgrsw[reln->smgr_which].smgr_close(reln, forknum); + + owner = reln->smgr_owner; + + if (!owner) + dlist_delete(&reln->node); + + if (hash_search(SMgrRelationHash, + &(reln->smgr_rlocator), + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "SMgrRelation hashtable corrupted"); + + /* + * Unhook the owner pointer, if any. We do this last since in the remote + * possibility of failure above, the SMgrRelation object will still exist. 
+ */ + if (owner) + *owner = NULL; +} + +/* + * smgrrelease() -- Release all resources used by this object. + * + * The object remains valid. + */ +void +smgrrelease(SMgrRelation reln) +{ + for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + smgrsw[reln->smgr_which].smgr_close(reln, forknum); + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; + } + reln->smgr_targblock = InvalidBlockNumber; +} + +/* + * smgrreleaseall() -- Release resources used by all objects. + * + * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE. + */ +void +smgrreleaseall(void) +{ + HASH_SEQ_STATUS status; + SMgrRelation reln; + + /* Nothing to do if hashtable not set up */ + if (SMgrRelationHash == NULL) + return; + + hash_seq_init(&status, SMgrRelationHash); + + while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) + smgrrelease(reln); +} + +/* + * smgrcloseall() -- Close all existing SMgrRelation objects. + */ +void +smgrcloseall(void) +{ + HASH_SEQ_STATUS status; + SMgrRelation reln; + + /* Nothing to do if hashtable not set up */ + if (SMgrRelationHash == NULL) + return; + + hash_seq_init(&status, SMgrRelationHash); + + while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL) + smgrclose(reln); +} + +/* + * smgrcloserellocator() -- Close SMgrRelation object for given RelFileLocator, + * if one exists. + * + * This has the same effects as smgrclose(smgropen(rlocator)), but it avoids + * uselessly creating a hashtable entry only to drop it again when no + * such entry exists already. + */ +void +smgrcloserellocator(RelFileLocatorBackend rlocator) +{ + SMgrRelation reln; + + /* Nothing to do if hashtable not set up */ + if (SMgrRelationHash == NULL) + return; + + reln = (SMgrRelation) hash_search(SMgrRelationHash, + &rlocator, + HASH_FIND, NULL); + if (reln != NULL) + smgrclose(reln); +} + +/* + * smgrcreate() -- Create a new relation. + * + * Given an already-created (but presumably unused) SMgrRelation, + * cause the underlying disk file or other storage for the fork + * to be created. + */ +void +smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) +{ + smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo); +} + +/* + * smgrdosyncall() -- Immediately sync all forks of all given relations + * + * All forks of all given relations are synced out to the store. + * + * This is equivalent to FlushRelationBuffers() for each smgr relation, + * then calling smgrimmedsync() for all forks of each relation, but it's + * significantly quicker so should be preferred when possible. + */ +void +smgrdosyncall(SMgrRelation *rels, int nrels) +{ + int i = 0; + ForkNumber forknum; + + if (nrels == 0) + return; + + FlushRelationsAllBuffers(rels, nrels); + + /* + * Sync the physical file(s). + */ + for (i = 0; i < nrels; i++) + { + int which = rels[i]->smgr_which; + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + { + if (smgrsw[which].smgr_exists(rels[i], forknum)) + smgrsw[which].smgr_immedsync(rels[i], forknum); + } + } +} + +/* + * smgrdounlinkall() -- Immediately unlink all forks of all given relations + * + * All forks of all given relations are removed from the store. This + * should not be used during transactional operations, since it can't be + * undone. + * + * If isRedo is true, it is okay for the underlying file(s) to be gone + * already. 
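+ *
+ * For instance, DropRelationFiles() in md.c funnels into this function;
+ * when that happens during WAL replay, isRedo is true and the
+ * smgr_unlink callback (mdunlink() for md relations) quietly tolerates
+ * files that have already been removed.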
+ */ +void +smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo) +{ + int i = 0; + RelFileLocatorBackend *rlocators; + ForkNumber forknum; + + if (nrels == 0) + return; + + /* + * Get rid of any remaining buffers for the relations. bufmgr will just + * drop them without bothering to write the contents. + */ + DropRelationsAllBuffers(rels, nrels); + + /* + * create an array which contains all relations to be dropped, and close + * each relation's forks at the smgr level while at it + */ + rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels); + for (i = 0; i < nrels; i++) + { + RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator; + int which = rels[i]->smgr_which; + + rlocators[i] = rlocator; + + /* Close the forks at smgr level */ + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + smgrsw[which].smgr_close(rels[i], forknum); + } + + /* + * Send a shared-inval message to force other backends to close any + * dangling smgr references they may have for these rels. We should do + * this before starting the actual unlinking, in case we fail partway + * through that step. Note that the sinval messages will eventually come + * back to this backend, too, and thereby provide a backstop that we + * closed our own smgr rel. + */ + for (i = 0; i < nrels; i++) + CacheInvalidateSmgr(rlocators[i]); + + /* + * Delete the physical file(s). + * + * Note: smgr_unlink must treat deletion failure as a WARNING, not an + * ERROR, because we've already decided to commit or abort the current + * xact. + */ + + for (i = 0; i < nrels; i++) + { + int which = rels[i]->smgr_which; + + for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) + smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo); + } + + pfree(rlocators); +} + + +/* + * smgrextend() -- Add a new block to a file. + * + * The semantics are nearly the same as smgrwrite(): write at the + * specified position. However, this is to be used for the case of + * extending a relation (i.e., blocknum is at or beyond the current + * EOF). Note that we assume writing a block beyond current EOF + * causes intervening file space to become filled with zeroes. + */ +void +smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum, + buffer, skipFsync); + + /* + * Normally we expect this to increase nblocks by one, but if the cached + * value isn't as expected, just invalidate it so the next call asks the + * kernel. + */ + if (reln->smgr_cached_nblocks[forknum] == blocknum) + reln->smgr_cached_nblocks[forknum] = blocknum + 1; + else + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; +} + +/* + * smgrzeroextend() -- Add new zeroed out blocks to a file. + * + * Similar to smgrextend(), except the relation can be extended by + * multiple blocks at once and the added blocks will be filled with + * zeroes. + */ +void +smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks, bool skipFsync) +{ + smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum, + nblocks, skipFsync); + + /* + * Normally we expect this to increase the fork size by nblocks, but if + * the cached value isn't as expected, just invalidate it so the next call + * asks the kernel. 
+ */ + if (reln->smgr_cached_nblocks[forknum] == blocknum) + reln->smgr_cached_nblocks[forknum] = blocknum + nblocks; + else + reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber; +} + +/* + * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. + * + * In recovery only, this can return false to indicate that a file + * doesn't exist (presumably it has been dropped by a later WAL + * record). + */ +bool +smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum); +} + +/* + * smgrread() -- read a particular block from a relation into the supplied + * buffer. + * + * This routine is called from the buffer manager in order to + * instantiate pages in the shared buffer cache. All storage managers + * return pages in the format that POSTGRES expects. + */ +void +smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + void *buffer) +{ + smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer); +} + +/* + * smgrwrite() -- Write the supplied buffer out. + * + * This is to be used only for updating already-existing blocks of a + * relation (ie, those before the current EOF). To extend a relation, + * use smgrextend(). + * + * This is not a synchronous write -- the block is not necessarily + * on disk at return, only dumped out to the kernel. However, + * provisions will be made to fsync the write before the next checkpoint. + * + * skipFsync indicates that the caller will make other provisions to + * fsync the relation, so we needn't bother. Temporary relations also + * do not require fsync. + */ +void +smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + const void *buffer, bool skipFsync) +{ + smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum, + buffer, skipFsync); +} + + +/* + * smgrwriteback() -- Trigger kernel writeback for the supplied range of + * blocks. + */ +void +smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + BlockNumber nblocks) +{ + smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum, + nblocks); +} + +/* + * smgrnblocks() -- Calculate the number of blocks in the + * supplied relation. + */ +BlockNumber +smgrnblocks(SMgrRelation reln, ForkNumber forknum) +{ + BlockNumber result; + + /* Check and return if we get the cached value for the number of blocks. */ + result = smgrnblocks_cached(reln, forknum); + if (result != InvalidBlockNumber) + return result; + + result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum); + + reln->smgr_cached_nblocks[forknum] = result; + + return result; +} + +/* + * smgrnblocks_cached() -- Get the cached number of blocks in the supplied + * relation. + * + * Returns an InvalidBlockNumber when not in recovery and when the relation + * fork size is not cached. + */ +BlockNumber +smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum) +{ + /* + * For now, we only use cached values in recovery due to lack of a shared + * invalidation mechanism for changes in file size. + */ + if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber) + return reln->smgr_cached_nblocks[forknum]; + + return InvalidBlockNumber; +} + +/* + * smgrtruncate() -- Truncate the given forks of supplied relation to + * each specified numbers of blocks + * + * The truncation is done immediately, so this can't be rolled back. 
+ * + * The caller must hold AccessExclusiveLock on the relation, to ensure that + * other backends receive the smgr invalidation event that this function sends + * before they access any forks of the relation again. + */ +void +smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks) +{ + int i; + + /* + * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will + * just drop them without bothering to write the contents. + */ + DropRelationBuffers(reln, forknum, nforks, nblocks); + + /* + * Send a shared-inval message to force other backends to close any smgr + * references they may have for this rel. This is useful because they + * might have open file pointers to segments that got removed, and/or + * smgr_targblock variables pointing past the new rel end. (The inval + * message will come back to our backend, too, causing a + * probably-unnecessary local smgr flush. But we don't expect that this + * is a performance-critical path.) As in the unlink code, we want to be + * sure the message is sent before we start changing things on-disk. + */ + CacheInvalidateSmgr(reln->smgr_rlocator); + + /* Do the truncation */ + for (i = 0; i < nforks; i++) + { + /* Make the cached size is invalid if we encounter an error. */ + reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber; + + smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]); + + /* + * We might as well update the local smgr_cached_nblocks values. The + * smgr cache inval message that this function sent will cause other + * backends to invalidate their copies of smgr_fsm_nblocks and + * smgr_vm_nblocks, and these ones too at the next command boundary. + * But these ensure they aren't outright wrong until then. + */ + reln->smgr_cached_nblocks[forknum[i]] = nblocks[i]; + } +} + +/* + * smgrimmedsync() -- Force the specified relation to stable storage. + * + * Synchronously force all previous writes to the specified relation + * down to disk. + * + * This is useful for building completely new relations (eg, new + * indexes). Instead of incrementally WAL-logging the index build + * steps, we can just write completed index pages to disk with smgrwrite + * or smgrextend, and then fsync the completed index file before + * committing the transaction. (This is sufficient for purposes of + * crash recovery, since it effectively duplicates forcing a checkpoint + * for the completed index. But it is *not* sufficient if one wishes + * to use the WAL log for PITR or replication purposes: in that case + * we have to make WAL entries as well.) + * + * The preceding writes should specify skipFsync = true to avoid + * duplicative fsyncs. + * + * Note that you need to do FlushRelationBuffers() first if there is + * any possibility that there are dirty buffers for the relation; + * otherwise the sync is not very meaningful. + */ +void +smgrimmedsync(SMgrRelation reln, ForkNumber forknum) +{ + smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum); +} + +/* + * AtEOXact_SMgr + * + * This routine is called during transaction commit or abort (it doesn't + * particularly care which). All transient SMgrRelation objects are closed. + * + * We do this as a compromise between wanting transient SMgrRelations to + * live awhile (to amortize the costs of blind writes of multiple blocks) + * and needing them to not live forever (since we're probably holding open + * a kernel file descriptor for the underlying file, and we need to ensure + * that gets closed reasonably soon if the file gets deleted). 
+ */ +void +AtEOXact_SMgr(void) +{ + dlist_mutable_iter iter; + + /* + * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each + * one from the list. + */ + dlist_foreach_modify(iter, &unowned_relns) + { + SMgrRelation rel = dlist_container(SMgrRelationData, node, + iter.cur); + + Assert(rel->smgr_owner == NULL); + + smgrclose(rel); + } +} + +/* + * This routine is called when we are ordered to release all open files by a + * ProcSignalBarrier. + */ +bool +ProcessBarrierSmgrRelease(void) +{ + smgrreleaseall(); + return true; +} diff --git a/src/backend/storage/sync/Makefile b/src/backend/storage/sync/Makefile new file mode 100644 index 0000000..be88b44 --- /dev/null +++ b/src/backend/storage/sync/Makefile @@ -0,0 +1,18 @@ +#------------------------------------------------------------------------- +# +# Makefile-- +# Makefile for storage/sync +# +# IDENTIFICATION +# src/backend/storage/sync/Makefile +# +#------------------------------------------------------------------------- + +subdir = src/backend/storage/sync +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + sync.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/sync/meson.build b/src/backend/storage/sync/meson.build new file mode 100644 index 0000000..1b49f16 --- /dev/null +++ b/src/backend/storage/sync/meson.build @@ -0,0 +1,6 @@ +# Copyright (c) 2022-2023, PostgreSQL Global Development Group + +backend_sources += files( + 'sync.c', + +) diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c new file mode 100644 index 0000000..04fcb06 --- /dev/null +++ b/src/backend/storage/sync/sync.c @@ -0,0 +1,624 @@ +/*------------------------------------------------------------------------- + * + * sync.c + * File synchronization management code. + * + * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/sync/sync.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <unistd.h> +#include <fcntl.h> +#include <sys/file.h> + +#include "access/commit_ts.h" +#include "access/clog.h" +#include "access/multixact.h" +#include "access/xlog.h" +#include "access/xlogutils.h" +#include "commands/tablespace.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/instr_time.h" +#include "postmaster/bgwriter.h" +#include "storage/bufmgr.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/md.h" +#include "utils/hsearch.h" +#include "utils/inval.h" +#include "utils/memutils.h" + +/* + * In some contexts (currently, standalone backends and the checkpointer) + * we keep track of pending fsync operations: we need to remember all relation + * segments that have been written since the last checkpoint, so that we can + * fsync them down to disk before completing the next checkpoint. This hash + * table remembers the pending operations. We use a hash table mostly as + * a convenient way of merging duplicate requests. + * + * We use a similar mechanism to remember no-longer-needed files that can + * be deleted after the next checkpoint, but we use a linked list instead of + * a hash table, because we don't expect there to be any duplicate requests. + * + * These mechanisms are only used for non-temp relations; we never fsync + * temp rels, nor do we need to postpone their deletion (see comments in + * mdunlink).
+ * + * (Regular backends do not track pending operations locally, but forward + * them to the checkpointer.) + */ +typedef uint16 CycleCtr; /* can be any convenient integer size */ + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */ + bool canceled; /* canceled is true if we canceled "recently" */ +} PendingFsyncEntry; + +typedef struct +{ + FileTag tag; /* identifies handler and file */ + CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */ + bool canceled; /* true if request has been canceled */ +} PendingUnlinkEntry; + +static HTAB *pendingOps = NULL; +static List *pendingUnlinks = NIL; +static MemoryContext pendingOpsCxt; /* context for the above */ + +static CycleCtr sync_cycle_ctr = 0; +static CycleCtr checkpoint_cycle_ctr = 0; + +/* Intervals for calling AbsorbSyncRequests */ +#define FSYNCS_PER_ABSORB 10 +#define UNLINKS_PER_ABSORB 10 + +/* + * Function pointers for handling sync and unlink requests. + */ +typedef struct SyncOps +{ + int (*sync_syncfiletag) (const FileTag *ftag, char *path); + int (*sync_unlinkfiletag) (const FileTag *ftag, char *path); + bool (*sync_filetagmatches) (const FileTag *ftag, + const FileTag *candidate); +} SyncOps; + +/* + * These indexes must correspond to the values of the SyncRequestHandler enum. + */ +static const SyncOps syncsw[] = { + /* magnetic disk */ + [SYNC_HANDLER_MD] = { + .sync_syncfiletag = mdsyncfiletag, + .sync_unlinkfiletag = mdunlinkfiletag, + .sync_filetagmatches = mdfiletagmatches + }, + /* pg_xact */ + [SYNC_HANDLER_CLOG] = { + .sync_syncfiletag = clogsyncfiletag + }, + /* pg_commit_ts */ + [SYNC_HANDLER_COMMIT_TS] = { + .sync_syncfiletag = committssyncfiletag + }, + /* pg_multixact/offsets */ + [SYNC_HANDLER_MULTIXACT_OFFSET] = { + .sync_syncfiletag = multixactoffsetssyncfiletag + }, + /* pg_multixact/members */ + [SYNC_HANDLER_MULTIXACT_MEMBER] = { + .sync_syncfiletag = multixactmemberssyncfiletag + } +}; + +/* + * Initialize data structures for the file sync tracking. + */ +void +InitSync(void) +{ + /* + * Create pending-operations hashtable if we need it. Currently, we need + * it if we are standalone (not under a postmaster) or if we are a + * checkpointer auxiliary process. + */ + if (!IsUnderPostmaster || AmCheckpointerProcess()) + { + HASHCTL hash_ctl; + + /* + * XXX: The checkpointer needs to add entries to the pending ops table + * when absorbing fsync requests. That is done within a critical + * section, which isn't usually allowed, but we make an exception. It + * means that there's a theoretical possibility that you run out of + * memory while absorbing fsync requests, which leads to a PANIC. + * Fortunately the hash table is small so that's unlikely to happen in + * practice. + */ + pendingOpsCxt = AllocSetContextCreate(TopMemoryContext, + "Pending ops context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(pendingOpsCxt, true); + + hash_ctl.keysize = sizeof(FileTag); + hash_ctl.entrysize = sizeof(PendingFsyncEntry); + hash_ctl.hcxt = pendingOpsCxt; + pendingOps = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + pendingUnlinks = NIL; + } +} + +/* + * SyncPreCheckpoint() -- Do pre-checkpoint work + * + * To distinguish unlink requests that arrived before this checkpoint + * started from those that arrived during the checkpoint, we use a cycle + * counter similar to the one we use for fsync requests. That cycle + * counter is incremented here. 
+ * + * This must be called *before* the checkpoint REDO point is determined. + * That ensures that we won't delete files too soon. Since this calls + * AbsorbSyncRequests(), which performs memory allocations, it cannot be + * called within a critical section. + * + * Note that we can't do anything here that depends on the assumption + * that the checkpoint will be completed. + */ +void +SyncPreCheckpoint(void) +{ + /* + * Operations such as DROP TABLESPACE assume that the next checkpoint will + * process all recently forwarded unlink requests, but if they aren't + * absorbed prior to advancing the cycle counter, they won't be processed + * until a future checkpoint. The following absorb ensures that any + * unlink requests forwarded before the checkpoint began will be processed + * in the current checkpoint. + */ + AbsorbSyncRequests(); + + /* + * Any unlink requests arriving after this point will be assigned the next + * cycle counter, and won't be unlinked until next checkpoint. + */ + checkpoint_cycle_ctr++; +} + +/* + * SyncPostCheckpoint() -- Do post-checkpoint work + * + * Remove any lingering files that can now be safely removed. + */ +void +SyncPostCheckpoint(void) +{ + int absorb_counter; + ListCell *lc; + + absorb_counter = UNLINKS_PER_ABSORB; + foreach(lc, pendingUnlinks) + { + PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc); + char path[MAXPGPATH]; + + /* Skip over any canceled entries */ + if (entry->canceled) + continue; + + /* + * New entries are appended to the end, so if the entry is new we've + * reached the end of old entries. + * + * Note: if just the right number of consecutive checkpoints fail, we + * could be fooled here by cycle_ctr wraparound. However, the only + * consequence is that we'd delay unlinking for one more checkpoint, + * which is perfectly tolerable. + */ + if (entry->cycle_ctr == checkpoint_cycle_ctr) + break; + + /* Unlink the file */ + if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag, + path) < 0) + { + /* + * There's a race condition, when the database is dropped at the + * same time that we process the pending unlink requests. If the + * DROP DATABASE deletes the file before we do, we will get ENOENT + * here. rmtree() also has to ignore ENOENT errors, to deal with + * the possibility that we delete the file first. + */ + if (errno != ENOENT) + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", path))); + } + + /* Mark the list entry as canceled, just in case */ + entry->canceled = true; + + /* + * As in ProcessSyncRequests, we don't want to stop absorbing fsync + * requests for a long time when there are many deletions to be done. + * We can safely call AbsorbSyncRequests() at this point in the loop. + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = UNLINKS_PER_ABSORB; + } + } + + /* + * If we reached the end of the list, we can just remove the whole list + * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise, + * we must keep the entries at or after "lc". + */ + if (lc == NULL) + { + list_free_deep(pendingUnlinks); + pendingUnlinks = NIL; + } + else + { + int ntodelete = list_cell_number(pendingUnlinks, lc); + + for (int i = 0; i < ntodelete; i++) + pfree(list_nth(pendingUnlinks, i)); + + pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete); + } +} + +/* + * ProcessSyncRequests() -- Process queued fsync requests. 
+ */ +void +ProcessSyncRequests(void) +{ + static bool sync_in_progress = false; + + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *entry; + int absorb_counter; + + /* Statistics on sync times */ + int processed = 0; + instr_time sync_start, + sync_end, + sync_diff; + uint64 elapsed; + uint64 longest = 0; + uint64 total_elapsed = 0; + + /* + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOps. + */ + if (!pendingOps) + elog(ERROR, "cannot sync without a pendingOps table"); + + /* + * If we are in the checkpointer, the sync had better include all fsync + * requests that were queued by backends up to this point. The tightest + * race condition that could occur is that a buffer that must be written + * and fsync'd for the checkpoint could have been dumped by a backend just + * before it was visited by BufferSync(). We know the backend will have + * queued an fsync request before clearing the buffer's dirtybit, so we + * are safe as long as we do an Absorb after completing BufferSync(). + */ + AbsorbSyncRequests(); + + /* + * To avoid excess fsync'ing (in the worst case, maybe a never-terminating + * checkpoint), we want to ignore fsync requests that are entered into the + * hashtable after this point --- they should be processed next time, + * instead. We use sync_cycle_ctr to tell old entries apart from new + * ones: new ones will have cycle_ctr equal to the incremented value of + * sync_cycle_ctr. + * + * In normal circumstances, all entries present in the table at this point + * will have cycle_ctr exactly equal to the current (about to be old) + * value of sync_cycle_ctr. However, if we fail partway through the + * fsync'ing loop, then older values of cycle_ctr might remain when we + * come back here to try again. Repeated checkpoint failures would + * eventually wrap the counter around to the point where an old entry + * might appear new, causing us to skip it, possibly allowing a checkpoint + * to succeed that should not have. To forestall wraparound, any time the + * previous ProcessSyncRequests() failed to complete, run through the + * table and forcibly set cycle_ctr = sync_cycle_ctr. + * + * Think not to merge this loop with the main loop, as the problem is + * exactly that that loop may fail before having visited all the entries. + * From a performance point of view it doesn't matter anyway, as this path + * will never be taken in a system that's functioning normally. + */ + if (sync_in_progress) + { + /* prior try failed, so update any stale cycle_ctr values */ + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + entry->cycle_ctr = sync_cycle_ctr; + } + } + + /* Advance counter so that new hashtable entries are distinguishable */ + sync_cycle_ctr++; + + /* Set flag to detect failure if we don't reach the end of the loop */ + sync_in_progress = true; + + /* Now scan the hashtable for fsync requests to process */ + absorb_counter = FSYNCS_PER_ABSORB; + hash_seq_init(&hstat, pendingOps); + while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + int failures; + + /* + * If the entry is new then don't process it this time; it is new. + * Note "continue" bypasses the hash-remove call at the bottom of the + * loop. 
+ */ + if (entry->cycle_ctr == sync_cycle_ctr) + continue; + + /* Else assert we haven't missed it */ + Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr); + + /* + * If fsync is off then we don't have to bother opening the file at + * all. (We delay checking until this point so that changing fsync on + * the fly behaves sensibly.) + */ + if (enableFsync) + { + /* + * If in checkpointer, we want to absorb pending requests every so + * often to prevent overflow of the fsync request queue. It is + * unspecified whether newly-added entries will be visited by + * hash_seq_search, but we don't care since we don't need to + * process them anyway. + */ + if (--absorb_counter <= 0) + { + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; + } + + /* + * The fsync table could contain requests to fsync segments that + * have been deleted (unlinked) by the time we get to them. Rather + * than just hoping an ENOENT (or EACCES on Windows) error can be + * ignored, what we do on error is absorb pending requests and + * then retry. Since mdunlink() queues a "cancel" message before + * actually unlinking, the fsync request is guaranteed to be + * marked canceled after the absorb if it really was this case. + * DROP DATABASE likewise has to tell us to forget fsync requests + * before it starts deletions. + */ + for (failures = 0; !entry->canceled; failures++) + { + char path[MAXPGPATH]; + + INSTR_TIME_SET_CURRENT(sync_start); + if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag, + path) == 0) + { + /* Success; update statistics about sync timing */ + INSTR_TIME_SET_CURRENT(sync_end); + sync_diff = sync_end; + INSTR_TIME_SUBTRACT(sync_diff, sync_start); + elapsed = INSTR_TIME_GET_MICROSEC(sync_diff); + if (elapsed > longest) + longest = elapsed; + total_elapsed += elapsed; + processed++; + + if (log_checkpoints) + elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms", + processed, + path, + (double) elapsed / 1000); + + break; /* out of retry loop */ + } + + /* + * It is possible that the relation has been dropped or + * truncated since the fsync request was entered. Therefore, + * allow ENOENT, but only if we didn't fail already on this + * file. + */ + if (!FILE_POSSIBLY_DELETED(errno) || failures > 0) + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", + path))); + else + ereport(DEBUG1, + (errcode_for_file_access(), + errmsg_internal("could not fsync file \"%s\" but retrying: %m", + path))); + + /* + * Absorb incoming requests and check to see if a cancel + * arrived for this relation fork. + */ + AbsorbSyncRequests(); + absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */ + } /* end retry loop */ + } + + /* We are done with this entry, remove it */ + if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOps corrupted"); + } /* end loop over hashtable entries */ + + /* Return sync performance metrics for report at checkpoint end */ + CheckpointStats.ckpt_sync_rels = processed; + CheckpointStats.ckpt_longest_sync = longest; + CheckpointStats.ckpt_agg_sync_time = total_elapsed; + + /* Flag successful completion of ProcessSyncRequests */ + sync_in_progress = false; +} + +/* + * RememberSyncRequest() -- callback from checkpointer side of sync request + * + * We stuff fsync requests into the local hash table for execution + * during the checkpointer's next checkpoint. UNLINK requests go into a + * separate linked list, however, because they get processed separately. 
+ * + * See sync.h for more information on the types of sync requests supported. + */ +void +RememberSyncRequest(const FileTag *ftag, SyncRequestType type) +{ + Assert(pendingOps); + + if (type == SYNC_FORGET_REQUEST) + { + PendingFsyncEntry *entry; + + /* Cancel previously entered request */ + entry = (PendingFsyncEntry *) hash_search(pendingOps, + ftag, + HASH_FIND, + NULL); + if (entry != NULL) + entry->canceled = true; + } + else if (type == SYNC_FILTER_REQUEST) + { + HASH_SEQ_STATUS hstat; + PendingFsyncEntry *pfe; + ListCell *cell; + + /* Cancel matching fsync requests */ + hash_seq_init(&hstat, pendingOps); + while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL) + { + if (pfe->tag.handler == ftag->handler && + syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag)) + pfe->canceled = true; + } + + /* Cancel matching unlink requests */ + foreach(cell, pendingUnlinks) + { + PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell); + + if (pue->tag.handler == ftag->handler && + syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag)) + pue->canceled = true; + } + } + else if (type == SYNC_UNLINK_REQUEST) + { + /* Unlink request: put it in the linked list */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingUnlinkEntry *entry; + + entry = palloc(sizeof(PendingUnlinkEntry)); + entry->tag = *ftag; + entry->cycle_ctr = checkpoint_cycle_ctr; + entry->canceled = false; + + pendingUnlinks = lappend(pendingUnlinks, entry); + + MemoryContextSwitchTo(oldcxt); + } + else + { + /* Normal case: enter a request to fsync this segment */ + MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt); + PendingFsyncEntry *entry; + bool found; + + Assert(type == SYNC_REQUEST); + + entry = (PendingFsyncEntry *) hash_search(pendingOps, + ftag, + HASH_ENTER, + &found); + /* if new entry, or was previously canceled, initialize it */ + if (!found || entry->canceled) + { + entry->cycle_ctr = sync_cycle_ctr; + entry->canceled = false; + } + + /* + * NB: it's intentional that we don't change cycle_ctr if the entry + * already exists. The cycle_ctr must represent the oldest fsync + * request that could be in the entry. + */ + + MemoryContextSwitchTo(oldcxt); + } +} + +/* + * Register the sync request locally, or forward it to the checkpointer. + * + * If retryOnError is true, we'll keep trying if there is no space in the + * queue. Return true if we succeeded, or false if there wasn't space. + */ +bool +RegisterSyncRequest(const FileTag *ftag, SyncRequestType type, + bool retryOnError) +{ + bool ret; + + if (pendingOps != NULL) + { + /* standalone backend or startup process: fsync state is local */ + RememberSyncRequest(ftag, type); + return true; + } + + for (;;) + { + /* + * Notify the checkpointer about it. If we fail to queue a message in + * retryOnError mode, we have to sleep and try again ... ugly, but + * hopefully won't happen often. + * + * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an + * error in the case of SYNC_UNLINK_REQUEST would leave the + * no-longer-used file still present on disk, which would be bad, so + * I'm inclined to assume that the checkpointer will always empty the + * queue soon. + */ + ret = ForwardSyncRequest(ftag, type); + + /* + * If we are successful in queueing the request, or we failed and were + * instructed not to retry on error, break. 
+ */ + if (ret || (!ret && !retryOnError)) + break; + + WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10, + WAIT_EVENT_REGISTER_SYNC_REQUEST); + } + + return ret; +} -- cgit v1.2.3
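The sketches below are editorial usage notes for the storage-manager and sync interfaces added by this patch; they are not part of the upstream sources. First, the smgrread()/smgrnblocks() comments describe how the buffer manager pulls pages through the storage-manager switch. A minimal sketch of that calling pattern, assuming backend-internal context (a valid RelFileLocator and a shared-buffer-style caller); the helper name read_block_directly is illustrative, and error handling is reduced to a single elog().

#include "postgres.h"

#include "common/relpath.h"		/* ForkNumber, MAIN_FORKNUM */
#include "storage/backendid.h"	/* InvalidBackendId */
#include "storage/block.h"
#include "storage/smgr.h"

/*
 * Illustrative only: fetch one block of a relation straight through the
 * smgr layer, roughly the way bufmgr.c does when filling a shared buffer.
 */
static void
read_block_directly(RelFileLocator rlocator, BlockNumber blkno,
					PGAlignedBlock *buf)
{
	/* InvalidBackendId: not a backend-local temporary relation */
	SMgrRelation reln = smgropen(rlocator, InvalidBackendId);

	/* smgrnblocks() consults smgr_cached_nblocks first (during recovery) */
	if (blkno >= smgrnblocks(reln, MAIN_FORKNUM))
		elog(ERROR, "block %u is past end of relation", blkno);

	/* optional readahead hint; may report a missing file during recovery */
	(void) smgrprefetch(reln, MAIN_FORKNUM, blkno);

	/* synchronous read into the caller-supplied page buffer */
	smgrread(reln, MAIN_FORKNUM, blkno, buf->data);
}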
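The smgrimmedsync() comment spells out the pattern for building a brand-new relation fork: write every page with skipFsync = true, then force the whole file to disk once. A sketch of that pattern, assuming the caller deliberately skips WAL-logging (as the comment warns, this is not sufficient for PITR or replication); build_fork_unlogged and nblocks are illustrative names.

#include "postgres.h"

#include "storage/bufpage.h"
#include "storage/smgr.h"

/*
 * Illustrative only: fill a new fork page by page and fsync it once at the
 * end, instead of fsyncing (or WAL-logging) each individual write.
 */
static void
build_fork_unlogged(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	PGAlignedBlock page;

	smgrcreate(reln, forknum, false);

	for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
	{
		PageInit((Page) page.data, BLCKSZ, 0);

		/* skipFsync = true: the single smgrimmedsync() below covers us */
		smgrextend(reln, forknum, blkno, page.data, true);
	}

	/* one synchronous flush covers all of the writes above */
	smgrimmedsync(reln, forknum);
}

This mirrors the shape of an index build described in the smgrimmedsync() comment: bulk writes via smgrextend(), then one fsync before the transaction commits.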
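In sync.c, the syncsw[] table binds each SyncRequestHandler to its callbacks, and sync_syncfiletag is expected to resolve the FileTag to a path (used in error messages), fsync that file, and return 0 on success or a negative value with errno set. The callback below is a hypothetical handler written only to show that contract; demo_syncfiletag, DEMO_DIR, and the file-naming scheme are inventions, not existing PostgreSQL code.

#include "postgres.h"

#include <fcntl.h>

#include "storage/fd.h"
#include "storage/sync.h"

#define DEMO_DIR	"pg_demo"	/* hypothetical directory */

/*
 * Hypothetical sync_syncfiletag callback: map the tag's segment number to a
 * file name, fsync that file, and hand the path back to
 * ProcessSyncRequests() for use in its error messages.
 */
static int
demo_syncfiletag(const FileTag *ftag, char *path)
{
	int			fd;
	int			save_errno;
	int			result;

	snprintf(path, MAXPGPATH, DEMO_DIR "/%u", ftag->segno);

	fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
	if (fd < 0)
		return -1;				/* errno left set for the caller */

	result = pg_fsync(fd);
	save_errno = errno;

	CloseTransientFile(fd);

	errno = save_errno;
	return result;
}

The real handlers (mdsyncfiletag(), clogsyncfiletag(), and so on) additionally cope with already-unlinked segments and SLRU locking, which this sketch omits.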
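ProcessSyncRequests() and SyncPostCheckpoint() lean on small unsigned cycle counters: an entry stamped with the just-incremented counter is "new" and is skipped until the next pass, while old entries are expected to be exactly one cycle behind, compared modulo the counter width. A small self-contained C program (independent of the backend) showing that the modular comparison used above survives uint16 wraparound.

#include <stdio.h>
#include <stdint.h>

typedef uint16_t CycleCtr;		/* same width as sync.c's CycleCtr */

int
main(void)
{
	CycleCtr	sync_cycle_ctr = UINT16_MAX;	/* about to wrap around */
	CycleCtr	old_entry;
	CycleCtr	new_entry;

	old_entry = sync_cycle_ctr;	/* stamped during the previous cycle */

	/* a new ProcessSyncRequests() pass begins: advance the counter */
	sync_cycle_ctr++;			/* wraps from 65535 to 0 */

	new_entry = sync_cycle_ctr;	/* request absorbed during this pass */

	/* new entries are skipped this time around */
	printf("new entry skipped: %s\n",
		   new_entry == sync_cycle_ctr ? "yes" : "no");

	/* old entries must be exactly one cycle behind, modulo wraparound */
	printf("old entry is one cycle old: %s\n",
		   (CycleCtr) (old_entry + 1) == sync_cycle_ctr ? "yes" : "no");

	return 0;
}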
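RegisterSyncRequest() is the single entry point for queueing, canceling, and filtering sync requests; callers describe the file with a FileTag naming the handler, fork, and segment. A sketch of the md.c-style calling pattern, assuming backend context; remember_dirty_segment and the direct field assignments stand in for md.c's own INIT_MD_FILETAG helper.

#include "postgres.h"

#include "common/relpath.h"
#include "storage/sync.h"

/*
 * Illustrative only: ask the checkpointer to fsync one segment of a
 * relation fork before the next checkpoint completes.
 */
static void
remember_dirty_segment(RelFileLocator rlocator, ForkNumber forknum,
					   uint32 segno)
{
	FileTag		tag;

	memset(&tag, 0, sizeof(tag));	/* zero padding: tags are hashed whole */
	tag.handler = SYNC_HANDLER_MD;
	tag.forknum = forknum;
	tag.rlocator = rlocator;
	tag.segno = segno;

	/*
	 * retryOnError = true: sleep and retry if the checkpointer's request
	 * queue is full, rather than falling back to fsyncing here ourselves.
	 */
	(void) RegisterSyncRequest(&tag, SYNC_REQUEST, true);
}

Sending SYNC_FORGET_REQUEST for the same tag later cancels the pending fsync, which is why the comments above insist that mdunlink() queue its cancel message before actually unlinking the file.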