Diffstat (limited to 'src/backend/storage')
-rw-r--r--  src/backend/storage/Makefile  13
-rw-r--r--  src/backend/storage/buffer/Makefile  22
-rw-r--r--  src/backend/storage/buffer/README  276
-rw-r--r--  src/backend/storage/buffer/buf_init.c  186
-rw-r--r--  src/backend/storage/buffer/buf_table.c  162
-rw-r--r--  src/backend/storage/buffer/bufmgr.c  5594
-rw-r--r--  src/backend/storage/buffer/freelist.c  774
-rw-r--r--  src/backend/storage/buffer/localbuf.c  821
-rw-r--r--  src/backend/storage/buffer/meson.build  9
-rw-r--r--  src/backend/storage/file/Makefile  23
-rw-r--r--  src/backend/storage/file/buffile.c  1039
-rw-r--r--  src/backend/storage/file/copydir.c  216
-rw-r--r--  src/backend/storage/file/fd.c  3976
-rw-r--r--  src/backend/storage/file/fileset.c  205
-rw-r--r--  src/backend/storage/file/meson.build  10
-rw-r--r--  src/backend/storage/file/reinit.c  422
-rw-r--r--  src/backend/storage/file/sharedfileset.c  120
-rw-r--r--  src/backend/storage/freespace/Makefile  20
-rw-r--r--  src/backend/storage/freespace/README  196
-rw-r--r--  src/backend/storage/freespace/freespace.c  865
-rw-r--r--  src/backend/storage/freespace/fsmpage.c  374
-rw-r--r--  src/backend/storage/freespace/indexfsm.c  74
-rw-r--r--  src/backend/storage/freespace/meson.build  7
-rw-r--r--  src/backend/storage/ipc/Makefile  29
-rw-r--r--  src/backend/storage/ipc/barrier.c  333
-rw-r--r--  src/backend/storage/ipc/dsm.c  1257
-rw-r--r--  src/backend/storage/ipc/dsm_impl.c  1053
-rw-r--r--  src/backend/storage/ipc/ipc.c  439
-rw-r--r--  src/backend/storage/ipc/ipci.c  354
-rw-r--r--  src/backend/storage/ipc/latch.c  2268
-rw-r--r--  src/backend/storage/ipc/meson.build  21
-rw-r--r--  src/backend/storage/ipc/pmsignal.c  462
-rw-r--r--  src/backend/storage/ipc/procarray.c  5224
-rw-r--r--  src/backend/storage/ipc/procsignal.c  688
-rw-r--r--  src/backend/storage/ipc/shm_mq.c  1329
-rw-r--r--  src/backend/storage/ipc/shm_toc.c  272
-rw-r--r--  src/backend/storage/ipc/shmem.c  584
-rw-r--r--  src/backend/storage/ipc/signalfuncs.c  318
-rw-r--r--  src/backend/storage/ipc/sinval.c  205
-rw-r--r--  src/backend/storage/ipc/sinvaladt.c  791
-rw-r--r--  src/backend/storage/ipc/standby.c  1518
-rw-r--r--  src/backend/storage/large_object/Makefile  18
-rw-r--r--  src/backend/storage/large_object/inv_api.c  954
-rw-r--r--  src/backend/storage/large_object/meson.build  5
-rw-r--r--  src/backend/storage/lmgr/.gitignore  3
-rw-r--r--  src/backend/storage/lmgr/Makefile  52
-rw-r--r--  src/backend/storage/lmgr/README  731
-rw-r--r--  src/backend/storage/lmgr/README-SSI  646
-rw-r--r--  src/backend/storage/lmgr/README.barrier  197
-rw-r--r--  src/backend/storage/lmgr/condition_variable.c  360
-rw-r--r--  src/backend/storage/lmgr/deadlock.c  1159
-rw-r--r--  src/backend/storage/lmgr/generate-lwlocknames.pl  77
-rw-r--r--  src/backend/storage/lmgr/lmgr.c  1270
-rw-r--r--  src/backend/storage/lmgr/lock.c  4651
-rw-r--r--  src/backend/storage/lmgr/lwlock.c  1973
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.c  52
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.h  50
-rw-r--r--  src/backend/storage/lmgr/lwlocknames.txt  55
-rw-r--r--  src/backend/storage/lmgr/meson.build  15
-rw-r--r--  src/backend/storage/lmgr/predicate.c  4997
-rw-r--r--  src/backend/storage/lmgr/proc.c  1897
-rw-r--r--  src/backend/storage/lmgr/s_lock.c  324
-rw-r--r--  src/backend/storage/lmgr/spin.c  180
-rw-r--r--  src/backend/storage/meson.build  11
-rw-r--r--  src/backend/storage/page/Makefile  23
-rw-r--r--  src/backend/storage/page/README  64
-rw-r--r--  src/backend/storage/page/bufpage.c  1549
-rw-r--r--  src/backend/storage/page/checksum.c  22
-rw-r--r--  src/backend/storage/page/itemptr.c  131
-rw-r--r--  src/backend/storage/page/meson.build  7
-rw-r--r--  src/backend/storage/smgr/Makefile  19
-rw-r--r--  src/backend/storage/smgr/README  52
-rw-r--r--  src/backend/storage/smgr/md.c  1623
-rw-r--r--  src/backend/storage/smgr/meson.build  6
-rw-r--r--  src/backend/storage/smgr/smgr.c  767
-rw-r--r--  src/backend/storage/sync/Makefile  18
-rw-r--r--  src/backend/storage/sync/meson.build  6
-rw-r--r--  src/backend/storage/sync/sync.c  624
78 files changed, 57137 insertions, 0 deletions
diff --git a/src/backend/storage/Makefile b/src/backend/storage/Makefile
new file mode 100644
index 0000000..8376cdf
--- /dev/null
+++ b/src/backend/storage/Makefile
@@ -0,0 +1,13 @@
+#
+# Makefile for the storage manager subsystem
+#
+# src/backend/storage/Makefile
+#
+
+subdir = src/backend/storage
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+SUBDIRS = buffer file freespace ipc large_object lmgr page smgr sync
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile
new file mode 100644
index 0000000..fd7c40d
--- /dev/null
+++ b/src/backend/storage/buffer/Makefile
@@ -0,0 +1,22 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/buffer
+#
+# IDENTIFICATION
+# src/backend/storage/buffer/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/buffer
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ buf_init.o \
+ buf_table.o \
+ bufmgr.o \
+ freelist.o \
+ localbuf.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README
new file mode 100644
index 0000000..011af7a
--- /dev/null
+++ b/src/backend/storage/buffer/README
@@ -0,0 +1,276 @@
+src/backend/storage/buffer/README
+
+Notes About Shared Buffer Access Rules
+======================================
+
+There are two separate access control mechanisms for shared disk buffers:
+reference counts (a/k/a pin counts) and buffer content locks. (Actually,
+there's a third level of access control: one must hold the appropriate kind
+of lock on a relation before one can legally access any page belonging to
+the relation. Relation-level locks are not discussed here.)
+
+Pins: one must "hold a pin on" a buffer (increment its reference count)
+before being allowed to do anything at all with it. An unpinned buffer is
+subject to being reclaimed and reused for a different page at any instant,
+so touching it is unsafe. Normally a pin is acquired via ReadBuffer and
+released via ReleaseBuffer. It is OK and indeed common for a single
+backend to pin a page more than once concurrently; the buffer manager
+handles this efficiently. It is considered OK to hold a pin for long
+intervals --- for example, sequential scans hold a pin on the current page
+until done processing all the tuples on the page, which could be quite a
+while if the scan is the outer scan of a join. Similarly, a btree index
+scan may hold a pin on the current index page. This is OK because normal
+operations never wait for a page's pin count to drop to zero. (Anything
+that might need to do such a wait is instead handled by waiting to obtain
+the relation-level lock, which is why you'd better hold one first.) Pins
+may not be held across transaction boundaries, however.
+
+Buffer content locks: there are two kinds of buffer lock, shared and exclusive,
+which act just as you'd expect: multiple backends can hold shared locks on
+the same buffer, but an exclusive lock prevents anyone else from holding
+either shared or exclusive lock. (These can alternatively be called READ
+and WRITE locks.) These locks are intended to be short-term: they should not
+be held for long. Buffer locks are acquired and released by LockBuffer().
+It will *not* work for a single backend to try to acquire multiple locks on
+the same buffer. One must pin a buffer before trying to lock it.
+
+Buffer access rules:
+
+1. To scan a page for tuples, one must hold a pin and either shared or
+exclusive content lock. To examine the commit status (XIDs and status bits)
+of a tuple in a shared buffer, one must likewise hold a pin and either shared
+or exclusive lock.
+
+2. Once one has determined that a tuple is interesting (visible to the
+current transaction) one may drop the content lock, yet continue to access
+the tuple's data for as long as one holds the buffer pin. This is what is
+typically done by heap scans, since the tuple returned by heap_fetch
+contains a pointer to tuple data in the shared buffer. Therefore the
+tuple cannot go away while the pin is held (see rule #5). Its state could
+change, but that is assumed not to matter after the initial determination
+of visibility is made.
+
+3. To add a tuple or change the xmin/xmax fields of an existing tuple,
+one must hold a pin and an exclusive content lock on the containing buffer.
+This ensures that no one else might see a partially-updated state of the
+tuple while they are doing visibility checks.
+
+4. It is considered OK to update tuple commit status bits (ie, OR the
+values HEAP_XMIN_COMMITTED, HEAP_XMIN_INVALID, HEAP_XMAX_COMMITTED, or
+HEAP_XMAX_INVALID into t_infomask) while holding only a shared lock and
+pin on a buffer. This is OK because another backend looking at the tuple
+at about the same time would OR the same bits into the field, so there
+is little or no risk of conflicting update; what's more, if there did
+manage to be a conflict it would merely mean that one bit-update would
+be lost and need to be done again later. These four bits are only hints
+(they cache the results of transaction status lookups in pg_xact), so no
+great harm is done if they get reset to zero by conflicting updates.
+Note, however, that a tuple is frozen by setting both HEAP_XMIN_INVALID
+and HEAP_XMIN_COMMITTED; this is a critical update and accordingly requires
+an exclusive buffer lock (and it must also be WAL-logged).
+
+5. To physically remove a tuple or compact free space on a page, one
+must hold a pin and an exclusive lock, *and* observe while holding the
+exclusive lock that the buffer's shared reference count is one (ie,
+no other backend holds a pin). If these conditions are met then no other
+backend can perform a page scan until the exclusive lock is dropped, and
+no other backend can be holding a reference to an existing tuple that it
+might expect to examine again. Note that another backend might pin the
+buffer (increment the refcount) while one is performing the cleanup, but
+it won't be able to actually examine the page until it acquires shared
+or exclusive content lock.
+
+
+Obtaining the lock needed under rule #5 is done by the bufmgr routines
+LockBufferForCleanup() or ConditionalLockBufferForCleanup(). They first get
+an exclusive lock and then check to see if the shared pin count is currently
+1. If not, ConditionalLockBufferForCleanup() releases the exclusive lock and
+then returns false, while LockBufferForCleanup() releases the exclusive lock
+(but not the caller's pin) and waits until signaled by another backend,
+whereupon it tries again. The signal will occur when UnpinBuffer decrements
+the shared pin count to 1. As indicated above, this operation might have to
+wait a good while before it acquires the lock, but that shouldn't matter much
+for concurrent VACUUM. The current implementation only supports a single
+waiter for pin-count-1 on any particular shared buffer. This is enough for
+VACUUM's use, since we don't allow multiple VACUUMs concurrently on a single
+relation anyway. Anyone wishing to obtain a cleanup lock outside of recovery
+or a VACUUM must use the conditional variant of the function.
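As a minimal sketch (not part of this patch; rel and blkno stand in for the
caller's context), a backend that already holds a suitable relation-level lock
typically follows rules #1 and #2 like this:

    Buffer      buf;

    buf = ReadBuffer(rel, blkno);           /* rule #1: acquire a pin */
    LockBuffer(buf, BUFFER_LOCK_SHARE);     /* short-term content lock */
    /* ... scan the page, decide which tuples are interesting ... */
    LockBuffer(buf, BUFFER_LOCK_UNLOCK);    /* rule #2: drop the content lock */
    /* ... tuple data may still be examined while the pin is held ... */
    ReleaseBuffer(buf);                     /* finally drop the pin */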
+
+
+Buffer Manager's Internal Locking
+---------------------------------
+
+Before PostgreSQL 8.1, all operations of the shared buffer manager itself
+were protected by a single system-wide lock, the BufMgrLock, which
+unsurprisingly proved to be a source of contention. The new locking scheme
+avoids grabbing system-wide exclusive locks in common code paths. It works
+like this:
+
+* There is a system-wide LWLock, the BufMappingLock, that notionally
+protects the mapping from buffer tags (page identifiers) to buffers.
+(Physically, it can be thought of as protecting the hash table maintained
+by buf_table.c.) To look up whether a buffer exists for a tag, it is
+sufficient to obtain share lock on the BufMappingLock. Note that one
+must pin the found buffer, if any, before releasing the BufMappingLock.
+To alter the page assignment of any buffer, one must hold exclusive lock
+on the BufMappingLock. This lock must be held across adjusting the buffer's
+header fields and changing the buf_table hash table. The only common
+operation that needs exclusive lock is reading in a page that was not
+in shared buffers already, which will require at least a kernel call
+and usually a wait for I/O, so it will be slow anyway.
+
+* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS
+separate locks, each guarding a portion of the buffer tag space. This allows
+further reduction of contention in the normal code paths. The partition
+that a particular buffer tag belongs to is determined from the low-order
+bits of the tag's hash value. The rules stated above apply to each partition
+independently. If it is necessary to lock more than one partition at a time,
+they must be locked in partition-number order to avoid risk of deadlock.
+
+* A separate system-wide spinlock, buffer_strategy_lock, provides mutual
+exclusion for operations that access the buffer free list or select
+buffers for replacement. A spinlock is used here rather than a lightweight
+lock for efficiency; no other locks of any sort should be acquired while
+buffer_strategy_lock is held. This is essential to allow buffer replacement
+to happen in multiple backends with reasonable concurrency.
+
+* Each buffer header contains a spinlock that must be taken when examining
+or changing fields of that buffer header. This allows operations such as
+ReleaseBuffer to make local state changes without taking any system-wide
+lock. We use a spinlock, not an LWLock, since there are no cases where
+the lock needs to be held for more than a few instructions.
+
+Note that a buffer header's spinlock does not control access to the data
+held within the buffer. Each buffer header also contains an LWLock, the
+"buffer content lock", that *does* represent the right to access the data
+in the buffer. It is used per the rules above.
+
+* The BM_IO_IN_PROGRESS flag acts as a kind of lock, used to wait for I/O on a
+buffer to complete (and in releases before 14, it was accompanied by a
+per-buffer LWLock). The process doing a read or write sets the flag for the
+duration, and processes that need to wait for it to be cleared sleep on a
+condition variable.
+
+
+Normal Buffer Replacement Strategy
+----------------------------------
+
+There is a "free list" of buffers that are prime candidates for replacement.
+In particular, buffers that are completely free (contain no valid page) are
+always in this list. We could also throw buffers into this list if we
+consider their pages unlikely to be needed soon; however, the current
+algorithm never does that. The list is singly-linked using fields in the
+buffer headers; we maintain head and tail pointers in global variables.
+(Note: although the list links are in the buffer headers, they are
+considered to be protected by the buffer_strategy_lock, not the buffer-header
+spinlocks.) To choose a victim buffer to recycle when there are no free
+buffers available, we use a simple clock-sweep algorithm, which avoids the
+need to take system-wide locks during common operations. It works like
+this:
+
+Each buffer header contains a usage counter, which is incremented (up to a
+small limit value) whenever the buffer is pinned. (This requires only the
+buffer header spinlock, which would have to be taken anyway to increment the
+buffer reference count, so it's nearly free.)
+
+The "clock hand" is a buffer index, nextVictimBuffer, that moves circularly
+through all the available buffers. nextVictimBuffer is protected by the
+buffer_strategy_lock.
+
+The algorithm for a process that needs to obtain a victim buffer is:
+
+1. Obtain buffer_strategy_lock.
+
+2. If the buffer free list is nonempty, remove its head buffer. Release
+buffer_strategy_lock. If the buffer is pinned or has a nonzero usage count,
+it cannot be used; ignore it and go back to step 1. Otherwise, pin the buffer,
+and return it.
+
+3. Otherwise, the buffer free list is empty. Select the buffer pointed to by
+nextVictimBuffer, and circularly advance nextVictimBuffer for next time.
+Release buffer_strategy_lock.
+
+4. If the selected buffer is pinned or has a nonzero usage count, it cannot
+be used. Decrement its usage count (if nonzero), reacquire
+buffer_strategy_lock, and return to step 3 to examine the next buffer.
+
+5. Pin the selected buffer, and return.
+
+(Note that if the selected buffer is dirty, we will have to write it out
+before we can recycle it; if someone else pins the buffer meanwhile we will
+have to give up and try another buffer. This however is not a concern
+of the basic select-a-victim-buffer algorithm.)
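The five steps above amount to sweeping a clock hand over per-buffer usage
counts. The toy program below is a self-contained illustration of just that
sweep (no pins, no free list, no locking); it is deliberately much simpler
than the real freelist.c code:

    #include <stdio.h>

    #define NBUFFERS 8

    static int usage[NBUFFERS] = {3, 0, 1, 0, 2, 5, 0, 4};
    static int next_victim = 0;     /* the "clock hand" */

    /*
     * Advance the hand until a buffer with usage count zero is found,
     * decrementing counts as we pass them (steps 3-5, minus pinning).
     */
    static int
    clock_sweep(void)
    {
        for (;;)
        {
            int     candidate = next_victim;

            next_victim = (next_victim + 1) % NBUFFERS; /* advance circularly */
            if (usage[candidate] == 0)
                return candidate;                       /* victim found */
            usage[candidate]--;                         /* step 4 */
        }
    }

    int
    main(void)
    {
        printf("victim = %d\n", clock_sweep());
        return 0;
    }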
+
+
+Buffer Ring Replacement Strategy
+---------------------------------
+
+When running a query that needs to access a large number of pages just once,
+such as VACUUM or a large sequential scan, a different strategy is used.
+A page that has been touched only by such a scan is unlikely to be needed
+again soon, so instead of running the normal clock sweep algorithm and
+blowing out the entire buffer cache, a small ring of buffers is allocated
+using the normal clock sweep algorithm and those buffers are reused for the
+whole scan. This also implies that much of the write traffic caused by such
+a statement will be done by the backend itself and not pushed off onto other
+processes.
+
+For sequential scans, a 256KB ring is used. That's small enough to fit in L2
+cache, which makes transferring pages from OS cache to shared buffer cache
+efficient. Even less would often be enough, but the ring must be big enough
+to accommodate all pages in the scan that are pinned concurrently. 256KB
+should also be enough to leave a small cache trail for other backends to
+join in a synchronized seq scan. If a ring buffer is dirtied and its LSN
+updated, we would normally have to write and flush WAL before we could
+re-use the buffer; in this case we instead discard the buffer from the ring
+and (later) choose a replacement using the normal clock-sweep algorithm.
+Hence this strategy works best for scans that are read-only (or at worst
+update hint bits). In a scan that modifies every page in the scan, like a
+bulk UPDATE or DELETE, the buffers in the ring will always be dirtied and
+the ring strategy effectively degrades to the normal strategy.
+
+VACUUM uses a ring like sequential scans; however, the size of this ring is
+controlled by the vacuum_buffer_usage_limit GUC. Dirty pages are not removed
+from the ring. Instead, WAL is flushed if needed to allow reuse of the
+buffers. Before introducing the buffer ring strategy in 8.3, VACUUM's buffers
+were sent to the freelist, which was effectively a buffer ring of 1 buffer,
+resulting in excessive WAL flushing.
+
+Bulk writes work similarly to VACUUM. Currently this applies only to
+COPY IN and CREATE TABLE AS SELECT. (Might it be interesting to make
+seqscan UPDATE and DELETE use the bulkwrite strategy?) For bulk writes
+we use a ring size of 16MB (but not more than 1/8th of shared_buffers).
+Smaller sizes have been shown to result in the COPY blocking too often
+for WAL flushes. While it's okay for a background vacuum to be slowed by
+doing its own WAL flushing, we'd prefer that COPY not be subject to that,
+so we let it use up a bit more of the buffer arena.
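Callers opt into one of these rings by passing a BufferAccessStrategy to the
buffer manager. As an illustrative sketch (an assumed caller, not code from
this patch; rel and nblocks come from the caller's context), a bulk read might
look like:

    BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
    BlockNumber blkno;

    for (blkno = 0; blkno < nblocks; blkno++)
    {
        Buffer      buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                                             RBM_NORMAL, strategy);

        /* ... process the page under the usual pin/lock rules ... */
        ReleaseBuffer(buf);
    }
    FreeAccessStrategy(strategy);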
+
+
+Background Writer's Processing
+------------------------------
+
+The background writer is designed to write out pages that are likely to be
+recycled soon, thereby offloading the writing work from active backends.
+To do this, it scans forward circularly from the current position of
+nextVictimBuffer (which it does not change!), looking for buffers that are
+dirty and not pinned nor marked with a positive usage count. It pins,
+writes, and releases any such buffer.
+
+If we can assume that reading nextVictimBuffer is an atomic action, then
+the writer doesn't even need to take buffer_strategy_lock in order to look
+for buffers to write; it needs only to spinlock each buffer header for long
+enough to check the dirtybit. Even without that assumption, the writer
+only needs to take the lock long enough to read the variable value, not
+while scanning the buffers. (This is a very substantial improvement in
+the contention cost of the writer compared to PG 8.0.)
+
+The background writer takes shared content lock on a buffer while writing it
+out (and anyone else who flushes buffer contents to disk must do so too).
+This ensures that the page image transferred to disk is reasonably consistent.
+We might miss a hint-bit update or two but that isn't a problem, for the same
+reasons mentioned under buffer access rules.
+
+As of 8.4, the background writer also starts during recovery when there is
+some form of potentially extended recovery to perform. It performs the
+same service as in normal processing, except that the checkpoints it
+writes are technically restartpoints.
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
new file mode 100644
index 0000000..0057443
--- /dev/null
+++ b/src/backend/storage/buffer/buf_init.c
@@ -0,0 +1,186 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_init.c
+ * buffer manager initialization routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/buf_init.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/proc.h"
+
+BufferDescPadded *BufferDescriptors;
+char *BufferBlocks;
+ConditionVariableMinimallyPadded *BufferIOCVArray;
+WritebackContext BackendWritebackContext;
+CkptSortItem *CkptBufferIds;
+
+
+/*
+ * Data Structures:
+ * buffers live in a freelist and a lookup data structure.
+ *
+ *
+ * Buffer Lookup:
+ * One important note: the buffer has to be
+ * available for lookup BEFORE an IO begins. Otherwise
+ * a second process trying to read the buffer will
+ * allocate its own copy and the buffer pool will
+ * become inconsistent.
+ *
+ * Buffer Replacement:
+ * see freelist.c. A buffer cannot be replaced while in
+ * use either by data manager or during IO.
+ *
+ *
+ * Synchronization/Locking:
+ *
+ * IO_IN_PROGRESS -- this is a flag in the buffer descriptor.
+ * It must be set when an IO is initiated and cleared at
+ * the end of the IO. It is there to make sure that one
+ * process doesn't start to use a buffer while another is
+ * faulting it in. see WaitIO and related routines.
+ *
+ * refcount -- Counts the number of processes holding pins on a buffer.
+ * A buffer is pinned during IO and immediately after a BufferAlloc().
+ * Pins must be released before end of transaction. For efficiency the
+ * shared refcount isn't increased if an individual backend pins a buffer
+ * multiple times. Check the PrivateRefCount infrastructure in bufmgr.c.
+ */
+
+
+/*
+ * Initialize shared buffer pool
+ *
+ * This is called once during shared-memory initialization (either in the
+ * postmaster, or in a standalone backend).
+ */
+void
+InitBufferPool(void)
+{
+ bool foundBufs,
+ foundDescs,
+ foundIOCV,
+ foundBufCkpt;
+
+ /* Align descriptors to a cacheline boundary. */
+ BufferDescriptors = (BufferDescPadded *)
+ ShmemInitStruct("Buffer Descriptors",
+ NBuffers * sizeof(BufferDescPadded),
+ &foundDescs);
+
+ /* Align buffer pool on IO page size boundary. */
+ BufferBlocks = (char *)
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ ShmemInitStruct("Buffer Blocks",
+ NBuffers * (Size) BLCKSZ + PG_IO_ALIGN_SIZE,
+ &foundBufs));
+
+ /* Align condition variables to cacheline boundary. */
+ BufferIOCVArray = (ConditionVariableMinimallyPadded *)
+ ShmemInitStruct("Buffer IO Condition Variables",
+ NBuffers * sizeof(ConditionVariableMinimallyPadded),
+ &foundIOCV);
+
+ /*
+ * The array used to sort to-be-checkpointed buffer ids is located in
+ * shared memory, to avoid having to allocate significant amounts of
+ * memory at runtime. As that'd be in the middle of a checkpoint, or when
+ * the checkpointer is restarted, memory allocation failures would be
+ * painful.
+ */
+ CkptBufferIds = (CkptSortItem *)
+ ShmemInitStruct("Checkpoint BufferIds",
+ NBuffers * sizeof(CkptSortItem), &foundBufCkpt);
+
+ if (foundDescs || foundBufs || foundIOCV || foundBufCkpt)
+ {
+ /* should find all of these, or none of them */
+ Assert(foundDescs && foundBufs && foundIOCV && foundBufCkpt);
+ /* note: this path is only taken in EXEC_BACKEND case */
+ }
+ else
+ {
+ int i;
+
+ /*
+ * Initialize all the buffer headers.
+ */
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+
+ ClearBufferTag(&buf->tag);
+
+ pg_atomic_init_u32(&buf->state, 0);
+ buf->wait_backend_pgprocno = INVALID_PGPROCNO;
+
+ buf->buf_id = i;
+
+ /*
+ * Initially link all the buffers together as unused. Subsequent
+ * management of this list is done by freelist.c.
+ */
+ buf->freeNext = i + 1;
+
+ LWLockInitialize(BufferDescriptorGetContentLock(buf),
+ LWTRANCHE_BUFFER_CONTENT);
+
+ ConditionVariableInit(BufferDescriptorGetIOCV(buf));
+ }
+
+ /* Correct last entry of linked list */
+ GetBufferDescriptor(NBuffers - 1)->freeNext = FREENEXT_END_OF_LIST;
+ }
+
+ /* Init other shared buffer-management stuff */
+ StrategyInitialize(!foundDescs);
+
+ /* Initialize per-backend file flush context */
+ WritebackContextInit(&BackendWritebackContext,
+ &backend_flush_after);
+}
+
+/*
+ * BufferShmemSize
+ *
+ * compute the size of shared memory for the buffer pool including
+ * data pages, buffer descriptors, hash tables, etc.
+ */
+Size
+BufferShmemSize(void)
+{
+ Size size = 0;
+
+ /* size of buffer descriptors */
+ size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
+ /* to allow aligning buffer descriptors */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
+ /* size of data pages, plus alignment padding */
+ size = add_size(size, PG_IO_ALIGN_SIZE);
+ size = add_size(size, mul_size(NBuffers, BLCKSZ));
+
+ /* size of stuff controlled by freelist.c */
+ size = add_size(size, StrategyShmemSize());
+
+ /* size of I/O condition variables */
+ size = add_size(size, mul_size(NBuffers,
+ sizeof(ConditionVariableMinimallyPadded)));
+ /* to allow aligning the above */
+ size = add_size(size, PG_CACHE_LINE_SIZE);
+
+ /* size of checkpoint sort array in bufmgr.c */
+ size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
+
+ return size;
+}
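As a rough worked example (assuming the default BLCKSZ of 8192 and
shared_buffers = 128MB, i.e. NBuffers = 16384): the data pages dominate at
16384 * 8192 bytes = 128MB, the padded descriptors add 16384 * 64 bytes = 1MB
on typical 64-bit builds, and the condition-variable array, the checkpoint
sort array, the freelist/hash-table state counted by StrategyShmemSize(), and
the alignment padding add a comparatively small amount on top of that.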
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
new file mode 100644
index 0000000..2b96639
--- /dev/null
+++ b/src/backend/storage/buffer/buf_table.c
@@ -0,0 +1,162 @@
+/*-------------------------------------------------------------------------
+ *
+ * buf_table.c
+ * routines for mapping BufferTags to buffer indexes.
+ *
+ * Note: the routines in this file do no locking of their own. The caller
+ * must hold a suitable lock on the appropriate BufMappingLock, as specified
+ * in the comments. We can't do the locking inside these functions because
+ * in most cases the caller needs to adjust the buffer header contents
+ * before the lock is released (see notes in README).
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/buf_table.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated buffer ID */
+} BufferLookupEnt;
+
+static HTAB *SharedBufHash;
+
+
+/*
+ * Estimate space needed for mapping hashtable
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+Size
+BufTableShmemSize(int size)
+{
+ return hash_estimate_size(size, sizeof(BufferLookupEnt));
+}
+
+/*
+ * Initialize shmem hash table for mapping buffers
+ * size is the desired hash table size (possibly more than NBuffers)
+ */
+void
+InitBufTable(int size)
+{
+ HASHCTL info;
+
+ /* assume no locking is needed yet */
+
+ /* BufferTag maps to Buffer */
+ info.keysize = sizeof(BufferTag);
+ info.entrysize = sizeof(BufferLookupEnt);
+ info.num_partitions = NUM_BUFFER_PARTITIONS;
+
+ SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
+ size, size,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+}
+
+/*
+ * BufTableHashCode
+ * Compute the hash code associated with a BufferTag
+ *
+ * This must be passed to the lookup/insert/delete routines along with the
+ * tag. We do it like this because the callers need to know the hash code
+ * in order to determine which buffer partition to lock, and we don't want
+ * to do the hash computation twice (hash_any is a bit slow).
+ */
+uint32
+BufTableHashCode(BufferTag *tagPtr)
+{
+ return get_hash_value(SharedBufHash, (void *) tagPtr);
+}
+
+/*
+ * BufTableLookup
+ * Lookup the given BufferTag; return buffer ID, or -1 if not found
+ *
+ * Caller must hold at least share lock on BufMappingLock for tag's partition
+ */
+int
+BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
+{
+ BufferLookupEnt *result;
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ tagPtr,
+ hashcode,
+ HASH_FIND,
+ NULL);
+
+ if (!result)
+ return -1;
+
+ return result->id;
+}
+
+/*
+ * BufTableInsert
+ * Insert a hashtable entry for given tag and buffer ID,
+ * unless an entry already exists for that tag
+ *
+ * Returns -1 on successful insertion. If a conflicting entry exists
+ * already, returns the buffer ID in that entry.
+ *
+ * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ */
+int
+BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
+{
+ BufferLookupEnt *result;
+ bool found;
+
+ Assert(buf_id >= 0); /* -1 is reserved for not-in-table */
+ Assert(tagPtr->blockNum != P_NEW); /* invalid tag */
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ tagPtr,
+ hashcode,
+ HASH_ENTER,
+ &found);
+
+ if (found) /* found something already in the table */
+ return result->id;
+
+ result->id = buf_id;
+
+ return -1;
+}
+
+/*
+ * BufTableDelete
+ * Delete the hashtable entry for given tag (which must exist)
+ *
+ * Caller must hold exclusive lock on BufMappingLock for tag's partition
+ */
+void
+BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+{
+ BufferLookupEnt *result;
+
+ result = (BufferLookupEnt *)
+ hash_search_with_hash_value(SharedBufHash,
+ tagPtr,
+ hashcode,
+ HASH_REMOVE,
+ NULL);
+
+ if (!result) /* shouldn't happen */
+ elog(ERROR, "shared buffer hash table corrupted");
+}
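As the header comment notes, locking is the caller's responsibility. A
minimal sketch of the expected calling pattern for a lookup (essentially the
sequence PrefetchSharedBuffer() in bufmgr.c uses; rlocator, forkNum and
blockNum come from the caller):

    BufferTag   tag;
    uint32      hash;
    LWLock     *partitionLock;
    int         buf_id;

    InitBufferTag(&tag, &rlocator, forkNum, blockNum);
    hash = BufTableHashCode(&tag);              /* compute once, reuse below */
    partitionLock = BufMappingPartitionLock(hash);

    LWLockAcquire(partitionLock, LW_SHARED);    /* share lock suffices for lookup */
    buf_id = BufTableLookup(&tag, hash);
    LWLockRelease(partitionLock);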
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
new file mode 100644
index 0000000..e066a3f
--- /dev/null
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -0,0 +1,5594 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufmgr.c
+ * buffer manager interface routines
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/bufmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Principal entry points:
+ *
+ * ReadBuffer() -- find or create a buffer holding the requested page,
+ * and pin it so that no one can destroy it while this process
+ * is using it.
+ *
+ * ReleaseBuffer() -- unpin a buffer
+ *
+ * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty".
+ * The disk write is delayed until buffer replacement or checkpoint.
+ *
+ * See also these files:
+ * freelist.c -- chooses victim for buffer replacement
+ * buf_table.c -- manages the buffer lookup table
+ */
+#include "postgres.h"
+
+#include <sys/file.h>
+#include <unistd.h>
+
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "catalog/catalog.h"
+#include "catalog/storage.h"
+#include "catalog/storage_xlog.h"
+#include "executor/instrument.h"
+#include "lib/binaryheap.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/smgr.h"
+#include "storage/standby.h"
+#include "utils/memdebug.h"
+#include "utils/ps_status.h"
+#include "utils/rel.h"
+#include "utils/resowner_private.h"
+#include "utils/timestamp.h"
+
+
+/* Note: these two macros only work on shared buffers, not local ones! */
+#define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ))
+#define BufferGetLSN(bufHdr) (PageGetLSN(BufHdrGetBlock(bufHdr)))
+
+/* Note: this macro only works on local buffers, not shared ones! */
+#define LocalBufHdrGetBlock(bufHdr) \
+ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
+
+/* Bits in SyncOneBuffer's return value */
+#define BUF_WRITTEN 0x01
+#define BUF_REUSABLE 0x02
+
+#define RELS_BSEARCH_THRESHOLD 20
+
+/*
+ * This is the size (in number of blocks) above which we scan the
+ * entire buffer pool to remove the buffers for all the pages of the
+ * relation being dropped. For relations with size below this threshold,
+ * we find the buffers by doing lookups in the BufMapping table.
+ */
+#define BUF_DROP_FULL_SCAN_THRESHOLD (uint64) (NBuffers / 32)
+
+typedef struct PrivateRefCountEntry
+{
+ Buffer buffer;
+ int32 refcount;
+} PrivateRefCountEntry;
+
+/* 64 bytes, about the size of a cache line on common systems */
+#define REFCOUNT_ARRAY_ENTRIES 8
+
+/*
+ * Status of buffers to checkpoint for a particular tablespace, used
+ * internally in BufferSync.
+ */
+typedef struct CkptTsStatus
+{
+ /* oid of the tablespace */
+ Oid tsId;
+
+ /*
+ * Checkpoint progress for this tablespace. To make progress comparable
+ * between tablespaces the progress is, for each tablespace, measured as a
+ * number between 0 and the total number of to-be-checkpointed pages. Each
+ * page checkpointed in this tablespace increments this space's progress
+ * by progress_slice.
+ */
+ float8 progress;
+ float8 progress_slice;
+
+ /* number of to-be checkpointed pages in this tablespace */
+ int num_to_scan;
+ /* already processed pages in this tablespace */
+ int num_scanned;
+
+ /* current offset in CkptBufferIds for this tablespace */
+ int index;
+} CkptTsStatus;
+
+/*
+ * Type for array used to sort SMgrRelations
+ *
+ * FlushRelationsAllBuffers shares the same comparator function with
+ * DropRelationsAllBuffers. Pointer to this struct and RelFileLocator must be
+ * compatible.
+ */
+typedef struct SMgrSortArray
+{
+ RelFileLocator rlocator; /* This must be the first member */
+ SMgrRelation srel;
+} SMgrSortArray;
+
+/* GUC variables */
+bool zero_damaged_pages = false;
+int bgwriter_lru_maxpages = 100;
+double bgwriter_lru_multiplier = 2.0;
+bool track_io_timing = false;
+
+/*
+ * How many buffers PrefetchBuffer callers should try to stay ahead of their
+ * ReadBuffer calls by. Zero means "never prefetch". This value is only used
+ * for buffers not belonging to tablespaces that have their
+ * effective_io_concurrency parameter set.
+ */
+int effective_io_concurrency = DEFAULT_EFFECTIVE_IO_CONCURRENCY;
+
+/*
+ * Like effective_io_concurrency, but used by maintenance code paths that might
+ * benefit from a higher setting because they work on behalf of many sessions.
+ * Overridden by the tablespace setting of the same name.
+ */
+int maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
+
+/*
+ * GUC variables about triggering kernel writeback for buffers written; OS
+ * dependent defaults are set via the GUC mechanism.
+ */
+int checkpoint_flush_after = DEFAULT_CHECKPOINT_FLUSH_AFTER;
+int bgwriter_flush_after = DEFAULT_BGWRITER_FLUSH_AFTER;
+int backend_flush_after = DEFAULT_BACKEND_FLUSH_AFTER;
+
+/* local state for LockBufferForCleanup */
+static BufferDesc *PinCountWaitBuf = NULL;
+
+/*
+ * Backend-Private refcount management:
+ *
+ * Each buffer also has a private refcount that keeps track of the number of
+ * times the buffer is pinned in the current process. This is so that the
+ * shared refcount needs to be modified only once if a buffer is pinned more
+ * than once by an individual backend. It's also used to check that no buffers
+ * are still pinned at the end of transactions and when exiting.
+ *
+ *
+ * To avoid - as we used to - requiring an array with NBuffers entries to keep
+ * track of local buffers, we use a small sequentially searched array
+ * (PrivateRefCountArray) and an overflow hash table (PrivateRefCountHash) to
+ * keep track of backend local pins.
+ *
+ * Until no more than REFCOUNT_ARRAY_ENTRIES buffers are pinned at once, all
+ * refcounts are kept track of in the array; after that, new array entries
+ * displace old ones into the hash table. That way a frequently used entry
+ * can't get "stuck" in the hashtable while infrequent ones clog the array.
+ *
+ * Note that in most scenarios the number of pinned buffers will not exceed
+ * REFCOUNT_ARRAY_ENTRIES.
+ *
+ *
+ * To enter a buffer into the refcount tracking mechanism first reserve a free
+ * entry using ReservePrivateRefCountEntry() and then later, if necessary,
+ * fill it with NewPrivateRefCountEntry(). That split lets us avoid doing
+ * memory allocations in NewPrivateRefCountEntry() which can be important
+ * because in some scenarios it's called with a spinlock held...
+ */
+static struct PrivateRefCountEntry PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES];
+static HTAB *PrivateRefCountHash = NULL;
+static int32 PrivateRefCountOverflowed = 0;
+static uint32 PrivateRefCountClock = 0;
+static PrivateRefCountEntry *ReservedRefCountEntry = NULL;
+
+static void ReservePrivateRefCountEntry(void);
+static PrivateRefCountEntry *NewPrivateRefCountEntry(Buffer buffer);
+static PrivateRefCountEntry *GetPrivateRefCountEntry(Buffer buffer, bool do_move);
+static inline int32 GetPrivateRefCount(Buffer buffer);
+static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref);
+
+/*
+ * Ensure that the PrivateRefCountArray has sufficient space to store one more
+ * entry. This has to be called before using NewPrivateRefCountEntry() to fill
+ * a new entry - but it's perfectly fine to not use a reserved entry.
+ */
+static void
+ReservePrivateRefCountEntry(void)
+{
+ /* Already reserved (or freed), nothing to do */
+ if (ReservedRefCountEntry != NULL)
+ return;
+
+ /*
+ * First search for a free entry in the array; that'll be sufficient in the
+ * majority of cases.
+ */
+ {
+ int i;
+
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ PrivateRefCountEntry *res;
+
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer == InvalidBuffer)
+ {
+ ReservedRefCountEntry = res;
+ return;
+ }
+ }
+ }
+
+ /*
+ * No luck. All array entries are full. Move one array entry into the hash
+ * table.
+ */
+ {
+ /*
+ * Move entry from the current clock position in the array into the
+ * hashtable. Use that slot.
+ */
+ PrivateRefCountEntry *hashent;
+ bool found;
+
+ /* select victim slot */
+ ReservedRefCountEntry =
+ &PrivateRefCountArray[PrivateRefCountClock++ % REFCOUNT_ARRAY_ENTRIES];
+
+ /* Better be used, otherwise we shouldn't get here. */
+ Assert(ReservedRefCountEntry->buffer != InvalidBuffer);
+
+ /* enter victim array entry into hashtable */
+ hashent = hash_search(PrivateRefCountHash,
+ &(ReservedRefCountEntry->buffer),
+ HASH_ENTER,
+ &found);
+ Assert(!found);
+ hashent->refcount = ReservedRefCountEntry->refcount;
+
+ /* clear the now free array slot */
+ ReservedRefCountEntry->buffer = InvalidBuffer;
+ ReservedRefCountEntry->refcount = 0;
+
+ PrivateRefCountOverflowed++;
+ }
+}
+
+/*
+ * Fill a previously reserved refcount entry.
+ */
+static PrivateRefCountEntry *
+NewPrivateRefCountEntry(Buffer buffer)
+{
+ PrivateRefCountEntry *res;
+
+ /* only allowed to be called when a reservation has been made */
+ Assert(ReservedRefCountEntry != NULL);
+
+ /* use up the reserved entry */
+ res = ReservedRefCountEntry;
+ ReservedRefCountEntry = NULL;
+
+ /* and fill it */
+ res->buffer = buffer;
+ res->refcount = 0;
+
+ return res;
+}
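A hedged sketch of the reserve-then-fill pattern these two functions
implement (an assumed caller, loosely modeled on the pinning code later in
this file; buffer comes from the caller's context):

    PrivateRefCountEntry *ref;

    ReservePrivateRefCountEntry();          /* may search the array or spill an entry to the hash */

    /* ... later, at a point where allocating memory would be unsafe ... */
    ref = NewPrivateRefCountEntry(buffer);  /* just consumes the reserved slot */
    ref->refcount++;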
+
+/*
+ * Return the PrivateRefCount entry for the passed buffer.
+ *
+ * Returns NULL if a buffer doesn't have a refcount entry. Otherwise, if
+ * do_move is true and the entry resides in the hashtable, the entry is
+ * moved into the array to optimize it for frequent access.
+ */
+static PrivateRefCountEntry *
+GetPrivateRefCountEntry(Buffer buffer, bool do_move)
+{
+ PrivateRefCountEntry *res;
+ int i;
+
+ Assert(BufferIsValid(buffer));
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * First search for references in the array; that'll be sufficient in the
+ * majority of cases.
+ */
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer == buffer)
+ return res;
+ }
+
+ /*
+ * By here we know that the buffer, if already pinned, isn't residing in
+ * the array.
+ *
+ * Only look up the buffer in the hashtable if we've previously overflowed
+ * into it.
+ */
+ if (PrivateRefCountOverflowed == 0)
+ return NULL;
+
+ res = hash_search(PrivateRefCountHash, &buffer, HASH_FIND, NULL);
+
+ if (res == NULL)
+ return NULL;
+ else if (!do_move)
+ {
+ /* caller doesn't want us to move the hash entry into the array */
+ return res;
+ }
+ else
+ {
+ /* move buffer from hashtable into the free array slot */
+ bool found;
+ PrivateRefCountEntry *free;
+
+ /* Ensure there's a free array slot */
+ ReservePrivateRefCountEntry();
+
+ /* Use up the reserved slot */
+ Assert(ReservedRefCountEntry != NULL);
+ free = ReservedRefCountEntry;
+ ReservedRefCountEntry = NULL;
+ Assert(free->buffer == InvalidBuffer);
+
+ /* and fill it */
+ free->buffer = buffer;
+ free->refcount = res->refcount;
+
+ /* delete from hashtable */
+ hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
+ Assert(found);
+ Assert(PrivateRefCountOverflowed > 0);
+ PrivateRefCountOverflowed--;
+
+ return free;
+ }
+}
+
+/*
+ * Returns how many times the passed buffer is pinned by this backend.
+ *
+ * Only works for shared memory buffers!
+ */
+static inline int32
+GetPrivateRefCount(Buffer buffer)
+{
+ PrivateRefCountEntry *ref;
+
+ Assert(BufferIsValid(buffer));
+ Assert(!BufferIsLocal(buffer));
+
+ /*
+ * Not moving the entry - that's ok for the current users, but we might
+ * want to change this one day.
+ */
+ ref = GetPrivateRefCountEntry(buffer, false);
+
+ if (ref == NULL)
+ return 0;
+ return ref->refcount;
+}
+
+/*
+ * Release resources used to track the reference count of a buffer which we no
+ * longer have pinned and don't want to pin again immediately.
+ */
+static void
+ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref)
+{
+ Assert(ref->refcount == 0);
+
+ if (ref >= &PrivateRefCountArray[0] &&
+ ref < &PrivateRefCountArray[REFCOUNT_ARRAY_ENTRIES])
+ {
+ ref->buffer = InvalidBuffer;
+
+ /*
+ * Mark the just used entry as reserved - in many scenarios that
+ * allows us to avoid ever having to search the array/hash for free
+ * entries.
+ */
+ ReservedRefCountEntry = ref;
+ }
+ else
+ {
+ bool found;
+ Buffer buffer = ref->buffer;
+
+ hash_search(PrivateRefCountHash, &buffer, HASH_REMOVE, &found);
+ Assert(found);
+ Assert(PrivateRefCountOverflowed > 0);
+ PrivateRefCountOverflowed--;
+ }
+}
+
+/*
+ * BufferIsPinned
+ * True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ * NOTE: what we check here is that *this* backend holds a pin on
+ * the buffer. We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
+( \
+ !BufferIsValid(bufnum) ? \
+ false \
+ : \
+ BufferIsLocal(bufnum) ? \
+ (LocalRefCount[-(bufnum) - 1] > 0) \
+ : \
+ (GetPrivateRefCount(bufnum) > 0) \
+)
+
+
+static Buffer ReadBuffer_common(SMgrRelation smgr, char relpersistence,
+ ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy,
+ bool *hit);
+static BlockNumber ExtendBufferedRelCommon(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ uint32 extend_by,
+ BlockNumber extend_upto,
+ Buffer *buffers,
+ uint32 *extended_by);
+static BlockNumber ExtendBufferedRelShared(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ uint32 extend_by,
+ BlockNumber extend_upto,
+ Buffer *buffers,
+ uint32 *extended_by);
+static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy);
+static void PinBuffer_Locked(BufferDesc *buf);
+static void UnpinBuffer(BufferDesc *buf);
+static void BufferSync(int flags);
+static uint32 WaitBufHdrUnlocked(BufferDesc *buf);
+static int SyncOneBuffer(int buf_id, bool skip_recently_used,
+ WritebackContext *wb_context);
+static void WaitIO(BufferDesc *buf);
+static bool StartBufferIO(BufferDesc *buf, bool forInput);
+static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty,
+ uint32 set_flag_bits);
+static void shared_buffer_write_error_callback(void *arg);
+static void local_buffer_write_error_callback(void *arg);
+static BufferDesc *BufferAlloc(SMgrRelation smgr,
+ char relpersistence,
+ ForkNumber forkNum,
+ BlockNumber blockNum,
+ BufferAccessStrategy strategy,
+ bool *foundPtr, IOContext io_context);
+static Buffer GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context);
+static void FlushBuffer(BufferDesc *buf, SMgrRelation reln,
+ IOObject io_object, IOContext io_context);
+static void FindAndDropRelationBuffers(RelFileLocator rlocator,
+ ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock);
+static void RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
+ RelFileLocator dstlocator,
+ ForkNumber forkNum, bool permanent);
+static void AtProcExit_Buffers(int code, Datum arg);
+static void CheckForBufferLeaks(void);
+static int rlocator_comparator(const void *p1, const void *p2);
+static inline int buffertag_comparator(const BufferTag *ba, const BufferTag *bb);
+static inline int ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b);
+static int ts_ckpt_progress_comparator(Datum a, Datum b, void *arg);
+
+
+/*
+ * Implementation of PrefetchBuffer() for shared buffers.
+ */
+PrefetchBufferResult
+PrefetchSharedBuffer(SMgrRelation smgr_reln,
+ ForkNumber forkNum,
+ BlockNumber blockNum)
+{
+ PrefetchBufferResult result = {InvalidBuffer, false};
+ BufferTag newTag; /* identity of requested block */
+ uint32 newHash; /* hash value for newTag */
+ LWLock *newPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+
+ Assert(BlockNumberIsValid(blockNum));
+
+ /* create a tag so we can lookup the buffer */
+ InitBufferTag(&newTag, &smgr_reln->smgr_rlocator.locator,
+ forkNum, blockNum);
+
+ /* determine its hash code and partition lock ID */
+ newHash = BufTableHashCode(&newTag);
+ newPartitionLock = BufMappingPartitionLock(newHash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(newPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&newTag, newHash);
+ LWLockRelease(newPartitionLock);
+
+ /* If not in buffers, initiate prefetch */
+ if (buf_id < 0)
+ {
+#ifdef USE_PREFETCH
+ /*
+ * Try to initiate an asynchronous read. This returns false in
+ * recovery if the relation file doesn't exist.
+ */
+ if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+ smgrprefetch(smgr_reln, forkNum, blockNum))
+ {
+ result.initiated_io = true;
+ }
+#endif /* USE_PREFETCH */
+ }
+ else
+ {
+ /*
+ * Report the buffer it was in at that time. The caller may be able
+ * to avoid a buffer table lookup, but it's not pinned and it must be
+ * rechecked!
+ */
+ result.recent_buffer = buf_id + 1;
+ }
+
+ /*
+ * If the block *is* in buffers, we do nothing. This is not really ideal:
+ * the block might be just about to be evicted, which would be stupid
+ * since we know we are going to need it soon. But the only easy answer
+ * is to bump the usage_count, which does not seem like a great solution:
+ * when the caller does ultimately touch the block, usage_count would get
+ * bumped again, resulting in too much favoritism for blocks that are
+ * involved in a prefetch sequence. A real fix would involve some
+ * additional per-buffer state, and it's not clear that there's enough of
+ * a problem to justify that.
+ */
+
+ return result;
+}
+
+/*
+ * PrefetchBuffer -- initiate asynchronous read of a block of a relation
+ *
+ * This is named by analogy to ReadBuffer but doesn't actually allocate a
+ * buffer. Instead it tries to ensure that a future ReadBuffer for the given
+ * block will not be delayed by the I/O. Prefetching is optional.
+ *
+ * There are three possible outcomes:
+ *
+ * 1. If the block is already cached, the result includes a valid buffer that
+ * could be used by the caller to avoid the need for a later buffer lookup, but
+ * it's not pinned, so the caller must recheck it.
+ *
+ * 2. If the kernel has been asked to initiate I/O, the initiated_io member is
+ * true. Currently there is no way to know if the data was already cached by
+ * the kernel and therefore didn't really initiate I/O, and no way to know when
+ * the I/O completes other than using synchronous ReadBuffer().
+ *
+ * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
+ * USE_PREFETCH is not defined (this build doesn't support prefetching due to
+ * lack of a kernel facility), direct I/O is enabled, or the underlying
+ * relation file wasn't found and we are in recovery. (If the relation file
+ * wasn't found and we are not in recovery, an error is raised).
+ */
+PrefetchBufferResult
+PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
+{
+ Assert(RelationIsValid(reln));
+ Assert(BlockNumberIsValid(blockNum));
+
+ if (RelationUsesLocalBuffers(reln))
+ {
+ /* see comments in ReadBufferExtended */
+ if (RELATION_IS_OTHER_TEMP(reln))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ /* pass it off to localbuf.c */
+ return PrefetchLocalBuffer(RelationGetSmgr(reln), forkNum, blockNum);
+ }
+ else
+ {
+ /* pass it to the shared buffer version */
+ return PrefetchSharedBuffer(RelationGetSmgr(reln), forkNum, blockNum);
+ }
+}
+
+/*
+ * ReadRecentBuffer -- try to pin a block in a recently observed buffer
+ *
+ * Compared to ReadBuffer(), this avoids a buffer mapping lookup when it's
+ * successful. Return true if the buffer is valid and still has the expected
+ * tag. In that case, the buffer is pinned and the usage count is bumped.
+ */
+bool
+ReadRecentBuffer(RelFileLocator rlocator, ForkNumber forkNum, BlockNumber blockNum,
+ Buffer recent_buffer)
+{
+ BufferDesc *bufHdr;
+ BufferTag tag;
+ uint32 buf_state;
+ bool have_private_ref;
+
+ Assert(BufferIsValid(recent_buffer));
+
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ ReservePrivateRefCountEntry();
+ InitBufferTag(&tag, &rlocator, forkNum, blockNum);
+
+ if (BufferIsLocal(recent_buffer))
+ {
+ int b = -recent_buffer - 1;
+
+ bufHdr = GetLocalBufferDescriptor(b);
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ /* Is it still valid and holding the right tag? */
+ if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
+ {
+ PinLocalBuffer(bufHdr, true);
+
+ pgBufferUsage.local_blks_hit++;
+
+ return true;
+ }
+ }
+ else
+ {
+ bufHdr = GetBufferDescriptor(recent_buffer - 1);
+ have_private_ref = GetPrivateRefCount(recent_buffer) > 0;
+
+ /*
+ * Do we already have this buffer pinned with a private reference? If
+ * so, it must be valid and it is safe to check the tag without
+ * locking. If not, we have to lock the header first and then check.
+ */
+ if (have_private_ref)
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+ else
+ buf_state = LockBufHdr(bufHdr);
+
+ if ((buf_state & BM_VALID) && BufferTagsEqual(&tag, &bufHdr->tag))
+ {
+ /*
+ * It's now safe to pin the buffer. We can't pin first and ask
+ * questions later, because it might confuse code paths like
+ * InvalidateBuffer() if we pinned a random non-matching buffer.
+ */
+ if (have_private_ref)
+ PinBuffer(bufHdr, NULL); /* bump pin count */
+ else
+ PinBuffer_Locked(bufHdr); /* pin for first time */
+
+ pgBufferUsage.shared_blks_hit++;
+
+ return true;
+ }
+
+ /* If we locked the header above, now unlock. */
+ if (!have_private_ref)
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ return false;
+}
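Taken together with PrefetchBuffer()'s recent_buffer result above,
ReadRecentBuffer() lets a caller skip the mapping-table lookup when the block
is still where it was last seen. A sketch of that assumed caller pattern
(rel->rd_locator is the relation's RelFileLocator; blkno comes from the
caller):

    PrefetchBufferResult pre;
    Buffer      buf;

    pre = PrefetchBuffer(rel, MAIN_FORKNUM, blkno);
    /* ... do other useful work while the kernel (perhaps) reads the block ... */

    if (BufferIsValid(pre.recent_buffer) &&
        ReadRecentBuffer(rel->rd_locator, MAIN_FORKNUM, blkno,
                         pre.recent_buffer))
        buf = pre.recent_buffer;        /* still valid: pinned, lookup skipped */
    else
        buf = ReadBuffer(rel, blkno);   /* fall back to the normal path */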
+
+/*
+ * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
+ * fork with RBM_NORMAL mode and default strategy.
+ */
+Buffer
+ReadBuffer(Relation reln, BlockNumber blockNum)
+{
+ return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
+}
+
+/*
+ * ReadBufferExtended -- returns a buffer containing the requested
+ * block of the requested relation. If the blknum
+ * requested is P_NEW, extend the relation file and
+ * allocate a new block. (Caller is responsible for
+ * ensuring that only one backend tries to extend a
+ * relation at the same time!)
+ *
+ * Returns: the buffer number for the buffer containing
+ * the block read. The returned buffer has been pinned.
+ * Does not return on error --- elog's instead.
+ *
+ * Assume that when this function is called, reln has already been opened.
+ *
+ * In RBM_NORMAL mode, the page is read from disk, and the page header is
+ * validated. An error is thrown if the page header is not valid. (But
+ * note that an all-zero page is considered "valid"; see
+ * PageIsVerifiedExtended().)
+ *
+ * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
+ * valid, the page is zeroed instead of throwing an error. This is intended
+ * for non-critical data, where the caller is prepared to repair errors.
+ *
+ * In RBM_ZERO_AND_LOCK mode, if the page isn't in buffer cache already, it's
+ * filled with zeros instead of reading it from disk. Useful when the caller
+ * is going to fill the page from scratch, since this saves I/O and avoids
+ * unnecessary failure if the page-on-disk has corrupt page headers.
+ * The page is returned locked to ensure that the caller has a chance to
+ * initialize the page before it's made visible to others.
+ * Caution: do not use this mode to read a page that is beyond the relation's
+ * current physical EOF; that is likely to cause problems in md.c when
+ * the page is modified and written out. P_NEW is OK, though.
+ *
+ * RBM_ZERO_AND_CLEANUP_LOCK is the same as RBM_ZERO_AND_LOCK, but acquires
+ * a cleanup-strength lock on the page.
+ *
+ * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
+ *
+ * If strategy is not NULL, a nondefault buffer access strategy is used.
+ * See buffer/README for details.
+ */
+Buffer
+ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
+ ReadBufferMode mode, BufferAccessStrategy strategy)
+{
+ bool hit;
+ Buffer buf;
+
+ /*
+ * Reject attempts to read non-local temporary relations; we would be
+ * likely to get wrong data since we have no visibility into the owning
+ * session's local buffers.
+ */
+ if (RELATION_IS_OTHER_TEMP(reln))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot access temporary tables of other sessions")));
+
+ /*
+ * Read the buffer, and update pgstat counters to reflect a cache hit or
+ * miss.
+ */
+ pgstat_count_buffer_read(reln);
+ buf = ReadBuffer_common(RelationGetSmgr(reln), reln->rd_rel->relpersistence,
+ forkNum, blockNum, mode, strategy, &hit);
+ if (hit)
+ pgstat_count_buffer_hit(reln);
+ return buf;
+}
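As an illustration of RBM_ZERO_AND_LOCK described above, a caller that
intends to build a page from scratch might do the following (a sketch that
assumes any required WAL logging is handled elsewhere; not code from this
patch):

    Buffer      buf;
    Page        page;

    buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
                             RBM_ZERO_AND_LOCK, NULL);
    page = BufferGetPage(buf);
    PageInit(page, BufferGetPageSize(buf), 0);  /* fill in the new contents */
    MarkBufferDirty(buf);
    UnlockReleaseBuffer(buf);                   /* the buffer came back locked */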
+
+
+/*
+ * ReadBufferWithoutRelcache -- like ReadBufferExtended, but doesn't require
+ * a relcache entry for the relation.
+ *
+ * Pass permanent = true for a RELPERSISTENCE_PERMANENT relation, and
+ * permanent = false for a RELPERSISTENCE_UNLOGGED relation. This function
+ * cannot be used for temporary relations (and making that work might be
+ * difficult, unless we only want to read temporary relations for our own
+ * BackendId).
+ */
+Buffer
+ReadBufferWithoutRelcache(RelFileLocator rlocator, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool permanent)
+{
+ bool hit;
+
+ SMgrRelation smgr = smgropen(rlocator, InvalidBackendId);
+
+ return ReadBuffer_common(smgr, permanent ? RELPERSISTENCE_PERMANENT :
+ RELPERSISTENCE_UNLOGGED, forkNum, blockNum,
+ mode, strategy, &hit);
+}
+
+/*
+ * Convenience wrapper around ExtendBufferedRelBy() extending by one block.
+ */
+Buffer
+ExtendBufferedRel(BufferManagerRelation bmr,
+ ForkNumber forkNum,
+ BufferAccessStrategy strategy,
+ uint32 flags)
+{
+ Buffer buf;
+ uint32 extend_by = 1;
+
+ ExtendBufferedRelBy(bmr, forkNum, strategy, flags, extend_by,
+ &buf, &extend_by);
+
+ return buf;
+}
+
+/*
+ * Extend relation by multiple blocks.
+ *
+ * Tries to extend the relation by extend_by blocks. Depending on the
+ * availability of resources the relation may end up being extended by a
+ * smaller number of pages (unless an error is thrown, always by at least one
+ * page). *extended_by is updated to the number of pages the relation has
+ * actually been extended by.
+ *
+ * buffers needs to be an array that is at least extend_by long. Upon
+ * completion, the first *extended_by array elements will point to a pinned
+ * buffer.
+ *
+ * If EB_LOCK_FIRST is part of flags, the first returned buffer is
+ * locked. This is useful for callers that want a buffer that is guaranteed to
+ * be empty.
+ */
+BlockNumber
+ExtendBufferedRelBy(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ uint32 extend_by,
+ Buffer *buffers,
+ uint32 *extended_by)
+{
+ Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
+ Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
+ Assert(extend_by > 0);
+
+ if (bmr.smgr == NULL)
+ {
+ bmr.smgr = RelationGetSmgr(bmr.rel);
+ bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
+ }
+
+ return ExtendBufferedRelCommon(bmr, fork, strategy, flags,
+ extend_by, InvalidBlockNumber,
+ buffers, extended_by);
+}
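+
+/*
+ * Illustrative usage sketch: extend a relation by a small batch of blocks
+ * and keep only the first, locked, buffer. BMR_REL() is assumed to be the
+ * relation-based BufferManagerRelation initializer from bufmgr.h; "rel" is
+ * assumed to be supplied by the caller.
+ *
+ *    Buffer      buffers[4];
+ *    uint32      extended_by = 0;
+ *    BlockNumber first_block;
+ *
+ *    first_block = ExtendBufferedRelBy(BMR_REL(rel), MAIN_FORKNUM, NULL,
+ *                                      EB_LOCK_FIRST, lengthof(buffers),
+ *                                      buffers, &extended_by);
+ *
+ *    buffers[0] is pinned and exclusively locked; the remaining
+ *    extended_by - 1 buffers are only pinned and can be released with
+ *    ReleaseBuffer() once they are no longer needed.
+ */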
+
+/*
+ * Extend the relation so it is at least extend_to blocks large, return buffer
+ * (extend_to - 1).
+ *
+ * This is useful for callers that want to write a specific page, regardless
+ * of the current size of the relation (e.g. useful for visibilitymap and for
+ * crash recovery).
+ */
+Buffer
+ExtendBufferedRelTo(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ BlockNumber extend_to,
+ ReadBufferMode mode)
+{
+ BlockNumber current_size;
+ uint32 extended_by = 0;
+ Buffer buffer = InvalidBuffer;
+ Buffer buffers[64];
+
+ Assert((bmr.rel != NULL) != (bmr.smgr != NULL));
+ Assert(bmr.smgr == NULL || bmr.relpersistence != 0);
+ Assert(extend_to != InvalidBlockNumber && extend_to > 0);
+
+ if (bmr.smgr == NULL)
+ {
+ bmr.smgr = RelationGetSmgr(bmr.rel);
+ bmr.relpersistence = bmr.rel->rd_rel->relpersistence;
+ }
+
+ /*
+ * If desired, create the file if it doesn't exist. If
+ * smgr_cached_nblocks[fork] is positive then it must exist, no need for
+ * an smgrexists call.
+ */
+ if ((flags & EB_CREATE_FORK_IF_NEEDED) &&
+ (bmr.smgr->smgr_cached_nblocks[fork] == 0 ||
+ bmr.smgr->smgr_cached_nblocks[fork] == InvalidBlockNumber) &&
+ !smgrexists(bmr.smgr, fork))
+ {
+ LockRelationForExtension(bmr.rel, ExclusiveLock);
+
+ /* could have been closed while waiting for lock */
+ if (bmr.rel)
+ bmr.smgr = RelationGetSmgr(bmr.rel);
+
+ /* recheck, fork might have been created concurrently */
+ if (!smgrexists(bmr.smgr, fork))
+ smgrcreate(bmr.smgr, fork, flags & EB_PERFORMING_RECOVERY);
+
+ UnlockRelationForExtension(bmr.rel, ExclusiveLock);
+ }
+
+ /*
+ * If requested, invalidate size cache, so that smgrnblocks asks the
+ * kernel.
+ */
+ if (flags & EB_CLEAR_SIZE_CACHE)
+ bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
+
+ /*
+ * Get the current size so we can estimate how many pages we'll need to
+ * extend by; this avoids acquiring unnecessarily many victim buffers.
+ */
+ current_size = smgrnblocks(bmr.smgr, fork);
+
+ /*
+ * Since no-one else can be looking at the page contents yet, there is no
+ * difference between an exclusive lock and a cleanup-strength lock. Note
+ * that we pass the original mode to ReadBuffer_common() below, for the
+ * case where we fall back to reading the buffer because of a concurrent
+ * relation extension.
+ */
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ flags |= EB_LOCK_TARGET;
+
+ while (current_size < extend_to)
+ {
+ uint32 num_pages = lengthof(buffers);
+ BlockNumber first_block;
+
+ if ((uint64) current_size + num_pages > extend_to)
+ num_pages = extend_to - current_size;
+
+ first_block = ExtendBufferedRelCommon(bmr, fork, strategy, flags,
+ num_pages, extend_to,
+ buffers, &extended_by);
+
+ current_size = first_block + extended_by;
+ Assert(num_pages != 0 || current_size >= extend_to);
+
+ for (int i = 0; i < extended_by; i++)
+ {
+ if (first_block + i != extend_to - 1)
+ ReleaseBuffer(buffers[i]);
+ else
+ buffer = buffers[i];
+ }
+ }
+
+ /*
+ * It's possible that another backend concurrently extended the relation.
+ * In that case read the buffer.
+ *
+ * XXX: Should we control this via a flag?
+ */
+ if (buffer == InvalidBuffer)
+ {
+ bool hit;
+
+ Assert(extended_by == 0);
+ buffer = ReadBuffer_common(bmr.smgr, bmr.relpersistence,
+ fork, extend_to - 1, mode, strategy,
+ &hit);
+ }
+
+ return buffer;
+}
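+
+/*
+ * Illustrative usage sketch: make sure a specific block exists, creating the
+ * fork if necessary, roughly the way the visibility map grows its fork. The
+ * flag and mode values are the ones documented above; "rel" and
+ * "target_block" are assumed to come from the caller, and BMR_REL() and the
+ * exact flag combination are assumptions rather than prescriptions.
+ *
+ *    Buffer      buf;
+ *
+ *    buf = ExtendBufferedRelTo(BMR_REL(rel), VISIBILITYMAP_FORKNUM, NULL,
+ *                              EB_CREATE_FORK_IF_NEEDED | EB_CLEAR_SIZE_CACHE,
+ *                              target_block + 1, RBM_ZERO_ON_ERROR);
+ *
+ * The returned buffer is pinned and contains block target_block.
+ */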
+
+/*
+ * ReadBuffer_common -- common logic for all ReadBuffer variants
+ *
+ * *hit is set to true if the request was satisfied from shared buffer cache.
+ */
+static Buffer
+ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum, ReadBufferMode mode,
+ BufferAccessStrategy strategy, bool *hit)
+{
+ BufferDesc *bufHdr;
+ Block bufBlock;
+ bool found;
+ IOContext io_context;
+ IOObject io_object;
+ bool isLocalBuf = SmgrIsTemp(smgr);
+
+ *hit = false;
+
+ /*
+ * Backward compatibility path; most code should use ExtendBufferedRel()
+ * instead, as acquiring the extension lock inside ExtendBufferedRel()
+ * scales a lot better.
+ */
+ if (unlikely(blockNum == P_NEW))
+ {
+ uint32 flags = EB_SKIP_EXTENSION_LOCK;
+
+ /*
+ * Since no-one else can be looking at the page contents yet, there is
+ * no difference between an exclusive lock and a cleanup-strength
+ * lock.
+ */
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ flags |= EB_LOCK_FIRST;
+
+ return ExtendBufferedRel(BMR_SMGR(smgr, relpersistence),
+ forkNum, strategy, flags);
+ }
+
+ /* Make sure we will have room to remember the buffer pin */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
+ smgr->smgr_rlocator.locator.spcOid,
+ smgr->smgr_rlocator.locator.dbOid,
+ smgr->smgr_rlocator.locator.relNumber,
+ smgr->smgr_rlocator.backend);
+
+ if (isLocalBuf)
+ {
+ /*
+ * We do not use a BufferAccessStrategy for I/O of temporary tables.
+ * However, in some cases, the "strategy" may not be NULL, so we can't
+ * rely on IOContextForStrategy() to set the right IOContext for us.
+ * This may happen in cases like CREATE TEMPORARY TABLE AS...
+ */
+ io_context = IOCONTEXT_NORMAL;
+ io_object = IOOBJECT_TEMP_RELATION;
+ bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);
+ if (found)
+ pgBufferUsage.local_blks_hit++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.local_blks_read++;
+ }
+ else
+ {
+ /*
+ * Look up the buffer. IO_IN_PROGRESS is set if the requested block is
+ * not currently in memory.
+ */
+ io_context = IOContextForStrategy(strategy);
+ io_object = IOOBJECT_RELATION;
+ bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
+ strategy, &found, io_context);
+ if (found)
+ pgBufferUsage.shared_blks_hit++;
+ else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
+ mode == RBM_ZERO_ON_ERROR)
+ pgBufferUsage.shared_blks_read++;
+ }
+
+ /* At this point we do NOT hold any locks. */
+
+ /* if it was already in the buffer pool, we're done */
+ if (found)
+ {
+ /* Just need to update stats before we exit */
+ *hit = true;
+ VacuumPageHit++;
+ pgstat_count_io_op(io_object, io_context, IOOP_HIT);
+
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageHit;
+
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+ smgr->smgr_rlocator.locator.spcOid,
+ smgr->smgr_rlocator.locator.dbOid,
+ smgr->smgr_rlocator.locator.relNumber,
+ smgr->smgr_rlocator.backend,
+ found);
+
+ /*
+ * In RBM_ZERO_AND_LOCK mode the caller expects the page to be locked
+ * on return.
+ */
+ if (!isLocalBuf)
+ {
+ if (mode == RBM_ZERO_AND_LOCK)
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE);
+ else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
+ }
+
+ return BufferDescriptorGetBuffer(bufHdr);
+ }
+
+ /*
+ * if we have gotten to this point, we have allocated a buffer for the
+ * page but its contents are not yet valid. IO_IN_PROGRESS is set for it,
+ * if it's a shared buffer.
+ */
+ Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID)); /* spinlock not needed */
+
+ bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
+
+ /*
+ * Read in the page, unless the caller intends to overwrite it and just
+ * wants us to allocate a buffer.
+ */
+ if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ else
+ {
+ instr_time io_start = pgstat_prepare_io_time();
+
+ smgrread(smgr, forkNum, blockNum, bufBlock);
+
+ pgstat_count_io_op_time(io_object, io_context,
+ IOOP_READ, io_start, 1);
+
+ /* check for garbage data */
+ if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
+ PIV_LOG_WARNING | PIV_REPORT_STAT))
+ {
+ if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
+ {
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s; zeroing out page",
+ blockNum,
+ relpath(smgr->smgr_rlocator, forkNum))));
+ MemSet((char *) bufBlock, 0, BLCKSZ);
+ }
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("invalid page in block %u of relation %s",
+ blockNum,
+ relpath(smgr->smgr_rlocator, forkNum))));
+ }
+ }
+
+ /*
+ * In RBM_ZERO_AND_LOCK / RBM_ZERO_AND_CLEANUP_LOCK mode, grab the buffer
+ * content lock before marking the page as valid, to make sure that no
+ * other backend sees the zeroed page before the caller has had a chance
+ * to initialize it.
+ *
+ * Since no-one else can be looking at the page contents yet, there is no
+ * difference between an exclusive lock and a cleanup-strength lock. (Note
+ * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
+ * they assert that the buffer is already valid.)
+ */
+ if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
+ !isLocalBuf)
+ {
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
+ }
+
+ if (isLocalBuf)
+ {
+ /* Only need to adjust flags */
+ uint32 buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ buf_state |= BM_VALID;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ else
+ {
+ /* Set BM_VALID, terminate IO, and wake up any waiters */
+ TerminateBufferIO(bufHdr, false, BM_VALID);
+ }
+
+ VacuumPageMiss++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageMiss;
+
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
+ smgr->smgr_rlocator.locator.spcOid,
+ smgr->smgr_rlocator.locator.dbOid,
+ smgr->smgr_rlocator.locator.relNumber,
+ smgr->smgr_rlocator.backend,
+ found);
+
+ return BufferDescriptorGetBuffer(bufHdr);
+}
+
+/*
+ * BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
+ * buffer. If no buffer exists already, selects a replacement
+ * victim and evicts the old page, but does NOT read in new page.
+ *
+ * "strategy" can be a buffer replacement strategy object, or NULL for
+ * the default strategy. The selected buffer's usage_count is advanced when
+ * using the default strategy, but otherwise possibly not (see PinBuffer).
+ *
+ * The returned buffer is pinned and is already marked as holding the
+ * desired page. If it already did have the desired page, *foundPtr is
+ * set true. Otherwise, *foundPtr is set false and the buffer is marked
+ * as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
+ *
+ * *foundPtr is actually redundant with the buffer's BM_VALID flag, but
+ * we keep it for simplicity in ReadBuffer.
+ *
+ * io_context is the IO context in which I/O needed to obtain a victim
+ * buffer, if any, will be counted. The caller determines it via
+ * IOContextForStrategy(), so it need not be recomputed here.
+ *
+ * No locks are held either at entry or exit.
+ */
+static BufferDesc *
+BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
+ BlockNumber blockNum,
+ BufferAccessStrategy strategy,
+ bool *foundPtr, IOContext io_context)
+{
+ BufferTag newTag; /* identity of requested block */
+ uint32 newHash; /* hash value for newTag */
+ LWLock *newPartitionLock; /* buffer partition lock for it */
+ int existing_buf_id;
+ Buffer victim_buffer;
+ BufferDesc *victim_buf_hdr;
+ uint32 victim_buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
+
+ /* determine its hash code and partition lock ID */
+ newHash = BufTableHashCode(&newTag);
+ newPartitionLock = BufMappingPartitionLock(newHash);
+
+ /* see if the block is in the buffer pool already */
+ LWLockAcquire(newPartitionLock, LW_SHARED);
+ existing_buf_id = BufTableLookup(&newTag, newHash);
+ if (existing_buf_id >= 0)
+ {
+ BufferDesc *buf;
+ bool valid;
+
+ /*
+ * Found it. Now, pin the buffer so no one can steal it from the
+ * buffer pool, and check to see if the correct data has been loaded
+ * into the buffer.
+ */
+ buf = GetBufferDescriptor(existing_buf_id);
+
+ valid = PinBuffer(buf, strategy);
+
+ /* Can release the mapping lock as soon as we've pinned it */
+ LWLockRelease(newPartitionLock);
+
+ *foundPtr = true;
+
+ if (!valid)
+ {
+ /*
+ * We can only get here if (a) someone else is still reading in
+ * the page, or (b) a previous read attempt failed. We have to
+ * wait for any active read attempt to finish, and then set up our
+ * own read attempt if the page is still not BM_VALID.
+ * StartBufferIO does it all.
+ */
+ if (StartBufferIO(buf, true))
+ {
+ /*
+ * If we get here, previous attempts to read the buffer must
+ * have failed ... but we shall bravely try again.
+ */
+ *foundPtr = false;
+ }
+ }
+
+ return buf;
+ }
+
+ /*
+ * Didn't find it in the buffer pool. We'll have to initialize a new
+ * buffer. Remember to unlock the mapping lock while doing the work.
+ */
+ LWLockRelease(newPartitionLock);
+
+ /*
+ * Acquire a victim buffer. Somebody else might try to do the same, as we
+ * don't hold any conflicting locks. If so, we'll have to undo our work
+ * later.
+ */
+ victim_buffer = GetVictimBuffer(strategy, io_context);
+ victim_buf_hdr = GetBufferDescriptor(victim_buffer - 1);
+
+ /*
+ * Try to make a hashtable entry for the buffer under its new tag. If
+ * somebody else inserted another buffer for the tag, we'll release the
+ * victim buffer we acquired and use the already inserted one.
+ */
+ LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
+ existing_buf_id = BufTableInsert(&newTag, newHash, victim_buf_hdr->buf_id);
+ if (existing_buf_id >= 0)
+ {
+ BufferDesc *existing_buf_hdr;
+ bool valid;
+
+ /*
+ * Got a collision. Someone has already done what we were about to do.
+ * We'll just handle this as if it were found in the buffer pool in
+ * the first place. First, give up the buffer we were planning to
+ * use.
+ *
+ * We could do this after releasing the partition lock, but then we'd
+ * have to call ResourceOwnerEnlargeBuffers() &
+ * ReservePrivateRefCountEntry() before acquiring the lock, for the
+ * rare case of such a collision.
+ */
+ UnpinBuffer(victim_buf_hdr);
+
+ /*
+ * The victim buffer we acquired previously is clean and unused, let
+ * it be found again quickly
+ */
+ StrategyFreeBuffer(victim_buf_hdr);
+
+ /* remaining code should match code at top of routine */
+
+ existing_buf_hdr = GetBufferDescriptor(existing_buf_id);
+
+ valid = PinBuffer(existing_buf_hdr, strategy);
+
+ /* Can release the mapping lock as soon as we've pinned it */
+ LWLockRelease(newPartitionLock);
+
+ *foundPtr = true;
+
+ if (!valid)
+ {
+ /*
+ * We can only get here if (a) someone else is still reading in
+ * the page, or (b) a previous read attempt failed. We have to
+ * wait for any active read attempt to finish, and then set up our
+ * own read attempt if the page is still not BM_VALID.
+ * StartBufferIO does it all.
+ */
+ if (StartBufferIO(existing_buf_hdr, true))
+ {
+ /*
+ * If we get here, previous attempts to read the buffer must
+ * have failed ... but we shall bravely try again.
+ */
+ *foundPtr = false;
+ }
+ }
+
+ return existing_buf_hdr;
+ }
+
+ /*
+ * Need to lock the buffer header too in order to change its tag.
+ */
+ victim_buf_state = LockBufHdr(victim_buf_hdr);
+
+ /* some sanity checks while we hold the buffer header lock */
+ Assert(BUF_STATE_GET_REFCOUNT(victim_buf_state) == 1);
+ Assert(!(victim_buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY | BM_IO_IN_PROGRESS)));
+
+ victim_buf_hdr->tag = newTag;
+
+ /*
+ * Make sure BM_PERMANENT is set for buffers that must be written at every
+ * checkpoint. Unlogged buffers only need to be written at shutdown
+ * checkpoints, except for their "init" forks, which need to be treated
+ * just like permanent relations.
+ */
+ victim_buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+ if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
+ victim_buf_state |= BM_PERMANENT;
+
+ UnlockBufHdr(victim_buf_hdr, victim_buf_state);
+
+ LWLockRelease(newPartitionLock);
+
+ /*
+ * Buffer contents are currently invalid. Try to obtain the right to
+ * start I/O. If StartBufferIO returns false, then someone else managed
+ * to read it before we did, so there's nothing left for BufferAlloc() to
+ * do.
+ */
+ if (StartBufferIO(victim_buf_hdr, true))
+ *foundPtr = false;
+ else
+ *foundPtr = true;
+
+ return victim_buf_hdr;
+}
+
+/*
+ * InvalidateBuffer -- mark a shared buffer invalid and return it to the
+ * freelist.
+ *
+ * The buffer header spinlock must be held at entry. We drop it before
+ * returning. (This is sane because the caller must have locked the
+ * buffer in order to be sure it should be dropped.)
+ *
+ * This is used only in contexts such as dropping a relation. We assume
+ * that no other backend could possibly be interested in using the page,
+ * so the only reason the buffer might be pinned is if someone else is
+ * trying to write it out. We have to let them finish before we can
+ * reclaim the buffer.
+ *
+ * The buffer could get reclaimed by someone else while we are waiting
+ * to acquire the necessary locks; if so, don't mess it up.
+ */
+static void
+InvalidateBuffer(BufferDesc *buf)
+{
+ BufferTag oldTag;
+ uint32 oldHash; /* hash value for oldTag */
+ LWLock *oldPartitionLock; /* buffer partition lock for it */
+ uint32 oldFlags;
+ uint32 buf_state;
+
+ /* Save the original buffer tag before dropping the spinlock */
+ oldTag = buf->tag;
+
+ buf_state = pg_atomic_read_u32(&buf->state);
+ Assert(buf_state & BM_LOCKED);
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Need to compute the old tag's hashcode and partition lock ID. XXX is it
+ * worth storing the hashcode in BufferDesc so we need not recompute it
+ * here? Probably not.
+ */
+ oldHash = BufTableHashCode(&oldTag);
+ oldPartitionLock = BufMappingPartitionLock(oldHash);
+
+retry:
+
+ /*
+ * Acquire exclusive mapping lock in preparation for changing the buffer's
+ * association.
+ */
+ LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
+
+ /* Re-lock the buffer header */
+ buf_state = LockBufHdr(buf);
+
+ /* If it's changed while we were waiting for lock, do nothing */
+ if (!BufferTagsEqual(&buf->tag, &oldTag))
+ {
+ UnlockBufHdr(buf, buf_state);
+ LWLockRelease(oldPartitionLock);
+ return;
+ }
+
+ /*
+ * We assume the only reason for it to be pinned is that someone else is
+ * flushing the page out. Wait for them to finish. (This could be an
+ * infinite loop if the refcount is messed up... it would be nice to time
+ * out after awhile, but there seems no way to be sure how many loops may
+ * be needed. Note that if the other guy has pinned the buffer but not
+ * yet done StartBufferIO, WaitIO will fall through and we'll effectively
+ * be busy-looping here.)
+ */
+ if (BUF_STATE_GET_REFCOUNT(buf_state) != 0)
+ {
+ UnlockBufHdr(buf, buf_state);
+ LWLockRelease(oldPartitionLock);
+ /* safety check: should definitely not be our *own* pin */
+ if (GetPrivateRefCount(BufferDescriptorGetBuffer(buf)) > 0)
+ elog(ERROR, "buffer is pinned in InvalidateBuffer");
+ WaitIO(buf);
+ goto retry;
+ }
+
+ /*
+ * Clear out the buffer's tag and flags. We must do this to ensure that
+ * linear scans of the buffer array don't think the buffer is valid.
+ */
+ oldFlags = buf_state & BUF_FLAG_MASK;
+ ClearBufferTag(&buf->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Remove the buffer from the lookup hashtable, if it was in there.
+ */
+ if (oldFlags & BM_TAG_VALID)
+ BufTableDelete(&oldTag, oldHash);
+
+ /*
+ * Done with mapping lock.
+ */
+ LWLockRelease(oldPartitionLock);
+
+ /*
+ * Insert the buffer at the head of the list of free buffers.
+ */
+ StrategyFreeBuffer(buf);
+}
+
+/*
+ * Helper routine for GetVictimBuffer()
+ *
+ * Needs to be called on a buffer with a valid tag, pinned, but without the
+ * buffer header spinlock held.
+ *
+ * Returns true if the buffer can be reused, in which case the buffer is only
+ * pinned by this backend and marked as invalid, false otherwise.
+ */
+static bool
+InvalidateVictimBuffer(BufferDesc *buf_hdr)
+{
+ uint32 buf_state;
+ uint32 hash;
+ LWLock *partition_lock;
+ BufferTag tag;
+
+ Assert(GetPrivateRefCount(BufferDescriptorGetBuffer(buf_hdr)) == 1);
+
+ /* have buffer pinned, so it's safe to read tag without lock */
+ tag = buf_hdr->tag;
+
+ hash = BufTableHashCode(&tag);
+ partition_lock = BufMappingPartitionLock(hash);
+
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+ /* lock the buffer header */
+ buf_state = LockBufHdr(buf_hdr);
+
+ /*
+ * We have the buffer pinned, so nobody else should have been able to unset
+ * this concurrently.
+ */
+ Assert(buf_state & BM_TAG_VALID);
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ Assert(BufferTagsEqual(&buf_hdr->tag, &tag));
+
+ /*
+ * If somebody else pinned the buffer since, or even worse, dirtied it,
+ * give up on this buffer: It's clearly in use.
+ */
+ if (BUF_STATE_GET_REFCOUNT(buf_state) != 1 || (buf_state & BM_DIRTY))
+ {
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+ UnlockBufHdr(buf_hdr, buf_state);
+ LWLockRelease(partition_lock);
+
+ return false;
+ }
+
+ /*
+ * Clear out the buffer's tag and flags and usagecount. This is not
+ * strictly required, as BM_TAG_VALID/BM_VALID needs to be checked before
+ * doing anything with the buffer. But currently it's beneficial, as the
+ * cheaper pre-check used by several linear scans of shared buffers relies
+ * on the tag (see e.g. FlushDatabaseBuffers()).
+ */
+ ClearBufferTag(&buf_hdr->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ UnlockBufHdr(buf_hdr, buf_state);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+ /* finally delete buffer from the buffer mapping table */
+ BufTableDelete(&tag, hash);
+
+ LWLockRelease(partition_lock);
+
+ Assert(!(buf_state & (BM_DIRTY | BM_VALID | BM_TAG_VALID)));
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ Assert(BUF_STATE_GET_REFCOUNT(pg_atomic_read_u32(&buf_hdr->state)) > 0);
+
+ return true;
+}
+
+static Buffer
+GetVictimBuffer(BufferAccessStrategy strategy, IOContext io_context)
+{
+ BufferDesc *buf_hdr;
+ Buffer buf;
+ uint32 buf_state;
+ bool from_ring;
+
+ /*
+ * Ensure, while the spinlock's not yet held, that there's a free refcount
+ * entry.
+ */
+ ReservePrivateRefCountEntry();
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /* we return here if a prospective victim buffer gets used concurrently */
+again:
+
+ /*
+ * Select a victim buffer. The buffer is returned with its header
+ * spinlock still held!
+ */
+ buf_hdr = StrategyGetBuffer(strategy, &buf_state, &from_ring);
+ buf = BufferDescriptorGetBuffer(buf_hdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
+
+ /* Pin the buffer and then release the buffer spinlock */
+ PinBuffer_Locked(buf_hdr);
+
+ /*
+ * We shouldn't have any other pins for this buffer.
+ */
+ CheckBufferIsPinnedOnce(buf);
+
+ /*
+ * If the buffer was dirty, try to write it out. There is a race
+ * condition here, in that someone might dirty it after we released the
+ * buffer header lock above, or even while we are writing it out (since
+ * our share-lock won't prevent hint-bit updates). We will recheck the
+ * dirty bit after re-locking the buffer header.
+ */
+ if (buf_state & BM_DIRTY)
+ {
+ LWLock *content_lock;
+
+ Assert(buf_state & BM_TAG_VALID);
+ Assert(buf_state & BM_VALID);
+
+ /*
+ * We need a share-lock on the buffer contents to write it out (else
+ * we might write invalid data, eg because someone else is compacting
+ * the page contents while we write). We must use a conditional lock
+ * acquisition here to avoid deadlock. Even though the buffer was not
+ * pinned (and therefore surely not locked) when StrategyGetBuffer
+ * returned it, someone else could have pinned and exclusive-locked it
+ * by the time we get here. If we try to get the lock unconditionally,
+ * we'd block waiting for them; if they later block waiting for us,
+ * deadlock ensues. (This has been observed to happen when two
+ * backends are both trying to split btree index pages, and the second
+ * one just happens to be trying to split the page the first one got
+ * from StrategyGetBuffer.)
+ */
+ content_lock = BufferDescriptorGetContentLock(buf_hdr);
+ if (!LWLockConditionalAcquire(content_lock, LW_SHARED))
+ {
+ /*
+ * Someone else has locked the buffer, so give it up and loop back
+ * to get another one.
+ */
+ UnpinBuffer(buf_hdr);
+ goto again;
+ }
+
+ /*
+ * If using a nondefault strategy, and writing the buffer would
+ * require a WAL flush, let the strategy decide whether to go ahead
+ * and write/reuse the buffer or to choose another victim. We need a
+ * lock to inspect the page LSN, so this can't be done inside
+ * StrategyGetBuffer.
+ */
+ if (strategy != NULL)
+ {
+ XLogRecPtr lsn;
+
+ /* Read the LSN while holding buffer header lock */
+ buf_state = LockBufHdr(buf_hdr);
+ lsn = BufferGetLSN(buf_hdr);
+ UnlockBufHdr(buf_hdr, buf_state);
+
+ if (XLogNeedsFlush(lsn)
+ && StrategyRejectBuffer(strategy, buf_hdr, from_ring))
+ {
+ LWLockRelease(content_lock);
+ UnpinBuffer(buf_hdr);
+ goto again;
+ }
+ }
+
+ /* OK, do the I/O */
+ FlushBuffer(buf_hdr, NULL, IOOBJECT_RELATION, io_context);
+ LWLockRelease(content_lock);
+
+ ScheduleBufferTagForWriteback(&BackendWritebackContext, io_context,
+ &buf_hdr->tag);
+ }
+
+
+ if (buf_state & BM_VALID)
+ {
+ /*
+ * When a BufferAccessStrategy is in use, blocks evicted from shared
+ * buffers are counted as IOOP_EVICT in the corresponding context
+ * (e.g. IOCONTEXT_BULKWRITE). Shared buffers are evicted by a
+ * strategy in two cases: 1) while initially claiming buffers for the
+ * strategy ring, and 2) to replace an existing strategy ring buffer
+ * because it is pinned or in use and cannot be reused.
+ *
+ * Blocks evicted from buffers already in the strategy ring are
+ * counted as IOOP_REUSE in the corresponding strategy context.
+ *
+ * At this point, we can accurately count evictions and reuses,
+ * because we have successfully claimed the valid buffer. Previously,
+ * we may have been forced to release the buffer due to concurrent
+ * pinners or erroring out.
+ */
+ pgstat_count_io_op(IOOBJECT_RELATION, io_context,
+ from_ring ? IOOP_REUSE : IOOP_EVICT);
+ }
+
+ /*
+ * If the buffer has an entry in the buffer mapping table, delete it. This
+ * can fail because another backend could have pinned or dirtied the
+ * buffer.
+ */
+ if ((buf_state & BM_TAG_VALID) && !InvalidateVictimBuffer(buf_hdr))
+ {
+ UnpinBuffer(buf_hdr);
+ goto again;
+ }
+
+ /* a final set of sanity checks */
+#ifdef USE_ASSERT_CHECKING
+ buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
+ Assert(!(buf_state & (BM_TAG_VALID | BM_VALID | BM_DIRTY)));
+
+ CheckBufferIsPinnedOnce(buf);
+#endif
+
+ return buf;
+}
+
+/*
+ * Limit the number of pins a batch operation may additionally acquire, to
+ * avoid running out of pinnable buffers.
+ *
+ * One additional pin is always allowed, as otherwise the operation likely
+ * cannot be performed at all.
+ *
+ * The number of allowed pins for a backend is computed based on
+ * shared_buffers and the maximum number of connections possible. That's very
+ * pessimistic, but outside of toy-sized shared_buffers it should allow
+ * sufficient pins.
+ */
+static void
+LimitAdditionalPins(uint32 *additional_pins)
+{
+ uint32 max_backends;
+ int max_proportional_pins;
+
+ if (*additional_pins <= 1)
+ return;
+
+ max_backends = MaxBackends + NUM_AUXILIARY_PROCS;
+ max_proportional_pins = NBuffers / max_backends;
+
+ /*
+ * Subtract the approximate number of buffers already pinned by this
+ * backend. We get the number of "overflowed" pins for free, but don't
+ * know the number of pins in PrivateRefCountArray. The cost of
+ * calculating that exactly doesn't seem worth it, so just assume the max.
+ */
+ max_proportional_pins -= PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES;
+
+ if (max_proportional_pins <= 0)
+ max_proportional_pins = 1;
+
+ if (*additional_pins > max_proportional_pins)
+ *additional_pins = max_proportional_pins;
+}
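+
+/*
+ * Worked example (assumed, illustrative values): with shared_buffers set to
+ * 128MB, NBuffers is 16384; if MaxBackends + NUM_AUXILIARY_PROCS adds up to
+ * 128, max_proportional_pins starts out at 16384 / 128 = 128. From that we
+ * subtract PrivateRefCountOverflowed + REFCOUNT_ARRAY_ENTRIES as an upper
+ * bound on pins already held, so a request for, say, 200 additional pins
+ * would be clamped to the remainder, and never below 1.
+ */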
+
+/*
+ * Logic shared between ExtendBufferedRelBy() and ExtendBufferedRelTo(),
+ * factored out to avoid duplicating the tracing and relpersistence-related
+ * logic.
+ */
+static BlockNumber
+ExtendBufferedRelCommon(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ uint32 extend_by,
+ BlockNumber extend_upto,
+ Buffer *buffers,
+ uint32 *extended_by)
+{
+ BlockNumber first_block;
+
+ TRACE_POSTGRESQL_BUFFER_EXTEND_START(fork,
+ bmr.smgr->smgr_rlocator.locator.spcOid,
+ bmr.smgr->smgr_rlocator.locator.dbOid,
+ bmr.smgr->smgr_rlocator.locator.relNumber,
+ bmr.smgr->smgr_rlocator.backend,
+ extend_by);
+
+ if (bmr.relpersistence == RELPERSISTENCE_TEMP)
+ first_block = ExtendBufferedRelLocal(bmr, fork, flags,
+ extend_by, extend_upto,
+ buffers, &extend_by);
+ else
+ first_block = ExtendBufferedRelShared(bmr, fork, strategy, flags,
+ extend_by, extend_upto,
+ buffers, &extend_by);
+ *extended_by = extend_by;
+
+ TRACE_POSTGRESQL_BUFFER_EXTEND_DONE(fork,
+ bmr.smgr->smgr_rlocator.locator.spcOid,
+ bmr.smgr->smgr_rlocator.locator.dbOid,
+ bmr.smgr->smgr_rlocator.locator.relNumber,
+ bmr.smgr->smgr_rlocator.backend,
+ *extended_by,
+ first_block);
+
+ return first_block;
+}
+
+/*
+ * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
+ * shared buffers.
+ */
+static BlockNumber
+ExtendBufferedRelShared(BufferManagerRelation bmr,
+ ForkNumber fork,
+ BufferAccessStrategy strategy,
+ uint32 flags,
+ uint32 extend_by,
+ BlockNumber extend_upto,
+ Buffer *buffers,
+ uint32 *extended_by)
+{
+ BlockNumber first_block;
+ IOContext io_context = IOContextForStrategy(strategy);
+ instr_time io_start;
+
+ LimitAdditionalPins(&extend_by);
+
+ /*
+ * Acquire victim buffers for extension without holding extension lock.
+ * Writing out victim buffers is the most expensive part of extending the
+ * relation, particularly when doing so requires WAL flushes. Zeroing out
+ * the buffers is also quite expensive, so do that before holding the
+ * extension lock as well.
+ *
+ * These pages are pinned by us and not valid. While we hold the pin they
+ * can't be acquired as victim buffers by another backend.
+ */
+ for (uint32 i = 0; i < extend_by; i++)
+ {
+ Block buf_block;
+
+ buffers[i] = GetVictimBuffer(strategy, io_context);
+ buf_block = BufHdrGetBlock(GetBufferDescriptor(buffers[i] - 1));
+
+ /* new buffers are zero-filled */
+ MemSet((char *) buf_block, 0, BLCKSZ);
+ }
+
+ /* in case we need to pin an existing buffer below */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /*
+ * Lock relation against concurrent extensions, unless requested not to.
+ *
+ * We use the same extension lock for all forks. That's unnecessarily
+ * restrictive, but currently extensions for forks don't happen often
+ * enough to make it worth locking more granularly.
+ *
+ * Note that another backend might have extended the relation by the time
+ * we get the lock.
+ */
+ if (!(flags & EB_SKIP_EXTENSION_LOCK))
+ {
+ LockRelationForExtension(bmr.rel, ExclusiveLock);
+ if (bmr.rel)
+ bmr.smgr = RelationGetSmgr(bmr.rel);
+ }
+
+ /*
+ * If requested, invalidate size cache, so that smgrnblocks asks the
+ * kernel.
+ */
+ if (flags & EB_CLEAR_SIZE_CACHE)
+ bmr.smgr->smgr_cached_nblocks[fork] = InvalidBlockNumber;
+
+ first_block = smgrnblocks(bmr.smgr, fork);
+
+ /*
+ * Now that we have the accurate relation size, check if the caller wants
+ * us to extend only up to a specific size. If there were concurrent
+ * extensions, we might have acquired too many buffers and need to release
+ * them.
+ */
+ if (extend_upto != InvalidBlockNumber)
+ {
+ uint32 orig_extend_by = extend_by;
+
+ if (first_block > extend_upto)
+ extend_by = 0;
+ else if ((uint64) first_block + extend_by > extend_upto)
+ extend_by = extend_upto - first_block;
+
+ for (uint32 i = extend_by; i < orig_extend_by; i++)
+ {
+ BufferDesc *buf_hdr = GetBufferDescriptor(buffers[i] - 1);
+
+ /*
+ * The victim buffer we acquired previously is clean and unused,
+ * let it be found again quickly
+ */
+ StrategyFreeBuffer(buf_hdr);
+ UnpinBuffer(buf_hdr);
+ }
+
+ if (extend_by == 0)
+ {
+ if (!(flags & EB_SKIP_EXTENSION_LOCK))
+ UnlockRelationForExtension(bmr.rel, ExclusiveLock);
+ *extended_by = extend_by;
+ return first_block;
+ }
+ }
+
+ /* Fail if relation is already at maximum possible length */
+ if ((uint64) first_block + extend_by >= MaxBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend relation %s beyond %u blocks",
+ relpath(bmr.smgr->smgr_rlocator, fork),
+ MaxBlockNumber)));
+
+ /*
+ * Insert buffers into buffer table, mark as IO_IN_PROGRESS.
+ *
+ * This needs to happen before we extend the relation, because as soon as
+ * we do, other backends can start to read in those pages.
+ */
+ for (int i = 0; i < extend_by; i++)
+ {
+ Buffer victim_buf = buffers[i];
+ BufferDesc *victim_buf_hdr = GetBufferDescriptor(victim_buf - 1);
+ BufferTag tag;
+ uint32 hash;
+ LWLock *partition_lock;
+ int existing_id;
+
+ InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
+ hash = BufTableHashCode(&tag);
+ partition_lock = BufMappingPartitionLock(hash);
+
+ LWLockAcquire(partition_lock, LW_EXCLUSIVE);
+
+ existing_id = BufTableInsert(&tag, hash, victim_buf_hdr->buf_id);
+
+ /*
+ * We get here only in the corner case where we are trying to extend
+ * the relation but we found a pre-existing buffer. This can happen
+ * because a prior attempt at extending the relation failed, and
+ * because mdread doesn't complain about reads beyond EOF (when
+ * zero_damaged_pages is ON) and so a previous attempt to read a block
+ * beyond EOF could have left a "valid" zero-filled buffer.
+ * Unfortunately, we have also seen this case occurring because of
+ * buggy Linux kernels that sometimes return an lseek(SEEK_END) result
+ * that doesn't account for a recent write. In that situation, the
+ * pre-existing buffer would contain valid data that we don't want to
+ * overwrite. Since the legitimate cases should always have left a
+ * zero-filled buffer, complain if not PageIsNew.
+ */
+ if (existing_id >= 0)
+ {
+ BufferDesc *existing_hdr = GetBufferDescriptor(existing_id);
+ Block buf_block;
+ bool valid;
+
+ /*
+ * Pin the existing buffer before releasing the partition lock,
+ * preventing it from being evicted.
+ */
+ valid = PinBuffer(existing_hdr, strategy);
+
+ LWLockRelease(partition_lock);
+
+ /*
+ * The victim buffer we acquired previously is clean and unused,
+ * let it be found again quickly
+ */
+ StrategyFreeBuffer(victim_buf_hdr);
+ UnpinBuffer(victim_buf_hdr);
+
+ buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
+ buf_block = BufHdrGetBlock(existing_hdr);
+
+ if (valid && !PageIsNew((Page) buf_block))
+ ereport(ERROR,
+ (errmsg("unexpected data beyond EOF in block %u of relation %s",
+ existing_hdr->tag.blockNum, relpath(bmr.smgr->smgr_rlocator, fork)),
+ errhint("This has been seen to occur with buggy kernels; consider updating your system.")));
+
+ /*
+ * We *must* do smgr[zero]extend before succeeding, else the page
+ * will not be reserved by the kernel, and the next P_NEW call
+ * will decide to return the same page. Clear the BM_VALID bit,
+ * do StartBufferIO() and proceed.
+ *
+ * Loop to handle the very small possibility that someone re-sets
+ * BM_VALID between our clearing it and StartBufferIO inspecting
+ * it.
+ */
+ do
+ {
+ uint32 buf_state = LockBufHdr(existing_hdr);
+
+ buf_state &= ~BM_VALID;
+ UnlockBufHdr(existing_hdr, buf_state);
+ } while (!StartBufferIO(existing_hdr, true));
+ }
+ else
+ {
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(victim_buf_hdr);
+
+ /* some sanity checks while we hold the buffer header lock */
+ Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 1);
+
+ victim_buf_hdr->tag = tag;
+
+ buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+ if (bmr.relpersistence == RELPERSISTENCE_PERMANENT || fork == INIT_FORKNUM)
+ buf_state |= BM_PERMANENT;
+
+ UnlockBufHdr(victim_buf_hdr, buf_state);
+
+ LWLockRelease(partition_lock);
+
+ /* XXX: could combine the locked operations in it with the above */
+ StartBufferIO(victim_buf_hdr, true);
+ }
+ }
+
+ io_start = pgstat_prepare_io_time();
+
+ /*
+ * Note: if smgrzeroextend fails, we will end up with buffers that are
+ * allocated but not marked BM_VALID. The next relation extension will
+ * still select the same block number (because the relation didn't get any
+ * longer on disk) and so future attempts to extend the relation will find
+ * the same buffers (if they have not been recycled) but come right back
+ * here to try smgrzeroextend again.
+ *
+ * We don't need to set checksum for all-zero pages.
+ */
+ smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
+
+ /*
+ * Release the file-extension lock; it's now OK for someone else to extend
+ * the relation some more.
+ *
+ * We remove IO_IN_PROGRESS after this, as waking up waiting backends can
+ * take noticeable time.
+ */
+ if (!(flags & EB_SKIP_EXTENSION_LOCK))
+ UnlockRelationForExtension(bmr.rel, ExclusiveLock);
+
+ pgstat_count_io_op_time(IOOBJECT_RELATION, io_context, IOOP_EXTEND,
+ io_start, extend_by);
+
+ /* Set BM_VALID, terminate IO, and wake up any waiters */
+ for (int i = 0; i < extend_by; i++)
+ {
+ Buffer buf = buffers[i];
+ BufferDesc *buf_hdr = GetBufferDescriptor(buf - 1);
+ bool lock = false;
+
+ if (flags & EB_LOCK_FIRST && i == 0)
+ lock = true;
+ else if (flags & EB_LOCK_TARGET)
+ {
+ Assert(extend_upto != InvalidBlockNumber);
+ if (first_block + i + 1 == extend_upto)
+ lock = true;
+ }
+
+ if (lock)
+ LWLockAcquire(BufferDescriptorGetContentLock(buf_hdr), LW_EXCLUSIVE);
+
+ TerminateBufferIO(buf_hdr, false, BM_VALID);
+ }
+
+ pgBufferUsage.shared_blks_written += extend_by;
+
+ *extended_by = extend_by;
+
+ return first_block;
+}
+
+/*
+ * MarkBufferDirty
+ *
+ * Marks buffer contents as dirty (actual write happens later).
+ *
+ * Buffer must be pinned and exclusive-locked. (If caller does not hold
+ * exclusive lock, then somebody could be in process of writing the buffer,
+ * leading to risk of bad data written to disk.)
+ */
+void
+MarkBufferDirty(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ {
+ MarkLocalBufferDirty(buffer);
+ return;
+ }
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(BufferIsPinned(buffer));
+ Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE));
+
+ old_buf_state = pg_atomic_read_u32(&bufHdr->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(bufHdr);
+
+ buf_state = old_buf_state;
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
+
+ if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
+ buf_state))
+ break;
+ }
+
+ /*
+ * If the buffer was not dirty already, do vacuum accounting.
+ */
+ if (!(old_buf_state & BM_DIRTY))
+ {
+ VacuumPageDirty++;
+ pgBufferUsage.shared_blks_dirtied++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageDirty;
+ }
+}
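+
+/*
+ * Illustrative sketch of the usual caller pattern: modify a page only while
+ * holding the exclusive content lock and inside a critical section, mark the
+ * buffer dirty before WAL-logging the change, and only then release lock and
+ * pin. The WAL registration details are elided here and depend on the access
+ * method; "rel" and "blkno" are assumed to come from the caller.
+ *
+ *    Buffer      buf = ReadBuffer(rel, blkno);
+ *
+ *    LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ *
+ *    START_CRIT_SECTION();
+ *    ... modify BufferGetPage(buf) ...
+ *    MarkBufferDirty(buf);
+ *    ... XLogInsert() the change, then PageSetLSN() the page ...
+ *    END_CRIT_SECTION();
+ *
+ *    UnlockReleaseBuffer(buf);
+ */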
+
+/*
+ * ReleaseAndReadBuffer -- combine ReleaseBuffer() and ReadBuffer()
+ *
+ * Formerly, this saved one cycle of acquiring/releasing the BufMgrLock
+ * compared to calling the two routines separately. Now it's mainly just
+ * a convenience function. However, if the passed buffer is valid and
+ * already contains the desired block, we just return it as-is; and that
+ * does save considerable work compared to a full release and reacquire.
+ *
+ * Note: it is OK to pass buffer == InvalidBuffer, indicating that no old
+ * buffer actually needs to be released. This case is the same as ReadBuffer,
+ * but can save some tests in the caller.
+ */
+Buffer
+ReleaseAndReadBuffer(Buffer buffer,
+ Relation relation,
+ BlockNumber blockNum)
+{
+ ForkNumber forkNum = MAIN_FORKNUM;
+ BufferDesc *bufHdr;
+
+ if (BufferIsValid(buffer))
+ {
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ {
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ if (bufHdr->tag.blockNum == blockNum &&
+ BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
+ BufTagGetForkNum(&bufHdr->tag) == forkNum)
+ return buffer;
+ UnpinLocalBuffer(buffer);
+ }
+ else
+ {
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ /* we have pin, so it's ok to examine tag without spinlock */
+ if (bufHdr->tag.blockNum == blockNum &&
+ BufTagMatchesRelFileLocator(&bufHdr->tag, &relation->rd_locator) &&
+ BufTagGetForkNum(&bufHdr->tag) == forkNum)
+ return buffer;
+ UnpinBuffer(bufHdr);
+ }
+ }
+
+ return ReadBuffer(relation, blockNum);
+}
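+
+/*
+ * Illustrative usage sketch: walk a chain of blocks while holding at most one
+ * pin, which is the pattern this function is intended for. "rel" and the
+ * per-iteration "blkno" are assumed to come from the caller.
+ *
+ *    Buffer      buf = InvalidBuffer;
+ *
+ *    while (BlockNumberIsValid(blkno))
+ *    {
+ *        buf = ReleaseAndReadBuffer(buf, rel, blkno);
+ *        ... examine the page, compute the next blkno ...
+ *    }
+ *    if (BufferIsValid(buf))
+ *        ReleaseBuffer(buf);
+ */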
+
+/*
+ * PinBuffer -- make buffer unavailable for replacement.
+ *
+ * For the default access strategy, the buffer's usage_count is incremented
+ * when we first pin it; for other strategies we just make sure the usage_count
+ * isn't zero. (The idea of the latter is that we don't want synchronized
+ * heap scans to inflate the count, but we need it to not be zero to discourage
+ * other backends from stealing buffers from our ring. As long as we cycle
+ * through the ring faster than the global clock-sweep cycles, buffers in
+ * our ring won't be chosen as victims for replacement by other backends.)
+ *
+ * This should be applied only to shared buffers, never local ones.
+ *
+ * Since buffers are pinned/unpinned very frequently, pin buffers without
+ * taking the buffer header lock; instead update the state variable in a loop of
+ * CAS operations. Hopefully it's just a single CAS.
+ *
+ * Note that ResourceOwnerEnlargeBuffers must have been done already.
+ *
+ * Returns true if buffer is BM_VALID, else false. This provision allows
+ * some callers to avoid an extra spinlock cycle.
+ */
+static bool
+PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy)
+{
+ Buffer b = BufferDescriptorGetBuffer(buf);
+ bool result;
+ PrivateRefCountEntry *ref;
+
+ Assert(!BufferIsLocal(b));
+
+ ref = GetPrivateRefCountEntry(b, true);
+
+ if (ref == NULL)
+ {
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ ReservePrivateRefCountEntry();
+ ref = NewPrivateRefCountEntry(b);
+
+ old_buf_state = pg_atomic_read_u32(&buf->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(buf);
+
+ buf_state = old_buf_state;
+
+ /* increase refcount */
+ buf_state += BUF_REFCOUNT_ONE;
+
+ if (strategy == NULL)
+ {
+ /* Default case: increase usagecount unless already max. */
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ buf_state += BUF_USAGECOUNT_ONE;
+ }
+ else
+ {
+ /*
+ * Ring buffers shouldn't evict others from the pool. Thus we
+ * don't make usagecount more than 1.
+ */
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
+ buf_state += BUF_USAGECOUNT_ONE;
+ }
+
+ if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ buf_state))
+ {
+ result = (buf_state & BM_VALID) != 0;
+
+ /*
+ * Assume that we acquired a buffer pin for the purposes of
+ * Valgrind buffer client checks (even in !result case) to
+ * keep things simple. Buffers that are unsafe to access are
+ * not generally guaranteed to be marked undefined or
+ * non-accessible in any case.
+ */
+ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+ break;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * If we previously pinned the buffer, it must surely be valid.
+ *
+ * Note: We deliberately avoid a Valgrind client request here.
+ * Individual access methods can optionally superimpose buffer page
+ * client requests on top of our client requests to enforce that
+ * buffers are only accessed while locked (and pinned). It's possible
+ * that the buffer page is legitimately non-accessible here. We
+ * cannot meddle with that.
+ */
+ result = true;
+ }
+
+ ref->refcount++;
+ Assert(ref->refcount > 0);
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+ return result;
+}
+
+/*
+ * PinBuffer_Locked -- as above, but caller already locked the buffer header.
+ * The spinlock is released before return.
+ *
+ * As this function is called with the spinlock held, the caller must have
+ * previously called ReservePrivateRefCountEntry().
+ *
+ * Currently, no callers of this function want to modify the buffer's
+ * usage_count at all, so there's no need for a strategy parameter.
+ * Also we don't bother with a BM_VALID test (the caller could check that for
+ * itself).
+ *
+ * Also all callers only ever use this function when it's known that the
+ * buffer can't have a preexisting pin by this backend. That allows us to skip
+ * searching the private refcount array & hash, which is a boon, because the
+ * spinlock is still held.
+ *
+ * Note: use of this routine is frequently mandatory, not just an optimization
+ * to save a spin lock/unlock cycle, because we need to pin a buffer before
+ * its state can change under us.
+ */
+static void
+PinBuffer_Locked(BufferDesc *buf)
+{
+ Buffer b;
+ PrivateRefCountEntry *ref;
+ uint32 buf_state;
+
+ /*
+ * As explained, we don't expect any preexisting pins. That allows us to
+ * manipulate the PrivateRefCount after releasing the spinlock.
+ */
+ Assert(GetPrivateRefCountEntry(BufferDescriptorGetBuffer(buf), false) == NULL);
+
+ /*
+ * Buffer can't have a preexisting pin, so mark its page as defined to
+ * Valgrind (this is similar to the PinBuffer() case where the backend
+ * doesn't already have a buffer pin)
+ */
+ VALGRIND_MAKE_MEM_DEFINED(BufHdrGetBlock(buf), BLCKSZ);
+
+ /*
+ * Since we hold the buffer spinlock, we can update the buffer state and
+ * release the lock in one operation.
+ */
+ buf_state = pg_atomic_read_u32(&buf->state);
+ Assert(buf_state & BM_LOCKED);
+ buf_state += BUF_REFCOUNT_ONE;
+ UnlockBufHdr(buf, buf_state);
+
+ b = BufferDescriptorGetBuffer(buf);
+
+ ref = NewPrivateRefCountEntry(b);
+ ref->refcount++;
+
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, b);
+}
+
+/*
+ * UnpinBuffer -- make buffer available for replacement.
+ *
+ * This should be applied only to shared buffers, never local ones. This
+ * always adjusts CurrentResourceOwner.
+ */
+static void
+UnpinBuffer(BufferDesc *buf)
+{
+ PrivateRefCountEntry *ref;
+ Buffer b = BufferDescriptorGetBuffer(buf);
+
+ Assert(!BufferIsLocal(b));
+
+ /* not moving as we're likely deleting it soon anyway */
+ ref = GetPrivateRefCountEntry(b, false);
+ Assert(ref != NULL);
+
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, b);
+
+ Assert(ref->refcount > 0);
+ ref->refcount--;
+ if (ref->refcount == 0)
+ {
+ uint32 buf_state;
+ uint32 old_buf_state;
+
+ /*
+ * Mark buffer non-accessible to Valgrind.
+ *
+ * Note that the buffer may have already been marked non-accessible
+ * within access method code that enforces that buffers are only
+ * accessed while a buffer lock is held.
+ */
+ VALGRIND_MAKE_MEM_NOACCESS(BufHdrGetBlock(buf), BLCKSZ);
+
+ /* I'd better not still hold the buffer content lock */
+ Assert(!LWLockHeldByMe(BufferDescriptorGetContentLock(buf)));
+
+ /*
+ * Decrement the shared reference count.
+ *
+ * Since buffer spinlock holder can update status using just write,
+ * it's not safe to use atomic decrement here; thus use a CAS loop.
+ */
+ old_buf_state = pg_atomic_read_u32(&buf->state);
+ for (;;)
+ {
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(buf);
+
+ buf_state = old_buf_state;
+
+ buf_state -= BUF_REFCOUNT_ONE;
+
+ if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ buf_state))
+ break;
+ }
+
+ /* Support LockBufferForCleanup() */
+ if (buf_state & BM_PIN_COUNT_WAITER)
+ {
+ /*
+ * Acquire the buffer header lock, re-check that there's a waiter.
+ * Another backend could have unpinned this buffer, and already
+ * woken up the waiter. There's no danger of the buffer being
+ * replaced after we unpinned it above, as it's pinned by the
+ * waiter.
+ */
+ buf_state = LockBufHdr(buf);
+
+ if ((buf_state & BM_PIN_COUNT_WAITER) &&
+ BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* we just released the last pin other than the waiter's */
+ int wait_backend_pgprocno = buf->wait_backend_pgprocno;
+
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(buf, buf_state);
+ ProcSendSignal(wait_backend_pgprocno);
+ }
+ else
+ UnlockBufHdr(buf, buf_state);
+ }
+ ForgetPrivateRefCountEntry(ref);
+ }
+}
+
+#define ST_SORT sort_checkpoint_bufferids
+#define ST_ELEMENT_TYPE CkptSortItem
+#define ST_COMPARE(a, b) ckpt_buforder_comparator(a, b)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include <lib/sort_template.h>
+
+/*
+ * BufferSync -- Write out all dirty buffers in the pool.
+ *
+ * This is called at checkpoint time to write out all dirty shared buffers.
+ * The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
+ * is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
+ * CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
+ * unlogged buffers, which are otherwise skipped. The remaining flags
+ * currently have no effect here.
+ */
+static void
+BufferSync(int flags)
+{
+ uint32 buf_state;
+ int buf_id;
+ int num_to_scan;
+ int num_spaces;
+ int num_processed;
+ int num_written;
+ CkptTsStatus *per_ts_stat = NULL;
+ Oid last_tsid;
+ binaryheap *ts_heap;
+ int i;
+ int mask = BM_DIRTY;
+ WritebackContext wb_context;
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /*
+ * Unless this is a shutdown checkpoint or we have been explicitly told to
+ * flush all buffers, we write only permanent, dirty buffers. But at
+ * shutdown or end of recovery, we write all dirty buffers.
+ */
+ if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
+ CHECKPOINT_FLUSH_ALL))))
+ mask |= BM_PERMANENT;
+
+ /*
+ * Loop over all buffers, and mark the ones that need to be written with
+ * BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
+ * can estimate how much work needs to be done.
+ *
+ * This allows us to write only those pages that were dirty when the
+ * checkpoint began, and not those that get dirtied while it proceeds.
+ * Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
+ * later in this function, or by normal backends or the bgwriter cleaning
+ * scan, the flag is cleared. Any buffer dirtied after this point won't
+ * have the flag set.
+ *
+ * Note that if we fail to write some buffer, we may leave buffers with
+ * BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
+ * certainly need to be written for the next checkpoint attempt, too.
+ */
+ num_to_scan = 0;
+ for (buf_id = 0; buf_id < NBuffers; buf_id++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * Header spinlock is enough to examine BM_DIRTY, see comment in
+ * SyncOneBuffer.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if ((buf_state & mask) == mask)
+ {
+ CkptSortItem *item;
+
+ buf_state |= BM_CHECKPOINT_NEEDED;
+
+ item = &CkptBufferIds[num_to_scan++];
+ item->buf_id = buf_id;
+ item->tsId = bufHdr->tag.spcOid;
+ item->relNumber = BufTagGetRelNumber(&bufHdr->tag);
+ item->forkNum = BufTagGetForkNum(&bufHdr->tag);
+ item->blockNum = bufHdr->tag.blockNum;
+ }
+
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /* Check for barrier events in case NBuffers is large. */
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+ }
+
+ if (num_to_scan == 0)
+ return; /* nothing to do */
+
+ WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
+ TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
+
+ /*
+ * Sort buffers that need to be written to reduce the likelihood of random
+ * IO. The sorting is also important for the implementation of balancing
+ * writes between tablespaces. Without balancing writes we'd potentially
+ * end up writing to the tablespaces one-by-one; possibly overloading the
+ * underlying system.
+ */
+ sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
+
+ num_spaces = 0;
+
+ /*
+ * Allocate progress status for each tablespace with buffers that need to
+ * be flushed. This requires the to-be-flushed array to be sorted.
+ */
+ last_tsid = InvalidOid;
+ for (i = 0; i < num_to_scan; i++)
+ {
+ CkptTsStatus *s;
+ Oid cur_tsid;
+
+ cur_tsid = CkptBufferIds[i].tsId;
+
+ /*
+ * Grow array of per-tablespace status structs, every time a new
+ * tablespace is found.
+ */
+ if (last_tsid == InvalidOid || last_tsid != cur_tsid)
+ {
+ Size sz;
+
+ num_spaces++;
+
+ /*
+ * Not worth adding grow-by-power-of-2 logic here - even with a
+ * few hundred tablespaces this should be fine.
+ */
+ sz = sizeof(CkptTsStatus) * num_spaces;
+
+ if (per_ts_stat == NULL)
+ per_ts_stat = (CkptTsStatus *) palloc(sz);
+ else
+ per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
+
+ s = &per_ts_stat[num_spaces - 1];
+ memset(s, 0, sizeof(*s));
+ s->tsId = cur_tsid;
+
+ /*
+ * The first buffer in this tablespace. As CkptBufferIds is sorted
+ * by tablespace all (s->num_to_scan) buffers in this tablespace
+ * will follow afterwards.
+ */
+ s->index = i;
+
+ /*
+ * progress_slice will be determined once we know how many buffers
+ * are in each tablespace, i.e. after this loop.
+ */
+
+ last_tsid = cur_tsid;
+ }
+ else
+ {
+ s = &per_ts_stat[num_spaces - 1];
+ }
+
+ s->num_to_scan++;
+
+ /* Check for barrier events. */
+ if (ProcSignalBarrierPending)
+ ProcessProcSignalBarrier();
+ }
+
+ Assert(num_spaces > 0);
+
+ /*
+ * Build a min-heap over the write-progress in the individual tablespaces,
+ * and compute how large a portion of the total progress a single
+ * processed buffer is.
+ */
+ ts_heap = binaryheap_allocate(num_spaces,
+ ts_ckpt_progress_comparator,
+ NULL);
+
+ for (i = 0; i < num_spaces; i++)
+ {
+ CkptTsStatus *ts_stat = &per_ts_stat[i];
+
+ ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
+
+ binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
+ }
+
+ binaryheap_build(ts_heap);
+
+ /*
+ * Iterate through to-be-checkpointed buffers and write the ones (still)
+ * marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
+ * tablespaces; otherwise the sorting would lead to only one tablespace
+ * receiving writes at a time, making inefficient use of the hardware.
+ */
+ num_processed = 0;
+ num_written = 0;
+ while (!binaryheap_empty(ts_heap))
+ {
+ BufferDesc *bufHdr = NULL;
+ CkptTsStatus *ts_stat = (CkptTsStatus *)
+ DatumGetPointer(binaryheap_first(ts_heap));
+
+ buf_id = CkptBufferIds[ts_stat->index].buf_id;
+ Assert(buf_id != -1);
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ num_processed++;
+
+ /*
+ * We don't need to acquire the lock here, because we're only looking
+ * at a single bit. It's possible that someone else writes the buffer
+ * and clears the flag right after we check, but that doesn't matter
+ * since SyncOneBuffer will then do nothing. However, there is a
+ * further race condition: it's conceivable that between the time we
+ * examine the bit here and the time SyncOneBuffer acquires the lock,
+ * someone else not only wrote the buffer but replaced it with another
+ * page and dirtied it. In that improbable case, SyncOneBuffer will
+ * write the buffer though we didn't need to. It doesn't seem worth
+ * guarding against this, though.
+ */
+ if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
+ {
+ if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
+ {
+ TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
+ PendingCheckpointerStats.buf_written_checkpoints++;
+ num_written++;
+ }
+ }
+
+ /*
+ * Measure progress independently of whether we actually had to flush the
+ * buffer - otherwise the writes become unbalanced.
+ */
+ ts_stat->progress += ts_stat->progress_slice;
+ ts_stat->num_scanned++;
+ ts_stat->index++;
+
+ /* Have all the buffers from the tablespace been processed? */
+ if (ts_stat->num_scanned == ts_stat->num_to_scan)
+ {
+ binaryheap_remove_first(ts_heap);
+ }
+ else
+ {
+ /* update heap with the new progress */
+ binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
+ }
+
+ /*
+ * Sleep to throttle our I/O rate.
+ *
+ * (This will check for barrier events even if it doesn't sleep.)
+ */
+ CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
+ }
+
+ /*
+ * Issue all pending flushes. Only checkpointer calls BufferSync(), so
+ * IOContext will always be IOCONTEXT_NORMAL.
+ */
+ IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
+
+ pfree(per_ts_stat);
+ per_ts_stat = NULL;
+ binaryheap_free(ts_heap);
+
+ /*
+ * Update checkpoint statistics. As noted above, this doesn't include
+ * buffers written by other backends or bgwriter scan.
+ */
+ CheckpointStats.ckpt_bufs_written += num_written;
+
+ TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
+}
+
+/*
+ * BgBufferSync -- Write out some dirty buffers in the pool.
+ *
+ * This is called periodically by the background writer process.
+ *
+ * Returns true if it's appropriate for the bgwriter process to go into
+ * low-power hibernation mode. (This happens if the strategy clock sweep
+ * has been "lapped" and no buffer allocations have occurred recently,
+ * or if the bgwriter has been effectively disabled by setting
+ * bgwriter_lru_maxpages to 0.)
+ */
+bool
+BgBufferSync(WritebackContext *wb_context)
+{
+ /* info obtained from freelist.c */
+ int strategy_buf_id;
+ uint32 strategy_passes;
+ uint32 recent_alloc;
+
+ /*
+ * Information saved between calls so we can determine the strategy
+ * point's advance rate and avoid scanning already-cleaned buffers.
+ */
+ static bool saved_info_valid = false;
+ static int prev_strategy_buf_id;
+ static uint32 prev_strategy_passes;
+ static int next_to_clean;
+ static uint32 next_passes;
+
+ /* Moving averages of allocation rate and clean-buffer density */
+ static float smoothed_alloc = 0;
+ static float smoothed_density = 10.0;
+
+ /* Potentially these could be tunables, but for now, not */
+ float smoothing_samples = 16;
+ float scan_whole_pool_milliseconds = 120000.0;
+
+ /* Used to compute how far we scan ahead */
+ long strategy_delta;
+ int bufs_to_lap;
+ int bufs_ahead;
+ float scans_per_alloc;
+ int reusable_buffers_est;
+ int upcoming_alloc_est;
+ int min_scan_buffers;
+
+ /* Variables for the scanning loop proper */
+ int num_to_scan;
+ int num_written;
+ int reusable_buffers;
+
+ /* Variables for final smoothed_density update */
+ long new_strategy_delta;
+ uint32 new_recent_alloc;
+
+ /*
+ * Find out where the freelist clock sweep currently is, and how many
+ * buffer allocations have happened since our last call.
+ */
+ strategy_buf_id = StrategySyncStart(&strategy_passes, &recent_alloc);
+
+ /* Report buffer alloc counts to pgstat */
+ PendingBgWriterStats.buf_alloc += recent_alloc;
+
+ /*
+ * If we're not running the LRU scan, just stop after doing the stats
+ * stuff. We mark the saved state invalid so that we can recover sanely
+ * if LRU scan is turned back on later.
+ */
+ if (bgwriter_lru_maxpages <= 0)
+ {
+ saved_info_valid = false;
+ return true;
+ }
+
+ /*
+ * Compute strategy_delta = how many buffers have been scanned by the
+ * clock sweep since last time. If first time through, assume none. Then
+ * see if we are still ahead of the clock sweep, and if so, how many
+ * buffers we could scan before we'd catch up with it and "lap" it. Note:
+ * the weird-looking coding of the xxx_passes comparisons is there to avoid
+ * bogus behavior when the passes counts wrap around.
+ */
+ if (saved_info_valid)
+ {
+ int32 passes_delta = strategy_passes - prev_strategy_passes;
+
+ strategy_delta = strategy_buf_id - prev_strategy_buf_id;
+ strategy_delta += (long) passes_delta * NBuffers;
+
+ Assert(strategy_delta >= 0);
+
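+ /*
+ * Note: casting the unsigned difference to int32 keeps this comparison
+ * wraparound-safe: as long as the true gap between the pass counters is
+ * less than 2^31, the signed result has the correct sign even if the
+ * counters themselves have wrapped around.
+ */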
+ if ((int32) (next_passes - strategy_passes) > 0)
+ {
+ /* we're one pass ahead of the strategy point */
+ bufs_to_lap = strategy_buf_id - next_to_clean;
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else if (next_passes == strategy_passes &&
+ next_to_clean >= strategy_buf_id)
+ {
+ /* on same pass, but ahead or at least not behind */
+ bufs_to_lap = NBuffers - (next_to_clean - strategy_buf_id);
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter ahead: bgw %u-%u strategy %u-%u delta=%ld lap=%d",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta, bufs_to_lap);
+#endif
+ }
+ else
+ {
+ /*
+ * We're behind, so skip forward to the strategy point and start
+ * cleaning from there.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter behind: bgw %u-%u strategy %u-%u delta=%ld",
+ next_passes, next_to_clean,
+ strategy_passes, strategy_buf_id,
+ strategy_delta);
+#endif
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+ }
+ else
+ {
+ /*
+ * Initializing at startup or after LRU scanning had been off. Always
+ * start at the strategy point.
+ */
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter initializing: strategy %u-%u",
+ strategy_passes, strategy_buf_id);
+#endif
+ strategy_delta = 0;
+ next_to_clean = strategy_buf_id;
+ next_passes = strategy_passes;
+ bufs_to_lap = NBuffers;
+ }
+
+ /* Update saved info for next time */
+ prev_strategy_buf_id = strategy_buf_id;
+ prev_strategy_passes = strategy_passes;
+ saved_info_valid = true;
+
+ /*
+ * Compute how many buffers had to be scanned for each new allocation, ie,
+ * 1/density of reusable buffers, and track a moving average of that.
+ *
+ * If the strategy point didn't move, we don't update the density estimate.
+ */
+ if (strategy_delta > 0 && recent_alloc > 0)
+ {
+ scans_per_alloc = (float) strategy_delta / (float) recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+ }
+
+ /*
+ * Estimate how many reusable buffers there are between the current
+ * strategy point and where we've scanned ahead to, based on the smoothed
+ * density estimate.
+ */
+ bufs_ahead = NBuffers - bufs_to_lap;
+ reusable_buffers_est = (float) bufs_ahead / smoothed_density;
+
+ /*
+ * Track a moving average of recent buffer allocations. Here, rather than
+ * a true average we want a fast-attack, slow-decline behavior: we
+ * immediately follow any increase.
+ */
+ if (smoothed_alloc <= (float) recent_alloc)
+ smoothed_alloc = recent_alloc;
+ else
+ smoothed_alloc += ((float) recent_alloc - smoothed_alloc) /
+ smoothing_samples;
+
+ /* Scale the estimate by a GUC to allow more aggressive tuning. */
+ upcoming_alloc_est = (int) (smoothed_alloc * bgwriter_lru_multiplier);
+
+ /*
+ * If recent_alloc remains at zero for many cycles, smoothed_alloc will
+ * eventually underflow to zero, and the underflows produce annoying
+ * kernel warnings on some platforms. Once upcoming_alloc_est has gone to
+ * zero, there's no point in tracking smaller and smaller values of
+ * smoothed_alloc, so just reset it to exactly zero to avoid this
+ * syndrome. It will pop back up as soon as recent_alloc increases.
+ */
+ if (upcoming_alloc_est == 0)
+ smoothed_alloc = 0;
+
+ /*
+ * Even in cases where there's been little or no buffer allocation
+ * activity, we want to make a small amount of progress through the buffer
+ * cache so that as many reusable buffers as possible are clean after an
+ * idle period.
+ *
+ * (scan_whole_pool_milliseconds / BgWriterDelay) computes how many times
+ * the BGW will be called during the scan_whole_pool time; slice the
+ * buffer pool into that many sections.
+ */
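+ /*
+ * For example, with the default bgwriter_delay of 200ms that is
+ * 120000 / 200 = 600 invocations per sweep; with 16384 shared buffers
+ * (128MB at 8kB pages) the floor works out to about 27 buffers per call.
+ */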
+ min_scan_buffers = (int) (NBuffers / (scan_whole_pool_milliseconds / BgWriterDelay));
+
+ if (upcoming_alloc_est < (min_scan_buffers + reusable_buffers_est))
+ {
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: alloc_est=%d too small, using min=%d + reusable_est=%d",
+ upcoming_alloc_est, min_scan_buffers, reusable_buffers_est);
+#endif
+ upcoming_alloc_est = min_scan_buffers + reusable_buffers_est;
+ }
+
+ /*
+ * Now write out dirty reusable buffers, working forward from the
+ * next_to_clean point, until we have lapped the strategy scan, or cleaned
+ * enough buffers to match our estimate of the next cycle's allocation
+ * requirements, or hit the bgwriter_lru_maxpages limit.
+ */
+
+ /* Make sure we can handle the pin inside SyncOneBuffer */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ num_to_scan = bufs_to_lap;
+ num_written = 0;
+ reusable_buffers = reusable_buffers_est;
+
+ /* Execute the LRU scan */
+ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est)
+ {
+ int sync_state = SyncOneBuffer(next_to_clean, true,
+ wb_context);
+
+ if (++next_to_clean >= NBuffers)
+ {
+ next_to_clean = 0;
+ next_passes++;
+ }
+ num_to_scan--;
+
+ if (sync_state & BUF_WRITTEN)
+ {
+ reusable_buffers++;
+ if (++num_written >= bgwriter_lru_maxpages)
+ {
+ PendingBgWriterStats.maxwritten_clean++;
+ break;
+ }
+ }
+ else if (sync_state & BUF_REUSABLE)
+ reusable_buffers++;
+ }
+
+ PendingBgWriterStats.buf_written_clean += num_written;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG1, "bgwriter: recent_alloc=%u smoothed=%.2f delta=%ld ahead=%d density=%.2f reusable_est=%d upcoming_est=%d scanned=%d wrote=%d reusable=%d",
+ recent_alloc, smoothed_alloc, strategy_delta, bufs_ahead,
+ smoothed_density, reusable_buffers_est, upcoming_alloc_est,
+ bufs_to_lap - num_to_scan,
+ num_written,
+ reusable_buffers - reusable_buffers_est);
+#endif
+
+ /*
+ * Consider the above scan as being like a new allocation scan.
+ * Characterize its density and update the smoothed one based on it. This
+ * effectively halves the moving average period in cases where both the
+ * strategy and the background writer are doing some useful scanning,
+ * which is helpful because a long memory isn't as desirable on the
+ * density estimates.
+ */
+ new_strategy_delta = bufs_to_lap - num_to_scan;
+ new_recent_alloc = reusable_buffers - reusable_buffers_est;
+ if (new_strategy_delta > 0 && new_recent_alloc > 0)
+ {
+ scans_per_alloc = (float) new_strategy_delta / (float) new_recent_alloc;
+ smoothed_density += (scans_per_alloc - smoothed_density) /
+ smoothing_samples;
+
+#ifdef BGW_DEBUG
+ elog(DEBUG2, "bgwriter: cleaner density alloc=%u scan=%ld density=%.2f new smoothed=%.2f",
+ new_recent_alloc, new_strategy_delta,
+ scans_per_alloc, smoothed_density);
+#endif
+ }
+
+ /* Return true if OK to hibernate */
+ return (bufs_to_lap == 0 && recent_alloc == 0);
+}
+
+/*
+ * SyncOneBuffer -- process a single buffer during syncing.
+ *
+ * If skip_recently_used is true, we don't write currently-pinned buffers, nor
+ * buffers marked recently used, as these are not replacement candidates.
+ *
+ * Returns a bitmask containing the following flag bits:
+ * BUF_WRITTEN: we wrote the buffer.
+ * BUF_REUSABLE: buffer is available for replacement, ie, it has
+ * pin count 0 and usage count 0.
+ *
+ * (BUF_WRITTEN could be set in error if FlushBuffer finds the buffer clean
+ * after locking it, but we don't care all that much.)
+ *
+ * Note: caller must have done ResourceOwnerEnlargeBuffers.
+ */
+static int
+SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
+{
+ BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
+ int result = 0;
+ uint32 buf_state;
+ BufferTag tag;
+
+ ReservePrivateRefCountEntry();
+
+ /*
+ * Check whether buffer needs writing.
+ *
+ * We can make this check without taking the buffer content lock so long
+ * as we mark pages dirty in access methods *before* logging changes with
+ * XLogInsert(): if someone marks the buffer dirty just after our check, we
+ * don't worry because our checkpoint.redo position points before the log
+ * record for the upcoming changes, so we are not required to write such a
+ * dirty buffer.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
+ BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
+ {
+ result |= BUF_REUSABLE;
+ }
+ else if (skip_recently_used)
+ {
+ /* Caller told us not to write recently-used buffers */
+ UnlockBufHdr(bufHdr, buf_state);
+ return result;
+ }
+
+ if (!(buf_state & BM_VALID) || !(buf_state & BM_DIRTY))
+ {
+ /* It's clean, so nothing to do */
+ UnlockBufHdr(bufHdr, buf_state);
+ return result;
+ }
+
+ /*
+ * Pin it, share-lock it, write it. (FlushBuffer will do nothing if the
+ * buffer is clean by the time we've locked it.)
+ */
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+
+ FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+
+ tag = bufHdr->tag;
+
+ UnpinBuffer(bufHdr);
+
+ /*
+ * SyncOneBuffer() is only called by checkpointer and bgwriter, so
+ * IOContext will always be IOCONTEXT_NORMAL.
+ */
+ ScheduleBufferTagForWriteback(wb_context, IOCONTEXT_NORMAL, &tag);
+
+ return result | BUF_WRITTEN;
+}
+
+/*
+ * AtEOXact_Buffers - clean up at end of transaction.
+ *
+ * As of PostgreSQL 8.0, buffer pins should get released by the
+ * ResourceOwner mechanism. This routine is just a debugging
+ * cross-check that no pins remain.
+ */
+void
+AtEOXact_Buffers(bool isCommit)
+{
+ CheckForBufferLeaks();
+
+ AtEOXact_LocalBuffers(isCommit);
+
+ Assert(PrivateRefCountOverflowed == 0);
+}
+
+/*
+ * Initialize access to shared buffer pool
+ *
+ * This is called during backend startup (whether standalone or under the
+ * postmaster). It sets up for this backend's access to the already-existing
+ * buffer pool.
+ */
+void
+InitBufferPoolAccess(void)
+{
+ HASHCTL hash_ctl;
+
+ memset(&PrivateRefCountArray, 0, sizeof(PrivateRefCountArray));
+
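+ /*
+ * The hash table is keyed by shared buffer number and acts as the
+ * overflow store once more buffers are pinned than fit in the small
+ * fixed-size array cleared above.
+ */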
+ hash_ctl.keysize = sizeof(int32);
+ hash_ctl.entrysize = sizeof(PrivateRefCountEntry);
+
+ PrivateRefCountHash = hash_create("PrivateRefCount", 100, &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+
+ /*
+ * AtProcExit_Buffers needs LWLock access, and thereby has to be called at
+ * the corresponding phase of backend shutdown.
+ */
+ Assert(MyProc != NULL);
+ on_shmem_exit(AtProcExit_Buffers, 0);
+}
+
+/*
+ * During backend exit, ensure that we released all shared-buffer locks and
+ * assert that we have no remaining pins.
+ */
+static void
+AtProcExit_Buffers(int code, Datum arg)
+{
+ UnlockBuffers();
+
+ CheckForBufferLeaks();
+
+ /* localbuf.c needs a chance too */
+ AtProcExit_LocalBuffers();
+}
+
+/*
+ * CheckForBufferLeaks - ensure this backend holds no buffer pins
+ *
+ * As of PostgreSQL 8.0, buffer pins should get released by the
+ * ResourceOwner mechanism. This routine is just a debugging
+ * cross-check that no pins remain.
+ */
+static void
+CheckForBufferLeaks(void)
+{
+#ifdef USE_ASSERT_CHECKING
+ int RefCountErrors = 0;
+ PrivateRefCountEntry *res;
+ int i;
+
+ /* check the array */
+ for (i = 0; i < REFCOUNT_ARRAY_ENTRIES; i++)
+ {
+ res = &PrivateRefCountArray[i];
+
+ if (res->buffer != InvalidBuffer)
+ {
+ PrintBufferLeakWarning(res->buffer);
+ RefCountErrors++;
+ }
+ }
+
+ /* if necessary search the hash */
+ if (PrivateRefCountOverflowed)
+ {
+ HASH_SEQ_STATUS hstat;
+
+ hash_seq_init(&hstat, PrivateRefCountHash);
+ while ((res = (PrivateRefCountEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ PrintBufferLeakWarning(res->buffer);
+ RefCountErrors++;
+ }
+ }
+
+ Assert(RefCountErrors == 0);
+#endif
+}
+
+/*
+ * Helper routine to issue warnings when a buffer is unexpectedly pinned
+ */
+void
+PrintBufferLeakWarning(Buffer buffer)
+{
+ BufferDesc *buf;
+ int32 loccount;
+ char *path;
+ BackendId backend;
+ uint32 buf_state;
+
+ Assert(BufferIsValid(buffer));
+ if (BufferIsLocal(buffer))
+ {
+ buf = GetLocalBufferDescriptor(-buffer - 1);
+ loccount = LocalRefCount[-buffer - 1];
+ backend = MyBackendId;
+ }
+ else
+ {
+ buf = GetBufferDescriptor(buffer - 1);
+ loccount = GetPrivateRefCount(buffer);
+ backend = InvalidBackendId;
+ }
+
+ /* theoretically we should lock the bufhdr here */
+ path = relpathbackend(BufTagGetRelFileLocator(&buf->tag), backend,
+ BufTagGetForkNum(&buf->tag));
+ buf_state = pg_atomic_read_u32(&buf->state);
+ elog(WARNING,
+ "buffer refcount leak: [%03d] "
+ "(rel=%s, blockNum=%u, flags=0x%x, refcount=%u %d)",
+ buffer, path,
+ buf->tag.blockNum, buf_state & BUF_FLAG_MASK,
+ BUF_STATE_GET_REFCOUNT(buf_state), loccount);
+ pfree(path);
+}
+
+/*
+ * CheckPointBuffers
+ *
+ * Flush all dirty blocks in buffer pool to disk at checkpoint time.
+ *
+ * Note: temporary relations do not participate in checkpoints, so they don't
+ * need to be flushed.
+ */
+void
+CheckPointBuffers(int flags)
+{
+ BufferSync(flags);
+}
+
+/*
+ * BufferGetBlockNumber
+ * Returns the block number associated with a buffer.
+ *
+ * Note:
+ * Assumes that the buffer is valid and pinned, else the
+ * value may be obsolete immediately...
+ */
+BlockNumber
+BufferGetBlockNumber(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ Assert(BufferIsPinned(buffer));
+
+ if (BufferIsLocal(buffer))
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* pinned, so OK to read tag without spinlock */
+ return bufHdr->tag.blockNum;
+}
+
+/*
+ * BufferGetTag
+ * Returns the relfilelocator, fork number and block number associated with
+ * a buffer.
+ */
+void
+BufferGetTag(Buffer buffer, RelFileLocator *rlocator, ForkNumber *forknum,
+ BlockNumber *blknum)
+{
+ BufferDesc *bufHdr;
+
+ /* Do the same checks as BufferGetBlockNumber. */
+ Assert(BufferIsPinned(buffer));
+
+ if (BufferIsLocal(buffer))
+ bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+ else
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* pinned, so OK to read tag without spinlock */
+ *rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
+ *forknum = BufTagGetForkNum(&bufHdr->tag);
+ *blknum = bufHdr->tag.blockNum;
+}
+
+/*
+ * FlushBuffer
+ * Physically write out a shared buffer.
+ *
+ * NOTE: this actually just passes the buffer contents to the kernel; the
+ * real write to disk won't happen until the kernel feels like it. This
+ * is okay from our point of view since we can redo the changes from WAL.
+ * However, we will need to force the changes to disk via fsync before
+ * we can checkpoint WAL.
+ *
+ * The caller must hold a pin on the buffer and have share-locked the
+ * buffer contents. (Note: a share-lock does not prevent updates of
+ * hint bits in the buffer, so the page could change while the write
+ * is in progress, but we assume that that will not invalidate the data
+ * written.)
+ *
+ * If the caller has an smgr reference for the buffer's relation, pass it
+ * as the second parameter. If not, pass NULL.
+ */
+static void
+FlushBuffer(BufferDesc *buf, SMgrRelation reln, IOObject io_object,
+ IOContext io_context)
+{
+ XLogRecPtr recptr;
+ ErrorContextCallback errcallback;
+ instr_time io_start;
+ Block bufBlock;
+ char *bufToWrite;
+ uint32 buf_state;
+
+ /*
+ * Try to start an I/O operation. If StartBufferIO returns false, then
+ * someone else flushed the buffer before we could, so we need not do
+ * anything.
+ */
+ if (!StartBufferIO(buf, false))
+ return;
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = shared_buffer_write_error_callback;
+ errcallback.arg = (void *) buf;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ /* Find smgr relation for buffer */
+ if (reln == NULL)
+ reln = smgropen(BufTagGetRelFileLocator(&buf->tag), InvalidBackendId);
+
+ TRACE_POSTGRESQL_BUFFER_FLUSH_START(BufTagGetForkNum(&buf->tag),
+ buf->tag.blockNum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber);
+
+ buf_state = LockBufHdr(buf);
+
+ /*
+ * Run PageGetLSN while holding header lock, since we don't have the
+ * buffer locked exclusively in all cases.
+ */
+ recptr = BufferGetLSN(buf);
+
+ /*
+ * Clear BM_JUST_DIRTIED so we can detect whether the block content
+ * changes while we are flushing it; see TerminateBufferIO. - vadim 01/17/97
+ */
+ buf_state &= ~BM_JUST_DIRTIED;
+ UnlockBufHdr(buf, buf_state);
+
+ /*
+ * Force XLOG flush up to buffer's LSN. This implements the basic WAL
+ * rule that log updates must hit disk before any of the data-file changes
+ * they describe do.
+ *
+ * However, this rule does not apply to unlogged relations, which will be
+ * lost after a crash anyway. Most unlogged relation pages do not bear
+ * LSNs since we never emit WAL records for them, and therefore flushing
+ * up through the buffer LSN would be useless, but harmless. However,
+ * GiST indexes use LSNs internally to track page-splits, and therefore
+ * unlogged GiST pages bear "fake" LSNs generated by
+ * GetFakeLSNForUnloggedRel. It is unlikely but possible that the fake
+ * LSN counter could advance past the WAL insertion point; and if it did
+ * happen, attempting to flush WAL through that location would fail, with
+ * disastrous system-wide consequences. To make sure that can't happen,
+ * skip the flush if the buffer isn't permanent.
+ */
+ if (buf_state & BM_PERMANENT)
+ XLogFlush(recptr);
+
+ /*
+ * Now it's safe to write buffer to disk. Note that no one else should
+ * have been able to write it while we were busy with log flushing because
+ * only one process at a time can set the BM_IO_IN_PROGRESS bit.
+ */
+ bufBlock = BufHdrGetBlock(buf);
+
+ /*
+ * Update page checksum if desired. Since we have only shared lock on the
+ * buffer, other processes might be updating hint bits in it, so we must
+ * copy the page to private storage if we do checksumming.
+ */
+ bufToWrite = PageSetChecksumCopy((Page) bufBlock, buf->tag.blockNum);
+
+ io_start = pgstat_prepare_io_time();
+
+ /*
+ * bufToWrite is either the shared buffer or a copy, as appropriate.
+ */
+ smgrwrite(reln,
+ BufTagGetForkNum(&buf->tag),
+ buf->tag.blockNum,
+ bufToWrite,
+ false);
+
+ /*
+ * When a strategy is in use, only flushes of dirty buffers already in the
+ * strategy ring are counted as strategy writes (IOCONTEXT
+ * [BULKREAD|BULKWRITE|VACUUM] IOOP_WRITE) for the purpose of IO
+ * statistics tracking.
+ *
+ * If a shared buffer initially added to the ring must be flushed before
+ * being used, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE.
+ *
+ * If a shared buffer which was added to the ring later because the
+ * current strategy buffer is pinned or in use or because all strategy
+ * buffers were dirty and rejected (for BAS_BULKREAD operations only)
+ * requires flushing, this is counted as an IOCONTEXT_NORMAL IOOP_WRITE
+ * (from_ring will be false).
+ *
+ * When a strategy is not in use, the write can only be a "regular" write
+ * of a dirty shared buffer (IOCONTEXT_NORMAL IOOP_WRITE).
+ */
+ pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
+ IOOP_WRITE, io_start, 1);
+
+ pgBufferUsage.shared_blks_written++;
+
+ /*
+ * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
+ * end the BM_IO_IN_PROGRESS state.
+ */
+ TerminateBufferIO(buf, true, 0);
+
+ TRACE_POSTGRESQL_BUFFER_FLUSH_DONE(BufTagGetForkNum(&buf->tag),
+ buf->tag.blockNum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber);
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+}
+
+/*
+ * RelationGetNumberOfBlocksInFork
+ * Determines the current number of pages in the specified relation fork.
+ *
+ * Note that the accuracy of the result will depend on the details of the
+ * relation's storage. For builtin AMs it'll be accurate, but for external AMs
+ * it might not be.
+ */
+BlockNumber
+RelationGetNumberOfBlocksInFork(Relation relation, ForkNumber forkNum)
+{
+ if (RELKIND_HAS_TABLE_AM(relation->rd_rel->relkind))
+ {
+ /*
+ * Not every table AM uses BLCKSZ wide fixed size blocks. Therefore
+ * tableam returns the size in bytes - but for the purpose of this
+ * routine, we want the number of blocks. Therefore divide, rounding
+ * up.
+ */
+ uint64 szbytes;
+
+ szbytes = table_relation_size(relation, forkNum);
+
+ return (szbytes + (BLCKSZ - 1)) / BLCKSZ;
+ }
+ else if (RELKIND_HAS_STORAGE(relation->rd_rel->relkind))
+ {
+ return smgrnblocks(RelationGetSmgr(relation), forkNum);
+ }
+ else
+ Assert(false);
+
+ return 0; /* keep compiler quiet */
+}
+
+/*
+ * BufferIsPermanent
+ * Determines whether a buffer will potentially still be around after
+ * a crash. Caller must hold a buffer pin.
+ */
+bool
+BufferIsPermanent(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ /* Local buffers are used only for temp relations. */
+ if (BufferIsLocal(buffer))
+ return false;
+
+ /* Make sure we've got a real buffer, and that we hold a pin on it. */
+ Assert(BufferIsValid(buffer));
+ Assert(BufferIsPinned(buffer));
+
+ /*
+ * BM_PERMANENT can't be changed while we hold a pin on the buffer, so we
+ * need not bother with the buffer header spinlock. Even if someone else
+ * changes the buffer header state while we're doing this, the state is
+ * changed atomically, so we'll read the old value or the new value, but
+ * not random garbage.
+ */
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ return (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT) != 0;
+}
+
+/*
+ * BufferGetLSNAtomic
+ * Retrieves the LSN of the buffer atomically using a buffer header lock.
+ * This is necessary for some callers who may not have an exclusive lock
+ * on the buffer.
+ */
+XLogRecPtr
+BufferGetLSNAtomic(Buffer buffer)
+{
+ BufferDesc *bufHdr = GetBufferDescriptor(buffer - 1);
+ char *page = BufferGetPage(buffer);
+ XLogRecPtr lsn;
+ uint32 buf_state;
+
+ /*
+ * If we don't need locking for correctness, fastpath out.
+ */
+ if (!XLogHintBitIsNeeded() || BufferIsLocal(buffer))
+ return PageGetLSN(page);
+
+ /* Make sure we've got a real buffer, and that we hold a pin on it. */
+ Assert(BufferIsValid(buffer));
+ Assert(BufferIsPinned(buffer));
+
+ buf_state = LockBufHdr(bufHdr);
+ lsn = PageGetLSN(page);
+ UnlockBufHdr(bufHdr, buf_state);
+
+ return lsn;
+}
+
+/* ---------------------------------------------------------------------
+ * DropRelationBuffers
+ *
+ * This function removes from the buffer pool all the pages of the
+ * specified relation forks that have block numbers >= firstDelBlock.
+ * (In particular, with firstDelBlock = 0, all pages are removed.)
+ * Dirty pages are simply dropped, without bothering to write them
+ * out first. Therefore, this is NOT rollback-able, and so should be
+ * used only with extreme caution!
+ *
+ * Currently, this is called only from smgr.c when the underlying file
+ * is about to be deleted or truncated (firstDelBlock is needed for
+ * the truncation case). The data in the affected pages would therefore
+ * be deleted momentarily anyway, and there is no point in writing it.
+ * It is the responsibility of higher-level code to ensure that the
+ * deletion or truncation does not lose any data that could be needed
+ * later. It is also the responsibility of higher-level code to ensure
+ * that no other process could be trying to load more pages of the
+ * relation into buffers.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelationBuffers(SMgrRelation smgr_reln, ForkNumber *forkNum,
+ int nforks, BlockNumber *firstDelBlock)
+{
+ int i;
+ int j;
+ RelFileLocatorBackend rlocator;
+ BlockNumber nForkBlock[MAX_FORKNUM];
+ uint64 nBlocksToInvalidate = 0;
+
+ rlocator = smgr_reln->smgr_rlocator;
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ if (RelFileLocatorBackendIsTemp(rlocator))
+ {
+ if (rlocator.backend == MyBackendId)
+ {
+ for (j = 0; j < nforks; j++)
+ DropRelationLocalBuffers(rlocator.locator, forkNum[j],
+ firstDelBlock[j]);
+ }
+ return;
+ }
+
+ /*
+ * To remove all the pages of the specified relation forks from the buffer
+ * pool, we would normally need to scan the entire buffer pool, but we can
+ * optimize this by looking the buffers up in the BufMapping table, provided
+ * we know the exact size of each fork of the relation. The exact size is
+ * required to ensure that we don't leave behind any buffer for the relation
+ * being dropped, as otherwise the background writer or checkpointer could
+ * hit a PANIC while flushing buffers corresponding to files that no longer
+ * exist.
+ *
+ * To know the exact size, we rely on the size we cached for each fork
+ * during recovery. This limits the optimization to recovery and to
+ * standbys, but it could easily be extended once we have a shared cache
+ * for relation sizes.
+ *
+ * In recovery, we cache the value returned by the first lseek(SEEK_END)
+ * and future writes keep the cached value up-to-date. See
+ * smgrextend. It is possible that the value of the first lseek is smaller
+ * than the actual number of existing blocks in the file due to buggy
+ * Linux kernels that might not have accounted for the recent write. But
+ * that should be fine because there must not be any buffers after that
+ * file size.
+ */
+ for (i = 0; i < nforks; i++)
+ {
+ /* Get the number of blocks for a relation's fork */
+ nForkBlock[i] = smgrnblocks_cached(smgr_reln, forkNum[i]);
+
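+ /*
+ * If the size isn't cached, flag the total as InvalidBlockNumber so
+ * the check below falls back to scanning the whole buffer pool.
+ */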
+ if (nForkBlock[i] == InvalidBlockNumber)
+ {
+ nBlocksToInvalidate = InvalidBlockNumber;
+ break;
+ }
+
+ /* calculate the number of blocks to be invalidated */
+ nBlocksToInvalidate += (nForkBlock[i] - firstDelBlock[i]);
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (BlockNumberIsValid(nBlocksToInvalidate) &&
+ nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (j = 0; j < nforks; j++)
+ FindAndDropRelationBuffers(rlocator.locator, forkNum[j],
+ nForkBlock[j], firstDelBlock[j]);
+ return;
+ }
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * We can make this a tad faster by prechecking the buffer tag before
+ * we attempt to lock the buffer; this saves a lot of lock
+ * acquisitions in typical cases. It should be safe because the
+ * caller must have AccessExclusiveLock on the relation, or some other
+ * reason to be certain that no one is loading new pages of the rel
+ * into the buffer pool. (Otherwise we might well miss such pages
+ * entirely.) Therefore, while the tag might be changing while we
+ * look at it, it can't be changing *to* a value we care about, only
+ * *away* from such a value. So false negatives are impossible, and
+ * false positives are safe because we'll recheck after getting the
+ * buffer lock.
+ *
+ * We could check forkNum and blockNum as well as the rlocator, but
+ * the incremental win from doing so seems small.
+ */
+ if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator))
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+
+ for (j = 0; j < nforks; j++)
+ {
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator.locator) &&
+ BufTagGetForkNum(&bufHdr->tag) == forkNum[j] &&
+ bufHdr->tag.blockNum >= firstDelBlock[j])
+ {
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ break;
+ }
+ }
+ if (j >= nforks)
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * DropRelationsAllBuffers
+ *
+ * This function removes from the buffer pool all the pages of all
+ * forks of the specified relations. It's equivalent to calling
+ * DropRelationBuffers once per fork per relation with firstDelBlock = 0.
+ * --------------------------------------------------------------------
+ */
+void
+DropRelationsAllBuffers(SMgrRelation *smgr_reln, int nlocators)
+{
+ int i;
+ int n = 0;
+ SMgrRelation *rels;
+ BlockNumber (*block)[MAX_FORKNUM + 1];
+ uint64 nBlocksToInvalidate = 0;
+ RelFileLocator *locators;
+ bool cached = true;
+ bool use_bsearch;
+
+ if (nlocators == 0)
+ return;
+
+ rels = palloc(sizeof(SMgrRelation) * nlocators); /* non-local relations */
+
+ /* If it's a local relation, it's localbuf.c's problem. */
+ for (i = 0; i < nlocators; i++)
+ {
+ if (RelFileLocatorBackendIsTemp(smgr_reln[i]->smgr_rlocator))
+ {
+ if (smgr_reln[i]->smgr_rlocator.backend == MyBackendId)
+ DropRelationAllLocalBuffers(smgr_reln[i]->smgr_rlocator.locator);
+ }
+ else
+ rels[n++] = smgr_reln[i];
+ }
+
+ /*
+ * If there are no non-local relations, then we're done. Release the
+ * memory and return.
+ */
+ if (n == 0)
+ {
+ pfree(rels);
+ return;
+ }
+
+ /*
+ * This is used to remember the number of blocks for all forks of all the
+ * relations.
+ */
+ block = (BlockNumber (*)[MAX_FORKNUM + 1])
+ palloc(sizeof(BlockNumber) * n * (MAX_FORKNUM + 1));
+
+ /*
+ * We can avoid scanning the entire buffer pool if we know the exact size
+ * of each of the given relation forks. See DropRelationBuffers.
+ */
+ for (i = 0; i < n && cached; i++)
+ {
+ for (int j = 0; j <= MAX_FORKNUM; j++)
+ {
+ /* Get the number of blocks for a relation's fork. */
+ block[i][j] = smgrnblocks_cached(rels[i], j);
+
+ /* We only need to consider the relation forks that exist. */
+ if (block[i][j] == InvalidBlockNumber)
+ {
+ if (!smgrexists(rels[i], j))
+ continue;
+ cached = false;
+ break;
+ }
+
+ /* calculate the total number of blocks to be invalidated */
+ nBlocksToInvalidate += block[i][j];
+ }
+ }
+
+ /*
+ * We apply the optimization iff the total number of blocks to invalidate
+ * is below the BUF_DROP_FULL_SCAN_THRESHOLD.
+ */
+ if (cached && nBlocksToInvalidate < BUF_DROP_FULL_SCAN_THRESHOLD)
+ {
+ for (i = 0; i < n; i++)
+ {
+ for (int j = 0; j <= MAX_FORKNUM; j++)
+ {
+ /* ignore relation forks that don't exist */
+ if (!BlockNumberIsValid(block[i][j]))
+ continue;
+
+ /* drop all the buffers for a particular relation fork */
+ FindAndDropRelationBuffers(rels[i]->smgr_rlocator.locator,
+ j, block[i][j], 0);
+ }
+ }
+
+ pfree(block);
+ pfree(rels);
+ return;
+ }
+
+ pfree(block);
+ locators = palloc(sizeof(RelFileLocator) * n); /* non-local relations */
+ for (i = 0; i < n; i++)
+ locators[i] = rels[i]->smgr_rlocator.locator;
+
+ /*
+ * For a low number of relations to drop, just use a simple walk-through to
+ * save the bsearch overhead. The threshold used is more of a guess than
+ * an exactly determined value, as it depends on many factors (CPU and RAM
+ * speeds, amount of shared buffers etc.).
+ */
+ use_bsearch = n > RELS_BSEARCH_THRESHOLD;
+
+ /* sort the list of rlocators if necessary */
+ if (use_bsearch)
+ pg_qsort(locators, n, sizeof(RelFileLocator), rlocator_comparator);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ RelFileLocator *rlocator = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelationBuffers, an unlocked precheck should be safe and
+ * saves some cycles.
+ */
+
+ if (!use_bsearch)
+ {
+ int j;
+
+ for (j = 0; j < n; j++)
+ {
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &locators[j]))
+ {
+ rlocator = &locators[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ RelFileLocator locator;
+
+ locator = BufTagGetRelFileLocator(&bufHdr->tag);
+ rlocator = bsearch((const void *) &(locator),
+ locators, n, sizeof(RelFileLocator),
+ rlocator_comparator);
+ }
+
+ /* buffer doesn't belong to any of the given relfilelocators; skip it */
+ if (rlocator == NULL)
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, rlocator))
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ pfree(locators);
+ pfree(rels);
+}
+
+/* ---------------------------------------------------------------------
+ * FindAndDropRelationBuffers
+ *
+ * This function performs a lookup in the BufMapping table and removes from
+ * the buffer pool all pages of the specified relation fork that have block
+ * numbers >= firstDelBlock. (In particular, with firstDelBlock = 0, all
+ * pages are removed.)
+ * --------------------------------------------------------------------
+ */
+static void
+FindAndDropRelationBuffers(RelFileLocator rlocator, ForkNumber forkNum,
+ BlockNumber nForkBlock,
+ BlockNumber firstDelBlock)
+{
+ BlockNumber curBlock;
+
+ for (curBlock = firstDelBlock; curBlock < nForkBlock; curBlock++)
+ {
+ uint32 bufHash; /* hash value for tag */
+ BufferTag bufTag; /* identity of requested block */
+ LWLock *bufPartitionLock; /* buffer partition lock for it */
+ int buf_id;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ /* create a tag so we can lookup the buffer */
+ InitBufferTag(&bufTag, &rlocator, forkNum, curBlock);
+
+ /* determine its hash code and partition lock ID */
+ bufHash = BufTableHashCode(&bufTag);
+ bufPartitionLock = BufMappingPartitionLock(bufHash);
+
+ /* Check that it is in the buffer pool. If not, do nothing. */
+ LWLockAcquire(bufPartitionLock, LW_SHARED);
+ buf_id = BufTableLookup(&bufTag, bufHash);
+ LWLockRelease(bufPartitionLock);
+
+ if (buf_id < 0)
+ continue;
+
+ bufHdr = GetBufferDescriptor(buf_id);
+
+ /*
+ * We need to lock the buffer header and recheck if the buffer is
+ * still associated with the same block because the buffer could be
+ * evicted by some other backend loading blocks for a different
+ * relation after we release lock on the BufMapping table.
+ */
+ buf_state = LockBufHdr(bufHdr);
+
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
+ BufTagGetForkNum(&bufHdr->tag) == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * DropDatabaseBuffers
+ *
+ * This function removes all the buffers in the buffer cache for a
+ * particular database. Dirty pages are simply dropped, without
+ * bothering to write them out first. This is used when we destroy a
+ * database, to avoid trying to flush data to disk when the directory
+ * tree no longer exists. Implementation is pretty similar to
+ * DropRelationBuffers() which is for destroying just one relation.
+ * --------------------------------------------------------------------
+ */
+void
+DropDatabaseBuffers(Oid dbid)
+{
+ int i;
+
+ /*
+ * We needn't consider local buffers, since by assumption the target
+ * database isn't our own.
+ */
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelationBuffers, an unlocked precheck should be safe and
+ * saves some cycles.
+ */
+ if (bufHdr->tag.dbOid != dbid)
+ continue;
+
+ buf_state = LockBufHdr(bufHdr);
+ if (bufHdr->tag.dbOid == dbid)
+ InvalidateBuffer(bufHdr); /* releases spinlock */
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* -----------------------------------------------------------------
+ * PrintBufferDescs
+ *
+ * this function prints all the buffer descriptors, for debugging
+ * use only.
+ * -----------------------------------------------------------------
+ */
+#ifdef NOT_USED
+void
+PrintBufferDescs(void)
+{
+ int i;
+
+ for (i = 0; i < NBuffers; ++i)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+ Buffer b = BufferDescriptorGetBuffer(buf);
+
+ /* theoretically we should lock the bufhdr here */
+ elog(LOG,
+ "[%02d] (freeNext=%d, rel=%s, "
+ "blockNum=%u, flags=0x%x, refcount=%u %d)",
+ i, buf->freeNext,
+ relpathbackend(BufTagGetRelFileLocator(&buf->tag),
+ InvalidBackendId, BufTagGetForkNum(&buf->tag)),
+ buf->tag.blockNum, buf->flags,
+ buf->refcount, GetPrivateRefCount(b));
+ }
+}
+#endif
+
+#ifdef NOT_USED
+void
+PrintPinnedBufs(void)
+{
+ int i;
+
+ for (i = 0; i < NBuffers; ++i)
+ {
+ BufferDesc *buf = GetBufferDescriptor(i);
+ Buffer b = BufferDescriptorGetBuffer(buf);
+
+ if (GetPrivateRefCount(b) > 0)
+ {
+ /* theoretically we should lock the bufhdr here */
+ elog(LOG,
+ "[%02d] (freeNext=%d, rel=%s, "
+ "blockNum=%u, flags=0x%x, refcount=%u %d)",
+ i, buf->freeNext,
+ relpathperm(BufTagGetRelFileLocator(&buf->tag),
+ BufTagGetForkNum(&buf->tag)),
+ buf->tag.blockNum, buf->flags,
+ buf->refcount, GetPrivateRefCount(b));
+ }
+ }
+}
+#endif
+
+/* ---------------------------------------------------------------------
+ * FlushRelationBuffers
+ *
+ * This function writes all dirty pages of a relation out to disk
+ * (or more accurately, out to kernel disk buffers), ensuring that the
+ * kernel has an up-to-date view of the relation.
+ *
+ * Generally, the caller should be holding AccessExclusiveLock on the
+ * target relation to ensure that no other backend is busy dirtying
+ * more blocks of the relation; the effects can't be expected to last
+ * after the lock is released.
+ *
+ * XXX currently it sequentially searches the buffer pool; this should be
+ * changed to a more clever way of searching. This routine is not
+ * used in any performance-critical code paths, so it's not worth
+ * adding additional overhead to normal paths to make it go faster.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationBuffers(Relation rel)
+{
+ int i;
+ BufferDesc *bufHdr;
+
+ if (RelationUsesLocalBuffers(rel))
+ {
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ uint32 buf_state;
+ instr_time io_start;
+
+ bufHdr = GetLocalBufferDescriptor(i);
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
+ ((buf_state = pg_atomic_read_u32(&bufHdr->state)) &
+ (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ ErrorContextCallback errcallback;
+ Page localpage;
+
+ localpage = (char *) LocalBufHdrGetBlock(bufHdr);
+
+ /* Setup error traceback support for ereport() */
+ errcallback.callback = local_buffer_write_error_callback;
+ errcallback.arg = (void *) bufHdr;
+ errcallback.previous = error_context_stack;
+ error_context_stack = &errcallback;
+
+ PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
+ io_start = pgstat_prepare_io_time();
+
+ smgrwrite(RelationGetSmgr(rel),
+ BufTagGetForkNum(&bufHdr->tag),
+ bufHdr->tag.blockNum,
+ localpage,
+ false);
+
+ pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION,
+ IOCONTEXT_NORMAL, IOOP_WRITE,
+ io_start, 1);
+
+ buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ pgBufferUsage.local_blks_written++;
+
+ /* Pop the error context stack */
+ error_context_stack = errcallback.previous;
+ }
+ }
+
+ return;
+ }
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ uint32 buf_state;
+
+ bufHdr = GetBufferDescriptor(i);
+
+ /*
+ * As in DropRelationBuffers, an unlocked precheck should be safe and
+ * saves some cycles.
+ */
+ if (!BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator))
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &rel->rd_locator) &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, RelationGetSmgr(rel), IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/* ---------------------------------------------------------------------
+ * FlushRelationsAllBuffers
+ *
+ * This function flushes out of the buffer pool all the pages of all
+ * forks of the specified smgr relations. It's equivalent to calling
+ * FlushRelationBuffers once per relation. The relations are assumed not
+ * to use local buffers.
+ * --------------------------------------------------------------------
+ */
+void
+FlushRelationsAllBuffers(SMgrRelation *smgrs, int nrels)
+{
+ int i;
+ SMgrSortArray *srels;
+ bool use_bsearch;
+
+ if (nrels == 0)
+ return;
+
+ /* fill-in array for qsort */
+ srels = palloc(sizeof(SMgrSortArray) * nrels);
+
+ for (i = 0; i < nrels; i++)
+ {
+ Assert(!RelFileLocatorBackendIsTemp(smgrs[i]->smgr_rlocator));
+
+ srels[i].rlocator = smgrs[i]->smgr_rlocator.locator;
+ srels[i].srel = smgrs[i];
+ }
+
+ /*
+ * Save the bsearch overhead for a low number of relations to sync. See
+ * DropRelationsAllBuffers for details.
+ */
+ use_bsearch = nrels > RELS_BSEARCH_THRESHOLD;
+
+ /* sort the list of SMgrRelations if necessary */
+ if (use_bsearch)
+ pg_qsort(srels, nrels, sizeof(SMgrSortArray), rlocator_comparator);
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ SMgrSortArray *srelent = NULL;
+ BufferDesc *bufHdr = GetBufferDescriptor(i);
+ uint32 buf_state;
+
+ /*
+ * As in DropRelationBuffers, an unlocked precheck should be safe and
+ * saves some cycles.
+ */
+
+ if (!use_bsearch)
+ {
+ int j;
+
+ for (j = 0; j < nrels; j++)
+ {
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srels[j].rlocator))
+ {
+ srelent = &srels[j];
+ break;
+ }
+ }
+ }
+ else
+ {
+ RelFileLocator rlocator;
+
+ rlocator = BufTagGetRelFileLocator(&bufHdr->tag);
+ srelent = bsearch((const void *) &(rlocator),
+ srels, nrels, sizeof(SMgrSortArray),
+ rlocator_comparator);
+ }
+
+ /* buffer doesn't belong to any of the given relfilelocators; skip it */
+ if (srelent == NULL)
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (BufTagMatchesRelFileLocator(&bufHdr->tag, &srelent->rlocator) &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, srelent->srel, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+
+ pfree(srels);
+}
+
+/* ---------------------------------------------------------------------
+ * RelationCopyStorageUsingBuffer
+ *
+ * Copy a fork's data using the buffer manager. Same as RelationCopyStorage,
+ * but instead of using smgrread and smgrextend this copies via the bufmgr
+ * APIs.
+ *
+ * Refer to the comments atop CreateAndCopyRelationData() for details about
+ * 'permanent' parameter.
+ * --------------------------------------------------------------------
+ */
+static void
+RelationCopyStorageUsingBuffer(RelFileLocator srclocator,
+ RelFileLocator dstlocator,
+ ForkNumber forkNum, bool permanent)
+{
+ Buffer srcBuf;
+ Buffer dstBuf;
+ Page srcPage;
+ Page dstPage;
+ bool use_wal;
+ BlockNumber nblocks;
+ BlockNumber blkno;
+ PGIOAlignedBlock buf;
+ BufferAccessStrategy bstrategy_src;
+ BufferAccessStrategy bstrategy_dst;
+
+ /*
+ * In general, we want to write WAL whenever wal_level > 'minimal', but we
+ * can skip it when copying any fork of an unlogged relation other than
+ * the init fork.
+ */
+ use_wal = XLogIsNeeded() && (permanent || forkNum == INIT_FORKNUM);
+
+ /* Get number of blocks in the source relation. */
+ nblocks = smgrnblocks(smgropen(srclocator, InvalidBackendId),
+ forkNum);
+
+ /* Nothing to copy; just return. */
+ if (nblocks == 0)
+ return;
+
+ /*
+ * Bulk-extend the destination relation to the same size as the source
+ * relation before starting to copy block by block.
+ */
+ memset(buf.data, 0, BLCKSZ);
+ smgrextend(smgropen(dstlocator, InvalidBackendId), forkNum, nblocks - 1,
+ buf.data, true);
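+
+ /*
+ * Writing just that final zeroed block is enough to extend the file to
+ * its full length in one call; the loop below then fills in every block
+ * with the source contents via the buffer manager.
+ */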
+
+ /* This is a bulk operation, so use buffer access strategies. */
+ bstrategy_src = GetAccessStrategy(BAS_BULKREAD);
+ bstrategy_dst = GetAccessStrategy(BAS_BULKWRITE);
+
+ /* Iterate over each block of the source relation file. */
+ for (blkno = 0; blkno < nblocks; blkno++)
+ {
+ CHECK_FOR_INTERRUPTS();
+
+ /* Read block from source relation. */
+ srcBuf = ReadBufferWithoutRelcache(srclocator, forkNum, blkno,
+ RBM_NORMAL, bstrategy_src,
+ permanent);
+ LockBuffer(srcBuf, BUFFER_LOCK_SHARE);
+ srcPage = BufferGetPage(srcBuf);
+
+ dstBuf = ReadBufferWithoutRelcache(dstlocator, forkNum, blkno,
+ RBM_ZERO_AND_LOCK, bstrategy_dst,
+ permanent);
+ dstPage = BufferGetPage(dstBuf);
+
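+ /*
+ * The critical section makes the page copy, the dirty marking and the
+ * WAL record effectively atomic: an error in between would otherwise
+ * leave a modified, dirty page with no WAL record backing it.
+ */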
+ START_CRIT_SECTION();
+
+ /* Copy page data from the source to the destination. */
+ memcpy(dstPage, srcPage, BLCKSZ);
+ MarkBufferDirty(dstBuf);
+
+ /* WAL-log the copied page. */
+ if (use_wal)
+ log_newpage_buffer(dstBuf, true);
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(dstBuf);
+ UnlockReleaseBuffer(srcBuf);
+ }
+
+ FreeAccessStrategy(bstrategy_src);
+ FreeAccessStrategy(bstrategy_dst);
+}
+
+/* ---------------------------------------------------------------------
+ * CreateAndCopyRelationData
+ *
+ * Create destination relation storage and copy all forks from the
+ * source relation to the destination.
+ *
+ * Pass permanent as true for permanent relations and false for
+ * unlogged relations. Currently this API is not supported for
+ * temporary relations.
+ * --------------------------------------------------------------------
+ */
+void
+CreateAndCopyRelationData(RelFileLocator src_rlocator,
+ RelFileLocator dst_rlocator, bool permanent)
+{
+ RelFileLocatorBackend rlocator;
+ char relpersistence;
+
+ /* Set the relpersistence. */
+ relpersistence = permanent ?
+ RELPERSISTENCE_PERMANENT : RELPERSISTENCE_UNLOGGED;
+
+ /*
+ * Create and copy all forks of the relation. During CREATE DATABASE we
+ * have a separate cleanup mechanism that deletes the complete database
+ * directory, so each individual relation doesn't need to be registered
+ * for cleanup.
+ */
+ RelationCreateStorage(dst_rlocator, relpersistence, false);
+
+ /* copy main fork. */
+ RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, MAIN_FORKNUM,
+ permanent);
+
+ /* copy those extra forks that exist */
+ for (ForkNumber forkNum = MAIN_FORKNUM + 1;
+ forkNum <= MAX_FORKNUM; forkNum++)
+ {
+ if (smgrexists(smgropen(src_rlocator, InvalidBackendId), forkNum))
+ {
+ smgrcreate(smgropen(dst_rlocator, InvalidBackendId), forkNum, false);
+
+ /*
+ * WAL log creation if the relation is persistent, or this is the
+ * init fork of an unlogged relation.
+ */
+ if (permanent || forkNum == INIT_FORKNUM)
+ log_smgrcreate(&dst_rlocator, forkNum);
+
+ /* Copy a fork's data, block by block. */
+ RelationCopyStorageUsingBuffer(src_rlocator, dst_rlocator, forkNum,
+ permanent);
+ }
+ }
+
+ /* Close the source and destination smgr relations, if open. */
+ rlocator.backend = InvalidBackendId;
+
+ rlocator.locator = src_rlocator;
+ smgrcloserellocator(rlocator);
+
+ rlocator.locator = dst_rlocator;
+ smgrcloserellocator(rlocator);
+}
+
+/* ---------------------------------------------------------------------
+ * FlushDatabaseBuffers
+ *
+ * This function writes all dirty pages of a database out to disk
+ * (or more accurately, out to kernel disk buffers), ensuring that the
+ * kernel has an up-to-date view of the database.
+ *
+ * Generally, the caller should be holding an appropriate lock to ensure
+ * no other backend is active in the target database; otherwise more
+ * pages could get dirtied.
+ *
+ * Note we don't worry about flushing any pages of temporary relations.
+ * It's assumed these wouldn't be interesting.
+ * --------------------------------------------------------------------
+ */
+void
+FlushDatabaseBuffers(Oid dbid)
+{
+ int i;
+ BufferDesc *bufHdr;
+
+ /* Make sure we can handle the pin inside the loop */
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ for (i = 0; i < NBuffers; i++)
+ {
+ uint32 buf_state;
+
+ bufHdr = GetBufferDescriptor(i);
+
+ /*
+ * As in DropRelationBuffers, an unlocked precheck should be safe and
+ * saves some cycles.
+ */
+ if (bufHdr->tag.dbOid != dbid)
+ continue;
+
+ ReservePrivateRefCountEntry();
+
+ buf_state = LockBufHdr(bufHdr);
+ if (bufHdr->tag.dbOid == dbid &&
+ (buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
+ {
+ PinBuffer_Locked(bufHdr);
+ LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_SHARED);
+ FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+ LWLockRelease(BufferDescriptorGetContentLock(bufHdr));
+ UnpinBuffer(bufHdr);
+ }
+ else
+ UnlockBufHdr(bufHdr, buf_state);
+ }
+}
+
+/*
+ * Flush a buffer that the caller has already pinned and content-locked
+ * (in either share or exclusive mode) out to the OS.
+ */
+void
+FlushOneBuffer(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+
+ /* currently not needed, but no fundamental reason not to support */
+ Assert(!BufferIsLocal(buffer));
+
+ Assert(BufferIsPinned(buffer));
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+
+ FlushBuffer(bufHdr, NULL, IOOBJECT_RELATION, IOCONTEXT_NORMAL);
+}
+
+/*
+ * ReleaseBuffer -- release the pin on a buffer
+ */
+void
+ReleaseBuffer(Buffer buffer)
+{
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ UnpinLocalBuffer(buffer);
+ else
+ UnpinBuffer(GetBufferDescriptor(buffer - 1));
+}
+
+/*
+ * UnlockReleaseBuffer -- release the content lock and pin on a buffer
+ *
+ * This is just a shorthand for a common combination.
+ */
+void
+UnlockReleaseBuffer(Buffer buffer)
+{
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ ReleaseBuffer(buffer);
+}
+
+/*
+ * IncrBufferRefCount
+ * Increment the pin count on a buffer that we have *already* pinned
+ * at least once.
+ *
+ * This function cannot be used on a buffer we do not have pinned,
+ * because it doesn't change the shared buffer state.
+ */
+void
+IncrBufferRefCount(Buffer buffer)
+{
+ Assert(BufferIsPinned(buffer));
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+ if (BufferIsLocal(buffer))
+ LocalRefCount[-buffer - 1]++;
+ else
+ {
+ PrivateRefCountEntry *ref;
+
+ ref = GetPrivateRefCountEntry(buffer, true);
+ Assert(ref != NULL);
+ ref->refcount++;
+ }
+ ResourceOwnerRememberBuffer(CurrentResourceOwner, buffer);
+}
+
+/*
+ * MarkBufferDirtyHint
+ *
+ * Mark a buffer dirty for non-critical changes.
+ *
+ * This is essentially the same as MarkBufferDirty, except:
+ *
+ * 1. The caller does not write WAL; so if checksums are enabled, we may need
+ * to write an XLOG_FPI_FOR_HINT WAL record to protect against torn pages.
+ * 2. The caller might have only share-lock instead of exclusive-lock on the
+ * buffer's content lock.
+ * 3. This function does not guarantee that the buffer is always marked dirty
+ * (due to a race condition), so it cannot be used for important changes.
+ */
+void
+MarkBufferDirtyHint(Buffer buffer, bool buffer_std)
+{
+ BufferDesc *bufHdr;
+ Page page = BufferGetPage(buffer);
+
+ if (!BufferIsValid(buffer))
+ elog(ERROR, "bad buffer ID: %d", buffer);
+
+ if (BufferIsLocal(buffer))
+ {
+ MarkLocalBufferDirty(buffer);
+ return;
+ }
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ Assert(GetPrivateRefCount(buffer) > 0);
+ /* here, either share or exclusive lock is OK */
+ Assert(LWLockHeldByMe(BufferDescriptorGetContentLock(bufHdr)));
+
+ /*
+ * This routine might get called many times on the same page, if we are
+ * making the first scan after commit of an xact that added/deleted many
+ * tuples. So, be as quick as we can if the buffer is already dirty. We
+ * do this by not acquiring spinlock if it looks like the status bits are
+ * already set. Since we make this test unlocked, there's a chance we
+ * might fail to notice that the flags have just been cleared, and failed
+ * to reset them, due to memory-ordering issues. But since this function
+ * is only intended to be used in cases where failing to write out the
+ * data would be harmless anyway, it doesn't really matter.
+ */
+ if ((pg_atomic_read_u32(&bufHdr->state) & (BM_DIRTY | BM_JUST_DIRTIED)) !=
+ (BM_DIRTY | BM_JUST_DIRTIED))
+ {
+ XLogRecPtr lsn = InvalidXLogRecPtr;
+ bool dirtied = false;
+ bool delayChkptFlags = false;
+ uint32 buf_state;
+
+ /*
+ * If we need to protect hint bit updates from torn writes, WAL-log a
+ * full page image of the page. This full page image is only necessary
+ * if the hint bit update is the first change to the page since the
+ * last checkpoint.
+ *
+ * We don't check full_page_writes here because that logic is included
+ * when we call XLogInsert() since the value changes dynamically.
+ */
+ if (XLogHintBitIsNeeded() &&
+ (pg_atomic_read_u32(&bufHdr->state) & BM_PERMANENT))
+ {
+ /*
+ * If we must not write WAL, due to a relfilelocator-specific
+ * condition or being in recovery, don't dirty the page. We can
+ * set the hint itself, we just don't dirty the page as a result, so
+ * the hint is lost when the page is evicted or at shutdown.
+ *
+ * See src/backend/storage/page/README for longer discussion.
+ */
+ if (RecoveryInProgress() ||
+ RelFileLocatorSkippingWAL(BufTagGetRelFileLocator(&bufHdr->tag)))
+ return;
+
+ /*
+ * If the block is already dirty because we either made a change
+ * or set a hint already, then we don't need to write a full page
+ * image. Note that aggressive cleaning of blocks dirtied by hint
+ * bit setting would increase the call rate. Bulk setting of hint
+ * bits would reduce the call rate...
+ *
+ * We must issue the WAL record before we mark the buffer dirty.
+ * Otherwise we might write the page before we write the WAL. That
+ * causes a race condition, since a checkpoint might occur between
+ * writing the WAL record and marking the buffer dirty. We solve
+ * that with a kluge, but one that is already in use during
+ * transaction commit to prevent race conditions. Basically, we
+ * simply prevent the checkpoint WAL record from being written
+ * until we have marked the buffer dirty. We don't start the
+ * checkpoint flush until we have marked the buffer dirty, so our
+ * checkpoint must flush the change to disk successfully or the
+ * checkpoint record never gets written, in which case crash recovery
+ * will fix things up.
+ *
+ * It's possible we may enter here without an xid, so it is
+ * essential that CreateCheckPoint waits for virtual transactions
+ * rather than full transactionids.
+ */
+ Assert((MyProc->delayChkptFlags & DELAY_CHKPT_START) == 0);
+ MyProc->delayChkptFlags |= DELAY_CHKPT_START;
+ delayChkptFlags = true;
+ lsn = XLogSaveBufferForHint(buffer, buffer_std);
+ }
+
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+
+ if (!(buf_state & BM_DIRTY))
+ {
+ dirtied = true; /* Means "will be dirtied by this action" */
+
+ /*
+ * Set the page LSN if we wrote a backup block. We aren't supposed
+ * to set this when only holding a share lock, but as long as we
+ * serialise it somehow we're OK. We choose to set LSN while
+ * holding the buffer header lock, which causes any reader of an
+ * LSN who holds only a share lock to also obtain a buffer header
+ * lock before using PageGetLSN(), which is enforced in
+ * BufferGetLSNAtomic().
+ *
+ * If checksums are enabled, you might think we should reset the
+ * checksum here. That will happen when the page is written
+ * sometime later in this checkpoint cycle.
+ */
+ if (!XLogRecPtrIsInvalid(lsn))
+ PageSetLSN(page, lsn);
+ }
+
+ buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
+ UnlockBufHdr(bufHdr, buf_state);
+
+ if (delayChkptFlags)
+ MyProc->delayChkptFlags &= ~DELAY_CHKPT_START;
+
+ if (dirtied)
+ {
+ VacuumPageDirty++;
+ pgBufferUsage.shared_blks_dirtied++;
+ if (VacuumCostActive)
+ VacuumCostBalance += VacuumCostPageDirty;
+ }
+ }
+}
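+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): a hypothetical
+ * caller that sets a hint on a page while holding only a share lock and
+ * then reports it with MarkBufferDirtyHint(). The helper name and the
+ * hint-setting step are assumptions for exposition; real hint-bit setters
+ * live in the access methods.
+ */
+#ifdef NOT_USED
+static void
+example_set_page_hint(Relation rel, BlockNumber blkno)
+{
+ Buffer buf = ReadBuffer(rel, blkno);
+
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+
+ /* ... inspect the page and decide that a hint bit can be set ... */
+
+ /* buffer_std = true: the page uses the standard page layout */
+ MarkBufferDirtyHint(buf, true);
+
+ UnlockReleaseBuffer(buf);
+}
+#endif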
+
+/*
+ * Release buffer content locks for shared buffers.
+ *
+ * Used to clean up after errors.
+ *
+ * Currently, we can expect that lwlock.c's LWLockReleaseAll() took care
+ * of releasing buffer content locks per se; the only thing we need to deal
+ * with here is clearing any PIN_COUNT request that was in progress.
+ */
+void
+UnlockBuffers(void)
+{
+ BufferDesc *buf = PinCountWaitBuf;
+
+ if (buf)
+ {
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(buf);
+
+ /*
+ * Don't complain if flag bit not set; it could have been reset but we
+ * got a cancel/die interrupt before getting the signal.
+ */
+ if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
+ buf->wait_backend_pgprocno == MyProc->pgprocno)
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+
+ UnlockBufHdr(buf, buf_state);
+
+ PinCountWaitBuf = NULL;
+ }
+}
+
+/*
+ * Acquire or release the content_lock for the buffer.
+ */
+void
+LockBuffer(Buffer buffer, int mode)
+{
+ BufferDesc *buf;
+
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ return; /* local buffers need no lock */
+
+ buf = GetBufferDescriptor(buffer - 1);
+
+ if (mode == BUFFER_LOCK_UNLOCK)
+ LWLockRelease(BufferDescriptorGetContentLock(buf));
+ else if (mode == BUFFER_LOCK_SHARE)
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_SHARED);
+ else if (mode == BUFFER_LOCK_EXCLUSIVE)
+ LWLockAcquire(BufferDescriptorGetContentLock(buf), LW_EXCLUSIVE);
+ else
+ elog(ERROR, "unrecognized buffer lock mode: %d", mode);
+}
+
+/*
+ * Acquire the content_lock for the buffer, but only if we don't have to wait.
+ *
+ * This assumes the caller wants BUFFER_LOCK_EXCLUSIVE mode.
+ */
+bool
+ConditionalLockBuffer(Buffer buffer)
+{
+ BufferDesc *buf;
+
+ Assert(BufferIsPinned(buffer));
+ if (BufferIsLocal(buffer))
+ return true; /* act as though we got it */
+
+ buf = GetBufferDescriptor(buffer - 1);
+
+ return LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+ LW_EXCLUSIVE);
+}
+
+/*
+ * Verify that this backend is pinning the buffer exactly once.
+ *
+ * NOTE: Like in BufferIsPinned(), what we check here is that *this* backend
+ * holds a pin on the buffer. We do not care whether some other backend does.
+ */
+void
+CheckBufferIsPinnedOnce(Buffer buffer)
+{
+ if (BufferIsLocal(buffer))
+ {
+ if (LocalRefCount[-buffer - 1] != 1)
+ elog(ERROR, "incorrect local pin count: %d",
+ LocalRefCount[-buffer - 1]);
+ }
+ else
+ {
+ if (GetPrivateRefCount(buffer) != 1)
+ elog(ERROR, "incorrect local pin count: %d",
+ GetPrivateRefCount(buffer));
+ }
+}
+
+/*
+ * LockBufferForCleanup - lock a buffer in preparation for deleting items
+ *
+ * Items may be deleted from a disk page only when the caller (a) holds an
+ * exclusive lock on the buffer and (b) has observed that no other backend
+ * holds a pin on the buffer. If there is a pin, then the other backend
+ * might have a pointer into the buffer (for example, a heapscan reference
+ * to an item --- see README for more details). It's OK if a pin is added
+ * after the cleanup starts, however; the newly-arrived backend will be
+ * unable to look at the page until we release the exclusive lock.
+ *
+ * To implement this protocol, a would-be deleter must pin the buffer and
+ * then call LockBufferForCleanup(). LockBufferForCleanup() is similar to
+ * LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE), except that it loops until
+ * it has successfully observed pin count = 1.
+ */
+void
+LockBufferForCleanup(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ TimestampTz waitStart = 0;
+ bool waiting = false;
+ bool logged_recovery_conflict = false;
+
+ Assert(BufferIsPinned(buffer));
+ Assert(PinCountWaitBuf == NULL);
+
+ CheckBufferIsPinnedOnce(buffer);
+
+ /* Nobody else to wait for */
+ if (BufferIsLocal(buffer))
+ return;
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ for (;;)
+ {
+ uint32 buf_state;
+
+ /* Try to acquire lock */
+ LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* Successfully acquired exclusive lock with pincount 1 */
+ UnlockBufHdr(bufHdr, buf_state);
+
+ /*
+ * Emit the log message if recovery conflict on buffer pin was
+ * resolved but the startup process waited longer than
+ * deadlock_timeout for it.
+ */
+ if (logged_recovery_conflict)
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ waitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ if (waiting)
+ {
+ /* reset ps display to remove the suffix if we added one */
+ set_ps_display_remove_suffix();
+ waiting = false;
+ }
+ return;
+ }
+ /* Failed, so mark myself as waiting for pincount 1 */
+ if (buf_state & BM_PIN_COUNT_WAITER)
+ {
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ elog(ERROR, "multiple backends attempting to wait for pincount 1");
+ }
+ bufHdr->wait_backend_pgprocno = MyProc->pgprocno;
+ PinCountWaitBuf = bufHdr;
+ buf_state |= BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+
+ /* Wait to be signaled by UnpinBuffer() */
+ if (InHotStandby)
+ {
+ if (!waiting)
+ {
+ /* adjust the process title to indicate that it's waiting */
+ set_ps_display_suffix("waiting");
+ waiting = true;
+ }
+
+ /*
+ * Emit the log message if the startup process is waiting longer
+ * than deadlock_timeout for recovery conflict on buffer pin.
+ *
+ * Skip this if first time through because the startup process has
+ * not started waiting yet in this case. So, the wait start
+ * timestamp is set after this logic.
+ */
+ if (waitStart != 0 && !logged_recovery_conflict)
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (TimestampDifferenceExceeds(waitStart, now,
+ DeadlockTimeout))
+ {
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN,
+ waitStart, now, NULL, true);
+ logged_recovery_conflict = true;
+ }
+ }
+
+ /*
+ * Set the wait start timestamp if logging is enabled and first
+ * time through.
+ */
+ if (log_recovery_conflict_waits && waitStart == 0)
+ waitStart = GetCurrentTimestamp();
+
+ /* Publish the bufid that Startup process waits on */
+ SetStartupBufferPinWaitBufId(buffer - 1);
+ /* Set alarm and then wait to be signaled by UnpinBuffer() */
+ ResolveRecoveryConflictWithBufferPin();
+ /* Reset the published bufid */
+ SetStartupBufferPinWaitBufId(-1);
+ }
+ else
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ /*
+ * Remove flag marking us as waiter. Normally this will not be set
+ * anymore, but ProcWaitForSignal() can return for other signals as
+ * well. We take care to only reset the flag if we're the waiter, as
+ * theoretically another backend could have started waiting. That's
+ * impossible with the current usages due to table level locking, but
+ * better be safe.
+ */
+ buf_state = LockBufHdr(bufHdr);
+ if ((buf_state & BM_PIN_COUNT_WAITER) != 0 &&
+ bufHdr->wait_backend_pgprocno == MyProc->pgprocno)
+ buf_state &= ~BM_PIN_COUNT_WAITER;
+ UnlockBufHdr(bufHdr, buf_state);
+
+ PinCountWaitBuf = NULL;
+ /* Loop back and try again */
+ }
+}
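+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): the cleanup-lock
+ * protocol described above, as a would-be deleter might use it. The
+ * surrounding function is hypothetical; only the bufmgr calls are real.
+ */
+#ifdef NOT_USED
+static void
+example_cleanup_block(Relation rel, BlockNumber blkno)
+{
+ Buffer buf = ReadBuffer(rel, blkno); /* take our own pin first */
+
+ /* blocks until we hold the exclusive lock and the pin count is 1 */
+ LockBufferForCleanup(buf);
+
+ /* ... safe to defragment the page or remove line pointers here ... */
+
+ UnlockReleaseBuffer(buf);
+}
+#endif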
+
+/*
+ * Check called from RecoveryConflictInterrupt handler when Startup
+ * process requests cancellation of all pin holders that are blocking it.
+ */
+bool
+HoldingBufferPinThatDelaysRecovery(void)
+{
+ int bufid = GetStartupBufferPinWaitBufId();
+
+ /*
+ * If we get woken slowly then it's possible that the Startup process was
+ * already woken by other backends before we got here. Also possible that
+ * we get here by multiple interrupts or interrupts at inappropriate
+ * times, so make sure we do nothing if the bufid is not set.
+ */
+ if (bufid < 0)
+ return false;
+
+ if (GetPrivateRefCount(bufid + 1) > 0)
+ return true;
+
+ return false;
+}
+
+/*
+ * ConditionalLockBufferForCleanup - as above, but don't wait to get the lock
+ *
+ * We won't loop, but just check once to see if the pin count is OK. If
+ * not, return false with no lock held.
+ */
+bool
+ConditionalLockBufferForCleanup(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state,
+ refcount;
+
+ Assert(BufferIsValid(buffer));
+
+ if (BufferIsLocal(buffer))
+ {
+ refcount = LocalRefCount[-buffer - 1];
+ /* There should be exactly one pin */
+ Assert(refcount > 0);
+ if (refcount != 1)
+ return false;
+ /* Nobody else to wait for */
+ return true;
+ }
+
+ /* There should be exactly one local pin */
+ refcount = GetPrivateRefCount(buffer);
+ Assert(refcount);
+ if (refcount != 1)
+ return false;
+
+ /* Try to acquire lock */
+ if (!ConditionalLockBuffer(buffer))
+ return false;
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+ buf_state = LockBufHdr(bufHdr);
+ refcount = BUF_STATE_GET_REFCOUNT(buf_state);
+
+ Assert(refcount > 0);
+ if (refcount == 1)
+ {
+ /* Successfully acquired exclusive lock with pincount 1 */
+ UnlockBufHdr(bufHdr, buf_state);
+ return true;
+ }
+
+ /* Failed, so release the lock */
+ UnlockBufHdr(bufHdr, buf_state);
+ LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+ return false;
+}
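+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): the
+ * "try cleanup, else skip" pattern that ConditionalLockBufferForCleanup()
+ * enables, roughly how opportunistic vacuuming skips contended pages. The
+ * caller shown here is hypothetical and assumes it already holds a pin.
+ */
+#ifdef NOT_USED
+static bool
+example_try_cleanup(Buffer buf)
+{
+ if (!ConditionalLockBufferForCleanup(buf))
+ {
+ /* somebody else holds a pin or the lock; just skip this page */
+ return false;
+ }
+
+ /* ... we hold a cleanup lock; prune or defragment the page ... */
+
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK); /* we still keep our pin */
+ return true;
+}
+#endif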
+
+/*
+ * IsBufferCleanupOK - as above, but we already have the lock
+ *
+ * Check whether it's OK to perform cleanup on a buffer we've already
+ * locked. If we observe that the pin count is 1, our exclusive lock
+ * happens to be a cleanup lock, and we can proceed with anything that
+ * would have been allowable had we sought a cleanup lock originally.
+ */
+bool
+IsBufferCleanupOK(Buffer buffer)
+{
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ Assert(BufferIsValid(buffer));
+
+ if (BufferIsLocal(buffer))
+ {
+ /* There should be exactly one pin */
+ if (LocalRefCount[-buffer - 1] != 1)
+ return false;
+ /* Nobody else to wait for */
+ return true;
+ }
+
+ /* There should be exactly one local pin */
+ if (GetPrivateRefCount(buffer) != 1)
+ return false;
+
+ bufHdr = GetBufferDescriptor(buffer - 1);
+
+ /* caller must hold exclusive lock on buffer */
+ Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(bufHdr),
+ LW_EXCLUSIVE));
+
+ buf_state = LockBufHdr(bufHdr);
+
+ Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+ if (BUF_STATE_GET_REFCOUNT(buf_state) == 1)
+ {
+ /* pincount is OK. */
+ UnlockBufHdr(bufHdr, buf_state);
+ return true;
+ }
+
+ UnlockBufHdr(bufHdr, buf_state);
+ return false;
+}
+
+
+/*
+ * Functions for buffer I/O handling
+ *
+ * Note: We assume that nested buffer I/O never occurs.
+ * i.e., at most one BM_IO_IN_PROGRESS bit is set per process.
+ *
+ * Also note that these are used only for shared buffers, not local ones.
+ */
+
+/*
+ * WaitIO -- Block until the IO_IN_PROGRESS flag on 'buf' is cleared.
+ */
+static void
+WaitIO(BufferDesc *buf)
+{
+ ConditionVariable *cv = BufferDescriptorGetIOCV(buf);
+
+ ConditionVariablePrepareToSleep(cv);
+ for (;;)
+ {
+ uint32 buf_state;
+
+ /*
+ * It may not be necessary to acquire the spinlock to check the flag
+ * here, but since this test is essential for correctness, we'd better
+ * play it safe.
+ */
+ buf_state = LockBufHdr(buf);
+ UnlockBufHdr(buf, buf_state);
+
+ if (!(buf_state & BM_IO_IN_PROGRESS))
+ break;
+ ConditionVariableSleep(cv, WAIT_EVENT_BUFFER_IO);
+ }
+ ConditionVariableCancelSleep();
+}
+
+/*
+ * StartBufferIO: begin I/O on this buffer
+ * (Assumptions)
+ * My process is executing no IO
+ * The buffer is Pinned
+ *
+ * In some scenarios there are race conditions in which multiple backends
+ * could attempt the same I/O operation concurrently. If someone else
+ * has already started I/O on this buffer then we will block on the
+ * I/O condition variable until they are done.
+ *
+ * Input operations are only attempted on buffers that are not BM_VALID,
+ * and output operations only on buffers that are BM_VALID and BM_DIRTY,
+ * so we can always tell if the work is already done.
+ *
+ * Returns true if we successfully marked the buffer as I/O busy,
+ * false if someone else already did the work.
+ */
+static bool
+StartBufferIO(BufferDesc *buf, bool forInput)
+{
+ uint32 buf_state;
+
+ ResourceOwnerEnlargeBufferIOs(CurrentResourceOwner);
+
+ for (;;)
+ {
+ buf_state = LockBufHdr(buf);
+
+ if (!(buf_state & BM_IO_IN_PROGRESS))
+ break;
+ UnlockBufHdr(buf, buf_state);
+ WaitIO(buf);
+ }
+
+ /* Once we get here, there is definitely no I/O active on this buffer */
+
+ if (forInput ? (buf_state & BM_VALID) : !(buf_state & BM_DIRTY))
+ {
+ /* someone else already did the I/O */
+ UnlockBufHdr(buf, buf_state);
+ return false;
+ }
+
+ buf_state |= BM_IO_IN_PROGRESS;
+ UnlockBufHdr(buf, buf_state);
+
+ ResourceOwnerRememberBufferIO(CurrentResourceOwner,
+ BufferDescriptorGetBuffer(buf));
+
+ return true;
+}
+
+/*
+ * TerminateBufferIO: release a buffer we were doing I/O on
+ * (Assumptions)
+ * My process is executing IO for the buffer
+ * BM_IO_IN_PROGRESS bit is set for the buffer
+ * The buffer is Pinned
+ *
+ * If clear_dirty is true and BM_JUST_DIRTIED is not set, we clear the
+ * buffer's BM_DIRTY flag. This is appropriate when terminating a
+ * successful write. The check on BM_JUST_DIRTIED is necessary to avoid
+ * marking the buffer clean if it was re-dirtied while we were writing.
+ *
+ * set_flag_bits gets ORed into the buffer's flags. It must include
+ * BM_IO_ERROR in a failure case. For successful completion it could
+ * be 0, or BM_VALID if we just finished reading in the page.
+ */
+static void
+TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
+{
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(buf);
+
+ Assert(buf_state & BM_IO_IN_PROGRESS);
+
+ buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
+ if (clear_dirty && !(buf_state & BM_JUST_DIRTIED))
+ buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+
+ buf_state |= set_flag_bits;
+ UnlockBufHdr(buf, buf_state);
+
+ ResourceOwnerForgetBufferIO(CurrentResourceOwner,
+ BufferDescriptorGetBuffer(buf));
+
+ ConditionVariableBroadcast(BufferDescriptorGetIOCV(buf));
+}
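+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): the
+ * StartBufferIO / TerminateBufferIO handshake for reading a block in,
+ * reduced to its skeleton. Pinning, buffer lookup, checksum verification
+ * and error handling are omitted; the wrapper function is hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_read_into_buffer(SMgrRelation smgr, ForkNumber forkNum,
+ BlockNumber blockNum, BufferDesc *buf)
+{
+ if (!StartBufferIO(buf, true))
+ {
+ /* somebody else completed the read while we were waiting */
+ return;
+ }
+
+ smgrread(smgr, forkNum, blockNum, BufHdrGetBlock(buf));
+
+ /* success: clear BM_IO_IN_PROGRESS and mark the page usable */
+ TerminateBufferIO(buf, false, BM_VALID);
+}
+#endif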
+
+/*
+ * AbortBufferIO: Clean up active buffer I/O after an error.
+ *
+ * All LWLocks we might have held have been released,
+ * but we haven't yet released buffer pins, so the buffer is still pinned.
+ *
+ * If I/O was in progress, we always set BM_IO_ERROR, even though it's
+ * possible the error condition wasn't related to the I/O.
+ */
+void
+AbortBufferIO(Buffer buffer)
+{
+ BufferDesc *buf_hdr = GetBufferDescriptor(buffer - 1);
+ uint32 buf_state;
+
+ buf_state = LockBufHdr(buf_hdr);
+ Assert(buf_state & (BM_IO_IN_PROGRESS | BM_TAG_VALID));
+
+ if (!(buf_state & BM_VALID))
+ {
+ Assert(!(buf_state & BM_DIRTY));
+ UnlockBufHdr(buf_hdr, buf_state);
+ }
+ else
+ {
+ Assert(buf_state & BM_DIRTY);
+ UnlockBufHdr(buf_hdr, buf_state);
+
+ /* Issue notice if this is not the first failure... */
+ if (buf_state & BM_IO_ERROR)
+ {
+ /* Buffer is pinned, so we can read tag without spinlock */
+ char *path;
+
+ path = relpathperm(BufTagGetRelFileLocator(&buf_hdr->tag),
+ BufTagGetForkNum(&buf_hdr->tag));
+ ereport(WARNING,
+ (errcode(ERRCODE_IO_ERROR),
+ errmsg("could not write block %u of %s",
+ buf_hdr->tag.blockNum, path),
+ errdetail("Multiple failures --- write error might be permanent.")));
+ pfree(path);
+ }
+ }
+
+ TerminateBufferIO(buf_hdr, false, BM_IO_ERROR);
+}
+
+/*
+ * Error context callback for errors occurring during shared buffer writes.
+ */
+static void
+shared_buffer_write_error_callback(void *arg)
+{
+ BufferDesc *bufHdr = (BufferDesc *) arg;
+
+ /* Buffer is pinned, so we can read the tag without locking the spinlock */
+ if (bufHdr != NULL)
+ {
+ char *path = relpathperm(BufTagGetRelFileLocator(&bufHdr->tag),
+ BufTagGetForkNum(&bufHdr->tag));
+
+ errcontext("writing block %u of relation %s",
+ bufHdr->tag.blockNum, path);
+ pfree(path);
+ }
+}
+
+/*
+ * Error context callback for errors occurring during local buffer writes.
+ */
+static void
+local_buffer_write_error_callback(void *arg)
+{
+ BufferDesc *bufHdr = (BufferDesc *) arg;
+
+ if (bufHdr != NULL)
+ {
+ char *path = relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
+ MyBackendId,
+ BufTagGetForkNum(&bufHdr->tag));
+
+ errcontext("writing block %u of relation %s",
+ bufHdr->tag.blockNum, path);
+ pfree(path);
+ }
+}
+
+/*
+ * RelFileLocator qsort/bsearch comparator; see RelFileLocatorEquals.
+ */
+static int
+rlocator_comparator(const void *p1, const void *p2)
+{
+ RelFileLocator n1 = *(const RelFileLocator *) p1;
+ RelFileLocator n2 = *(const RelFileLocator *) p2;
+
+ if (n1.relNumber < n2.relNumber)
+ return -1;
+ else if (n1.relNumber > n2.relNumber)
+ return 1;
+
+ if (n1.dbOid < n2.dbOid)
+ return -1;
+ else if (n1.dbOid > n2.dbOid)
+ return 1;
+
+ if (n1.spcOid < n2.spcOid)
+ return -1;
+ else if (n1.spcOid > n2.spcOid)
+ return 1;
+ else
+ return 0;
+}
+
+/*
+ * Lock buffer header - set BM_LOCKED in buffer state.
+ */
+uint32
+LockBufHdr(BufferDesc *desc)
+{
+ SpinDelayStatus delayStatus;
+ uint32 old_buf_state;
+
+ Assert(!BufferIsLocal(BufferDescriptorGetBuffer(desc)));
+
+ init_local_spin_delay(&delayStatus);
+
+ while (true)
+ {
+ /* set BM_LOCKED flag */
+ old_buf_state = pg_atomic_fetch_or_u32(&desc->state, BM_LOCKED);
+ /* if it wasn't set before we're OK */
+ if (!(old_buf_state & BM_LOCKED))
+ break;
+ perform_spin_delay(&delayStatus);
+ }
+ finish_spin_delay(&delayStatus);
+ return old_buf_state | BM_LOCKED;
+}
+
+/*
+ * Wait until the BM_LOCKED flag isn't set anymore and return the buffer's
+ * state at that point.
+ *
+ * Obviously the buffer header could be locked again by the time the value is
+ * returned, so this is primarily useful in CAS style loops.
+ */
+static uint32
+WaitBufHdrUnlocked(BufferDesc *buf)
+{
+ SpinDelayStatus delayStatus;
+ uint32 buf_state;
+
+ init_local_spin_delay(&delayStatus);
+
+ buf_state = pg_atomic_read_u32(&buf->state);
+
+ while (buf_state & BM_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ buf_state = pg_atomic_read_u32(&buf->state);
+ }
+
+ finish_spin_delay(&delayStatus);
+
+ return buf_state;
+}
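+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): the CAS-style
+ * loop that WaitBufHdrUnlocked() is meant for, modelled on how PinBuffer()
+ * updates the state word without taking the header spinlock. The function
+ * below is a simplified, hypothetical reduction.
+ */
+#ifdef NOT_USED
+static void
+example_bump_usagecount(BufferDesc *buf)
+{
+ uint32 old_buf_state = pg_atomic_read_u32(&buf->state);
+ uint32 buf_state;
+
+ for (;;)
+ {
+ /* if the header spinlock is held, wait for it to be released */
+ if (old_buf_state & BM_LOCKED)
+ old_buf_state = WaitBufHdrUnlocked(buf);
+
+ buf_state = old_buf_state;
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ buf_state += BUF_USAGECOUNT_ONE;
+
+ /* on failure the CAS refreshes old_buf_state, and we retry */
+ if (pg_atomic_compare_exchange_u32(&buf->state, &old_buf_state,
+ buf_state))
+ break;
+ }
+}
+#endif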
+
+/*
+ * BufferTag comparator.
+ */
+static inline int
+buffertag_comparator(const BufferTag *ba, const BufferTag *bb)
+{
+ int ret;
+ RelFileLocator rlocatora;
+ RelFileLocator rlocatorb;
+
+ rlocatora = BufTagGetRelFileLocator(ba);
+ rlocatorb = BufTagGetRelFileLocator(bb);
+
+ ret = rlocator_comparator(&rlocatora, &rlocatorb);
+
+ if (ret != 0)
+ return ret;
+
+ if (BufTagGetForkNum(ba) < BufTagGetForkNum(bb))
+ return -1;
+ if (BufTagGetForkNum(ba) > BufTagGetForkNum(bb))
+ return 1;
+
+ if (ba->blockNum < bb->blockNum)
+ return -1;
+ if (ba->blockNum > bb->blockNum)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Comparator determining the writeout order in a checkpoint.
+ *
+ * It is important that tablespaces are compared first; the logic balancing
+ * writes between tablespaces relies on it.
+ */
+static inline int
+ckpt_buforder_comparator(const CkptSortItem *a, const CkptSortItem *b)
+{
+ /* compare tablespace */
+ if (a->tsId < b->tsId)
+ return -1;
+ else if (a->tsId > b->tsId)
+ return 1;
+ /* compare relation */
+ if (a->relNumber < b->relNumber)
+ return -1;
+ else if (a->relNumber > b->relNumber)
+ return 1;
+ /* compare fork */
+ else if (a->forkNum < b->forkNum)
+ return -1;
+ else if (a->forkNum > b->forkNum)
+ return 1;
+ /* compare block number */
+ else if (a->blockNum < b->blockNum)
+ return -1;
+ else if (a->blockNum > b->blockNum)
+ return 1;
+ /* equal page IDs are unlikely, but not impossible */
+ return 0;
+}
+
+/*
+ * Comparator for a Min-Heap over the per-tablespace checkpoint completion
+ * progress.
+ */
+static int
+ts_ckpt_progress_comparator(Datum a, Datum b, void *arg)
+{
+ CkptTsStatus *sa = (CkptTsStatus *) a;
+ CkptTsStatus *sb = (CkptTsStatus *) b;
+
+ /* we want a min-heap, so return 1 when a < b */
+ if (sa->progress < sb->progress)
+ return 1;
+ else if (sa->progress == sb->progress)
+ return 0;
+ else
+ return -1;
+}
+
+/*
+ * Initialize a writeback context, discarding potential previous state.
+ *
+ * *max_pending is a pointer instead of an immediate value, so the coalesce
+ * limits can easily be changed by the GUC mechanism, and so calling code does
+ * not have to check the current configuration. A value of 0 means that no
+ * writeback control will be performed.
+ */
+void
+WritebackContextInit(WritebackContext *context, int *max_pending)
+{
+ Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+ context->max_pending = max_pending;
+ context->nr_pending = 0;
+}
+
+/*
+ * Add buffer to list of pending writeback requests.
+ */
+void
+ScheduleBufferTagForWriteback(WritebackContext *wb_context, IOContext io_context,
+ BufferTag *tag)
+{
+ PendingWriteback *pending;
+
+ if (io_direct_flags & IO_DIRECT_DATA)
+ return;
+
+ /*
+ * Add buffer to the pending writeback array, unless writeback control is
+ * disabled.
+ */
+ if (*wb_context->max_pending > 0)
+ {
+ Assert(*wb_context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES);
+
+ pending = &wb_context->pending_writebacks[wb_context->nr_pending++];
+
+ pending->tag = *tag;
+ }
+
+ /*
+ * Perform pending flushes if the writeback limit is exceeded. This
+ * includes the case where previously an item has been added, but control
+ * is now disabled.
+ */
+ if (wb_context->nr_pending >= *wb_context->max_pending)
+ IssuePendingWritebacks(wb_context, io_context);
+}
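+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): how a flushing
+ * loop can drive the writeback machinery above, similar in spirit to what
+ * BufferSync() does with checkpoint_flush_after. The wrapper function and
+ * the placeholder tag are hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_flush_with_writeback_control(void)
+{
+ WritebackContext wb_context;
+ BufferTag tag = {0}; /* stand-in for the tag of a page we just wrote */
+
+ /* a *max_pending of 0 disables writeback control entirely */
+ WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
+ /* after each page is written out, hint the kernel about it */
+ ScheduleBufferTagForWriteback(&wb_context, IOCONTEXT_NORMAL, &tag);
+
+ /* at the end of the pass, push out whatever is still pending */
+ IssuePendingWritebacks(&wb_context, IOCONTEXT_NORMAL);
+}
+#endif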
+
+#define ST_SORT sort_pending_writebacks
+#define ST_ELEMENT_TYPE PendingWriteback
+#define ST_COMPARE(a, b) buffertag_comparator(&a->tag, &b->tag)
+#define ST_SCOPE static
+#define ST_DEFINE
+#include <lib/sort_template.h>
+
+/*
+ * Issue all pending writeback requests, previously scheduled with
+ * ScheduleBufferTagForWriteback, to the OS.
+ *
+ * Because this is only used to improve the OS's I/O scheduling, we try never
+ * to error out - it's just a hint.
+ */
+void
+IssuePendingWritebacks(WritebackContext *wb_context, IOContext io_context)
+{
+ instr_time io_start;
+ int i;
+
+ if (wb_context->nr_pending == 0)
+ return;
+
+ /*
+ * Executing the writes in order can make them a lot faster, and allows us to
+ * merge writeback requests for consecutive blocks into larger writebacks.
+ */
+ sort_pending_writebacks(wb_context->pending_writebacks,
+ wb_context->nr_pending);
+
+ io_start = pgstat_prepare_io_time();
+
+ /*
+ * Coalesce neighbouring writes, but nothing else. For that we iterate
+ * through the now-sorted array of pending flushes, and look forward to
+ * find all neighbouring (or identical) writes.
+ */
+ for (i = 0; i < wb_context->nr_pending; i++)
+ {
+ PendingWriteback *cur;
+ PendingWriteback *next;
+ SMgrRelation reln;
+ int ahead;
+ BufferTag tag;
+ RelFileLocator currlocator;
+ Size nblocks = 1;
+
+ cur = &wb_context->pending_writebacks[i];
+ tag = cur->tag;
+ currlocator = BufTagGetRelFileLocator(&tag);
+
+ /*
+ * Peek ahead, into following writeback requests, to see if they can
+ * be combined with the current one.
+ */
+ for (ahead = 0; i + ahead + 1 < wb_context->nr_pending; ahead++)
+ {
+
+ next = &wb_context->pending_writebacks[i + ahead + 1];
+
+ /* different file, stop */
+ if (!RelFileLocatorEquals(currlocator,
+ BufTagGetRelFileLocator(&next->tag)) ||
+ BufTagGetForkNum(&cur->tag) != BufTagGetForkNum(&next->tag))
+ break;
+
+ /* ok, block queued twice, skip */
+ if (cur->tag.blockNum == next->tag.blockNum)
+ continue;
+
+ /* only merge consecutive writes */
+ if (cur->tag.blockNum + 1 != next->tag.blockNum)
+ break;
+
+ nblocks++;
+ cur = next;
+ }
+
+ i += ahead;
+
+ /* and finally tell the kernel to write the data to storage */
+ reln = smgropen(currlocator, InvalidBackendId);
+ smgrwriteback(reln, BufTagGetForkNum(&tag), tag.blockNum, nblocks);
+ }
+
+ /*
+ * Assume that writeback requests are only issued for buffers containing
+ * blocks of permanent relations.
+ */
+ pgstat_count_io_op_time(IOOBJECT_RELATION, io_context,
+ IOOP_WRITEBACK, io_start, wb_context->nr_pending);
+
+ wb_context->nr_pending = 0;
+}
+
+
+/*
+ * Implement slower/larger portions of TestForOldSnapshot
+ *
+ * Smaller/faster portions are put inline, but the entire set of logic is too
+ * big for that.
+ */
+void
+TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
+{
+ if (RelationAllowsEarlyPruning(relation)
+ && (snapshot)->whenTaken < GetOldSnapshotThresholdTimestamp())
+ ereport(ERROR,
+ (errcode(ERRCODE_SNAPSHOT_TOO_OLD),
+ errmsg("snapshot too old")));
+}
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
new file mode 100644
index 0000000..1c804fd
--- /dev/null
+++ b/src/backend/storage/buffer/freelist.c
@@ -0,0 +1,774 @@
+/*-------------------------------------------------------------------------
+ *
+ * freelist.c
+ * routines for managing the buffer pool's replacement strategy.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/freelist.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "storage/proc.h"
+
+#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var))))
+
+
+/*
+ * The shared freelist control information.
+ */
+typedef struct
+{
+ /* Spinlock: protects the values below */
+ slock_t buffer_strategy_lock;
+
+ /*
+ * Clock sweep hand: index of next buffer to consider grabbing. Note that
+ * this isn't a concrete buffer - we only ever increase the value. So, to
+ * get an actual buffer, it needs to be used modulo NBuffers.
+ */
+ pg_atomic_uint32 nextVictimBuffer;
+
+ int firstFreeBuffer; /* Head of list of unused buffers */
+ int lastFreeBuffer; /* Tail of list of unused buffers */
+
+ /*
+ * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
+ * when the list is empty)
+ */
+
+ /*
+ * Statistics. These counters should be wide enough that they can't
+ * overflow during a single bgwriter cycle.
+ */
+ uint32 completePasses; /* Complete cycles of the clock sweep */
+ pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */
+
+ /*
+ * Bgworker process to be notified upon activity or -1 if none. See
+ * StrategyNotifyBgWriter.
+ */
+ int bgwprocno;
+} BufferStrategyControl;
+
+/* Pointers to shared state */
+static BufferStrategyControl *StrategyControl = NULL;
+
+/*
+ * Private (non-shared) state for managing a ring of shared buffers to re-use.
+ * This is currently the only kind of BufferAccessStrategy object, but someday
+ * we might have more kinds.
+ */
+typedef struct BufferAccessStrategyData
+{
+ /* Overall strategy type */
+ BufferAccessStrategyType btype;
+ /* Number of elements in buffers[] array */
+ int nbuffers;
+
+ /*
+ * Index of the "current" slot in the ring, ie, the one most recently
+ * returned by GetBufferFromRing.
+ */
+ int current;
+
+ /*
+ * Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
+ * have not yet selected a buffer for this ring slot. For allocation
+ * simplicity this is palloc'd together with the fixed fields of the
+ * struct.
+ */
+ Buffer buffers[FLEXIBLE_ARRAY_MEMBER];
+} BufferAccessStrategyData;
+
+
+/* Prototypes for internal functions */
+static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
+ uint32 *buf_state);
+static void AddBufferToRing(BufferAccessStrategy strategy,
+ BufferDesc *buf);
+
+/*
+ * ClockSweepTick - Helper routine for StrategyGetBuffer()
+ *
+ * Move the clock hand one buffer ahead of its current position and return the
+ * id of the buffer now under the hand.
+ */
+static inline uint32
+ClockSweepTick(void)
+{
+ uint32 victim;
+
+ /*
+ * Atomically move the hand ahead one buffer - if there are several processes
+ * doing this, it can lead to buffers being returned slightly out of
+ * apparent order.
+ */
+ victim =
+ pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);
+
+ if (victim >= NBuffers)
+ {
+ uint32 originalVictim = victim;
+
+ /* always wrap what we look up in BufferDescriptors */
+ victim = victim % NBuffers;
+
+ /*
+ * If we're the one that just caused a wraparound, force
+ * completePasses to be incremented while holding the spinlock. We
+ * need the spinlock so StrategySyncStart() can return a consistent
+ * value consisting of nextVictimBuffer and completePasses.
+ */
+ if (victim == 0)
+ {
+ uint32 expected;
+ uint32 wrapped;
+ bool success = false;
+
+ expected = originalVictim + 1;
+
+ while (!success)
+ {
+ /*
+ * Acquire the spinlock while increasing completePasses. That
+ * allows other readers to read nextVictimBuffer and
+ * completePasses in a consistent manner which is required for
+ * StrategySyncStart(). In theory delaying the increment
+ * could lead to an overflow of nextVictimBuffer, but that's
+ * highly unlikely and wouldn't be particularly harmful.
+ */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ wrapped = expected % NBuffers;
+
+ success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
+ &expected, wrapped);
+ if (success)
+ StrategyControl->completePasses++;
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ }
+ }
+ }
+ return victim;
+}
+
+/*
+ * have_free_buffer -- a lockless check to see if there is a free buffer in
+ * the buffer pool.
+ *
+ * A true result can become stale as soon as free buffers are taken by other
+ * operations, so a caller that strictly needs to use a free buffer should not
+ * rely on this check.
+ */
+bool
+have_free_buffer(void)
+{
+ if (StrategyControl->firstFreeBuffer >= 0)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * StrategyGetBuffer
+ *
+ * Called by the bufmgr to get the next candidate buffer to use in
+ * BufferAlloc(). The only hard requirement BufferAlloc() has is that
+ * the selected buffer must not currently be pinned by anyone.
+ *
+ * strategy is a BufferAccessStrategy object, or NULL for default strategy.
+ *
+ * To ensure that no one else can pin the buffer before we do, we must
+ * return the buffer with the buffer header spinlock still held.
+ */
+BufferDesc *
+StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state, bool *from_ring)
+{
+ BufferDesc *buf;
+ int bgwprocno;
+ int trycounter;
+ uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+
+ *from_ring = false;
+
+ /*
+ * If given a strategy object, see whether it can select a buffer. We
+ * assume strategy objects don't need buffer_strategy_lock.
+ */
+ if (strategy != NULL)
+ {
+ buf = GetBufferFromRing(strategy, buf_state);
+ if (buf != NULL)
+ {
+ *from_ring = true;
+ return buf;
+ }
+ }
+
+ /*
+ * If asked, we need to wake the bgwriter. Since we don't want to rely on
+ * a spinlock for this we force a read from shared memory once, and then
+ * set the latch based on that value. We need to go to this length because
+ * otherwise bgwprocno might be reset while/after we check, since the
+ * compiler might just reread it from memory.
+ *
+ * This can possibly set the latch of the wrong process if the bgwriter
+ * dies in the wrong moment. But since PGPROC->procLatch is never
+ * deallocated the worst consequence of that is that we set the latch of
+ * some arbitrary process.
+ */
+ bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
+ if (bgwprocno != -1)
+ {
+ /* reset bgwprocno first, before setting the latch */
+ StrategyControl->bgwprocno = -1;
+
+ /*
+ * Not acquiring ProcArrayLock here which is slightly icky. It's
+ * actually fine because procLatch isn't ever freed, so at worst we
+ * can set the wrong process' (or no process') latch.
+ */
+ SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
+ }
+
+ /*
+ * We count buffer allocation requests so that the bgwriter can estimate
+ * the rate of buffer consumption. Note that buffers recycled by a
+ * strategy object are intentionally not counted here.
+ */
+ pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);
+
+ /*
+ * First check, without acquiring the lock, whether there are buffers in the
+ * freelist. Since we otherwise don't require the spinlock in every
+ * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
+ * uselessly in most cases. That obviously leaves a race where a buffer is
+ * put on the freelist but we don't see the store yet - but that's pretty
+ * harmless, it'll just get used during the next buffer acquisition.
+ *
+ * If there are buffers on the freelist, acquire the spinlock to pop one
+ * buffer off the freelist. Then check whether that buffer is usable and
+ * repeat if not.
+ *
+ * Note that the freeNext fields are considered to be protected by the
+ * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK to
+ * manipulate them without holding a buffer's spinlock.
+ */
+ if (StrategyControl->firstFreeBuffer >= 0)
+ {
+ while (true)
+ {
+ /* Acquire the spinlock to remove element from the freelist */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ if (StrategyControl->firstFreeBuffer < 0)
+ {
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ break;
+ }
+
+ buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
+ Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);
+
+ /* Unconditionally remove buffer from freelist */
+ StrategyControl->firstFreeBuffer = buf->freeNext;
+ buf->freeNext = FREENEXT_NOT_IN_LIST;
+
+ /*
+ * Release the lock so someone else can access the freelist while
+ * we check out this buffer.
+ */
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * If the buffer is pinned or has a nonzero usage_count, we cannot
+ * use it; discard it and retry. (This can only happen if VACUUM
+ * put a valid buffer in the freelist and then someone else used
+ * it before we got to it. It's probably impossible altogether as
+ * of 8.3, but we'd better check anyway.)
+ */
+ local_buf_state = LockBufHdr(buf);
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+ && BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
+ {
+ if (strategy != NULL)
+ AddBufferToRing(strategy, buf);
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ UnlockBufHdr(buf, local_buf_state);
+ }
+ }
+
+ /* Nothing on the freelist, so run the "clock sweep" algorithm */
+ trycounter = NBuffers;
+ for (;;)
+ {
+ buf = GetBufferDescriptor(ClockSweepTick());
+
+ /*
+ * If the buffer is pinned or has a nonzero usage_count, we cannot use
+ * it; decrement the usage_count (unless pinned) and keep scanning.
+ */
+ local_buf_state = LockBufHdr(buf);
+
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+ {
+ if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
+ {
+ local_buf_state -= BUF_USAGECOUNT_ONE;
+
+ trycounter = NBuffers;
+ }
+ else
+ {
+ /* Found a usable buffer */
+ if (strategy != NULL)
+ AddBufferToRing(strategy, buf);
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ }
+ else if (--trycounter == 0)
+ {
+ /*
+ * We've scanned all the buffers without making any state changes,
+ * so all the buffers are pinned (or were when we looked at them).
+ * We could hope that someone will free one eventually, but it's
+ * probably better to fail than to risk getting stuck in an
+ * infinite loop.
+ */
+ UnlockBufHdr(buf, local_buf_state);
+ elog(ERROR, "no unpinned buffers available");
+ }
+ UnlockBufHdr(buf, local_buf_state);
+ }
+}
+
+/*
+ * StrategyFreeBuffer: put a buffer on the freelist
+ */
+void
+StrategyFreeBuffer(BufferDesc *buf)
+{
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * It is possible that we are told to put something in the freelist that
+ * is already in it; don't screw up the list if so.
+ */
+ if (buf->freeNext == FREENEXT_NOT_IN_LIST)
+ {
+ buf->freeNext = StrategyControl->firstFreeBuffer;
+ if (buf->freeNext < 0)
+ StrategyControl->lastFreeBuffer = buf->buf_id;
+ StrategyControl->firstFreeBuffer = buf->buf_id;
+ }
+
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
+/*
+ * StrategySyncStart -- tell BufferSync where to start syncing
+ *
+ * The result is the buffer index of the best buffer to sync first.
+ * BufferSync() will proceed circularly around the buffer array from there.
+ *
+ * In addition, we return the completed-pass count (which is effectively
+ * the higher-order bits of nextVictimBuffer) and the count of recent buffer
+ * allocs if non-NULL pointers are passed. The alloc count is reset after
+ * being read.
+ */
+int
+StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
+{
+ uint32 nextVictimBuffer;
+ int result;
+
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+ nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
+ result = nextVictimBuffer % NBuffers;
+
+ if (complete_passes)
+ {
+ *complete_passes = StrategyControl->completePasses;
+
+ /*
+ * Additionally add the number of wraparounds that happened before
+ * completePasses could be incremented. C.f. ClockSweepTick().
+ */
+ *complete_passes += nextVictimBuffer / NBuffers;
+ }
+
+ if (num_buf_alloc)
+ {
+ *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
+ }
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+ return result;
+}
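+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): turning the
+ * values returned above into an absolute clock-hand position, roughly as
+ * the bgwriter does in BgBufferSync() (which additionally has to cope with
+ * wraparound). The function and variable names are hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_observe_clock_hand(void)
+{
+ uint32 passes;
+ uint32 recent_alloc;
+ int hand;
+ uint64 absolute_position;
+
+ hand = StrategySyncStart(&passes, &recent_alloc);
+
+ /* combine pass count and in-pass index into one monotonic position */
+ absolute_position = (uint64) passes * NBuffers + hand;
+
+ /* ... compare with the position remembered from the previous call ... */
+ (void) absolute_position;
+ (void) recent_alloc;
+}
+#endif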
+
+/*
+ * StrategyNotifyBgWriter -- set or clear allocation notification latch
+ *
+ * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
+ * set that latch. Pass -1 to clear the pending notification before it
+ * happens. This feature is used by the bgwriter process to wake itself up
+ * from hibernation, and is not meant for anybody else to use.
+ */
+void
+StrategyNotifyBgWriter(int bgwprocno)
+{
+ /*
+ * We acquire buffer_strategy_lock just to ensure that the store appears
+ * atomic to StrategyGetBuffer. The bgwriter should call this rather
+ * infrequently, so there's no performance penalty from being safe.
+ */
+ SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
+ StrategyControl->bgwprocno = bgwprocno;
+ SpinLockRelease(&StrategyControl->buffer_strategy_lock);
+}
+
+
+/*
+ * StrategyShmemSize
+ *
+ * estimate the size of shared memory used by the freelist-related structures.
+ *
+ * Note: for somewhat historical reasons, the buffer lookup hashtable size
+ * is also determined here.
+ */
+Size
+StrategyShmemSize(void)
+{
+ Size size = 0;
+
+ /* size of lookup hash table ... see comment in StrategyInitialize */
+ size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));
+
+ /* size of the shared replacement strategy control block */
+ size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));
+
+ return size;
+}
+
+/*
+ * StrategyInitialize -- initialize the buffer cache replacement
+ * strategy.
+ *
+ * Assumes: All of the buffers are already built into a linked list.
+ * Only called by postmaster and only during initialization.
+ */
+void
+StrategyInitialize(bool init)
+{
+ bool found;
+
+ /*
+ * Initialize the shared buffer lookup hashtable.
+ *
+ * Since we can't tolerate running out of lookup table entries, we must be
+ * sure to specify an adequate table size here. The maximum steady-state
+ * usage is of course NBuffers entries, but BufferAlloc() tries to insert
+ * a new entry before deleting the old. In principle this could be
+ * happening in each partition concurrently, so we could need as many as
+ * NBuffers + NUM_BUFFER_PARTITIONS entries.
+ */
+ InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);
+
+ /*
+ * Get or create the shared strategy control block
+ */
+ StrategyControl = (BufferStrategyControl *)
+ ShmemInitStruct("Buffer Strategy Status",
+ sizeof(BufferStrategyControl),
+ &found);
+
+ if (!found)
+ {
+ /*
+ * Only done once, usually in postmaster
+ */
+ Assert(init);
+
+ SpinLockInit(&StrategyControl->buffer_strategy_lock);
+
+ /*
+ * Grab the whole linked list of free buffers for our strategy. We
+ * assume it was previously set up by InitBufferPool().
+ */
+ StrategyControl->firstFreeBuffer = 0;
+ StrategyControl->lastFreeBuffer = NBuffers - 1;
+
+ /* Initialize the clock sweep pointer */
+ pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);
+
+ /* Clear statistics */
+ StrategyControl->completePasses = 0;
+ pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
+
+ /* No pending notification */
+ StrategyControl->bgwprocno = -1;
+ }
+ else
+ Assert(!init);
+}
+
+
+/* ----------------------------------------------------------------
+ * Backend-private buffer ring management
+ * ----------------------------------------------------------------
+ */
+
+
+/*
+ * GetAccessStrategy -- create a BufferAccessStrategy object
+ *
+ * The object is allocated in the current memory context.
+ */
+BufferAccessStrategy
+GetAccessStrategy(BufferAccessStrategyType btype)
+{
+ int ring_size_kb;
+
+ /*
+ * Select ring size to use. See buffer/README for rationales.
+ *
+ * Note: if you change the ring size for BAS_BULKREAD, see also
+ * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
+ */
+ switch (btype)
+ {
+ case BAS_NORMAL:
+ /* if someone asks for NORMAL, just give 'em a "default" object */
+ return NULL;
+
+ case BAS_BULKREAD:
+ ring_size_kb = 256;
+ break;
+ case BAS_BULKWRITE:
+ ring_size_kb = 16 * 1024;
+ break;
+ case BAS_VACUUM:
+ ring_size_kb = 256;
+ break;
+
+ default:
+ elog(ERROR, "unrecognized buffer access strategy: %d",
+ (int) btype);
+ return NULL; /* keep compiler quiet */
+ }
+
+ return GetAccessStrategyWithSize(btype, ring_size_kb);
+}
+
+/*
+ * GetAccessStrategyWithSize -- create a BufferAccessStrategy object with a
+ * number of buffers equivalent to the passed in size.
+ *
+ * If the given ring size is 0, no BufferAccessStrategy will be created and
+ * the function will return NULL. ring_size_kb must not be negative.
+ */
+BufferAccessStrategy
+GetAccessStrategyWithSize(BufferAccessStrategyType btype, int ring_size_kb)
+{
+ int ring_buffers;
+ BufferAccessStrategy strategy;
+
+ Assert(ring_size_kb >= 0);
+
+ /* Figure out how many buffers ring_size_kb is */
+ ring_buffers = ring_size_kb / (BLCKSZ / 1024);
+
+ /* 0 means unlimited, so no BufferAccessStrategy required */
+ if (ring_buffers == 0)
+ return NULL;
+
+ /* Cap to 1/8th of shared_buffers */
+ ring_buffers = Min(NBuffers / 8, ring_buffers);
+
+ /* NBuffers should never be less than 16, so this shouldn't happen */
+ Assert(ring_buffers > 0);
+
+ /* Allocate the object and initialize all elements to zeroes */
+ strategy = (BufferAccessStrategy)
+ palloc0(offsetof(BufferAccessStrategyData, buffers) +
+ ring_buffers * sizeof(Buffer));
+
+ /* Set fields that don't start out zero */
+ strategy->btype = btype;
+ strategy->nbuffers = ring_buffers;
+
+ return strategy;
+}
+
+/*
+ * GetAccessStrategyBufferCount -- an accessor for the number of buffers in
+ * the ring
+ *
+ * Returns 0 on NULL input to match behavior of GetAccessStrategyWithSize()
+ * returning NULL with 0 size.
+ */
+int
+GetAccessStrategyBufferCount(BufferAccessStrategy strategy)
+{
+ if (strategy == NULL)
+ return 0;
+
+ return strategy->nbuffers;
+}
+
+/*
+ * FreeAccessStrategy -- release a BufferAccessStrategy object
+ *
+ * A simple pfree would do at the moment, but we would prefer that callers
+ * don't assume that much about the representation of BufferAccessStrategy.
+ */
+void
+FreeAccessStrategy(BufferAccessStrategy strategy)
+{
+ /* don't crash if called on a "default" strategy */
+ if (strategy != NULL)
+ pfree(strategy);
+}
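+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): a bulk
+ * sequential read using a ring strategy. The scanning function is
+ * hypothetical; the strategy and buffer-manager calls are the real API.
+ */
+#ifdef NOT_USED
+static void
+example_bulkread_scan(Relation rel)
+{
+ BufferAccessStrategy strategy = GetAccessStrategy(BAS_BULKREAD);
+ BlockNumber nblocks = RelationGetNumberOfBlocks(rel);
+
+ for (BlockNumber blkno = 0; blkno < nblocks; blkno++)
+ {
+ Buffer buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
+ RBM_NORMAL, strategy);
+
+ /* ... process the page under a share lock ... */
+ ReleaseBuffer(buf);
+ }
+
+ FreeAccessStrategy(strategy);
+}
+#endif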
+
+/*
+ * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
+ * ring is empty / not usable.
+ *
+ * The bufhdr spin lock is held on the returned buffer.
+ */
+static BufferDesc *
+GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
+{
+ BufferDesc *buf;
+ Buffer bufnum;
+ uint32 local_buf_state; /* to avoid repeated (de-)referencing */
+
+
+ /* Advance to next ring slot */
+ if (++strategy->current >= strategy->nbuffers)
+ strategy->current = 0;
+
+ /*
+ * If the slot hasn't been filled yet, tell the caller to allocate a new
+ * buffer with the normal allocation strategy. The caller will then fill
+ * this slot by calling AddBufferToRing with the new buffer.
+ */
+ bufnum = strategy->buffers[strategy->current];
+ if (bufnum == InvalidBuffer)
+ return NULL;
+
+ /*
+ * If the buffer is pinned we cannot use it under any circumstances.
+ *
+ * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
+ * since our own previous usage of the ring element would have left it
+ * there, but it might've been decremented by clock sweep since then). A
+ * higher usage_count indicates someone else has touched the buffer, so we
+ * shouldn't re-use it.
+ */
+ buf = GetBufferDescriptor(bufnum - 1);
+ local_buf_state = LockBufHdr(buf);
+ if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
+ && BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
+ {
+ *buf_state = local_buf_state;
+ return buf;
+ }
+ UnlockBufHdr(buf, local_buf_state);
+
+ /*
+ * Tell caller to allocate a new buffer with the normal allocation
+ * strategy. The caller will then replace this ring element via AddBufferToRing.
+ */
+ return NULL;
+}
+
+/*
+ * AddBufferToRing -- add a buffer to the buffer ring
+ *
+ * Caller must hold the buffer header spinlock on the buffer. Since this
+ * is called with the spinlock held, it had better be quite cheap.
+ */
+static void
+AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
+{
+ strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
+}
+
+/*
+ * Utility function returning the IOContext of a given BufferAccessStrategy's
+ * strategy ring.
+ */
+IOContext
+IOContextForStrategy(BufferAccessStrategy strategy)
+{
+ if (!strategy)
+ return IOCONTEXT_NORMAL;
+
+ switch (strategy->btype)
+ {
+ case BAS_NORMAL:
+
+ /*
+ * Currently, GetAccessStrategy() returns NULL for
+ * BufferAccessStrategyType BAS_NORMAL, so this case is
+ * unreachable.
+ */
+ pg_unreachable();
+ return IOCONTEXT_NORMAL;
+ case BAS_BULKREAD:
+ return IOCONTEXT_BULKREAD;
+ case BAS_BULKWRITE:
+ return IOCONTEXT_BULKWRITE;
+ case BAS_VACUUM:
+ return IOCONTEXT_VACUUM;
+ }
+
+ elog(ERROR, "unrecognized BufferAccessStrategyType: %d", strategy->btype);
+ pg_unreachable();
+}
+
+/*
+ * StrategyRejectBuffer -- consider rejecting a dirty buffer
+ *
+ * When a nondefault strategy is used, the buffer manager calls this function
+ * when it turns out that the buffer selected by StrategyGetBuffer needs to
+ * be written out and doing so would require flushing WAL too. This gives us
+ * a chance to choose a different victim.
+ *
+ * Returns true if buffer manager should ask for a new victim, and false
+ * if this buffer should be written and re-used.
+ */
+bool
+StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf, bool from_ring)
+{
+ /* We only do this in bulkread mode */
+ if (strategy->btype != BAS_BULKREAD)
+ return false;
+
+ /* Don't muck with behavior of normal buffer-replacement strategy */
+ if (!from_ring ||
+ strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
+ return false;
+
+ /*
+ * Remove the dirty buffer from the ring; necessary to prevent infinite
+ * loop if all ring members are dirty.
+ */
+ strategy->buffers[strategy->current] = InvalidBuffer;
+
+ return true;
+}
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
new file mode 100644
index 0000000..55953c3
--- /dev/null
+++ b/src/backend/storage/buffer/localbuf.c
@@ -0,0 +1,821 @@
+/*-------------------------------------------------------------------------
+ *
+ * localbuf.c
+ * local buffer manager. Fast buffer manager for temporary tables,
+ * which never need to be WAL-logged or checkpointed, etc.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994-5, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/buffer/localbuf.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "catalog/catalog.h"
+#include "executor/instrument.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/bufmgr.h"
+#include "utils/guc_hooks.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+
+/*#define LBDEBUG*/
+
+/* entry for buffer lookup hashtable */
+typedef struct
+{
+ BufferTag key; /* Tag of a disk page */
+ int id; /* Associated local buffer's index */
+} LocalBufferLookupEnt;
+
+/* Note: this macro only works on local buffers, not shared ones! */
+#define LocalBufHdrGetBlock(bufHdr) \
+ LocalBufferBlockPointers[-((bufHdr)->buf_id + 2)]
+
+int NLocBuffer = 0; /* until buffers are initialized */
+
+BufferDesc *LocalBufferDescriptors = NULL;
+Block *LocalBufferBlockPointers = NULL;
+int32 *LocalRefCount = NULL;
+
+static int nextFreeLocalBufId = 0;
+
+static HTAB *LocalBufHash = NULL;
+
+/* number of local buffers pinned at least once */
+static int NLocalPinnedBuffers = 0;
+
+
+static void InitLocalBuffers(void);
+static Block GetLocalBufferStorage(void);
+static Buffer GetLocalVictimBuffer(void);
+
+
+/*
+ * PrefetchLocalBuffer -
+ * initiate asynchronous read of a block of a relation
+ *
+ * Do PrefetchBuffer's work for temporary relations.
+ * No-op if prefetching isn't compiled in.
+ */
+PrefetchBufferResult
+PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
+ BlockNumber blockNum)
+{
+ PrefetchBufferResult result = {InvalidBuffer, false};
+ BufferTag newTag; /* identity of requested block */
+ LocalBufferLookupEnt *hresult;
+
+ InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
+
+ /* Initialize local buffers if first request in this session */
+ if (LocalBufHash == NULL)
+ InitLocalBuffers();
+
+ /* See if the desired buffer already exists */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &newTag, HASH_FIND, NULL);
+
+ if (hresult)
+ {
+ /* Yes, so nothing to do */
+ result.recent_buffer = -hresult->id - 1;
+ }
+ else
+ {
+#ifdef USE_PREFETCH
+ /* Not in buffers, so initiate prefetch */
+ if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+ smgrprefetch(smgr, forkNum, blockNum))
+ {
+ result.initiated_io = true;
+ }
+#endif /* USE_PREFETCH */
+ }
+
+ return result;
+}
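+
+/*
+ * Illustrative sketch (not compiled, guarded by NOT_USED): prefetch then
+ * read against a temporary table, via the generic PrefetchBuffer() and
+ * ReadBuffer() entry points that end up here for local buffers. The
+ * caller shown is hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_prefetch_temp_block(Relation temprel, BlockNumber blkno)
+{
+ Buffer buf;
+
+ /* hint that we will need this block soon; may be a no-op */
+ (void) PrefetchBuffer(temprel, MAIN_FORKNUM, blkno);
+
+ /* ... do other useful work while the OS (maybe) reads it in ... */
+
+ buf = ReadBuffer(temprel, blkno);
+ /* ... use the page ... */
+ ReleaseBuffer(buf);
+}
+#endif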
+
+
+/*
+ * LocalBufferAlloc -
+ * Find or create a local buffer for the given page of the given relation.
+ *
+ * API is similar to bufmgr.c's BufferAlloc, except that we do not need
+ * to do any locking since this is all local. Also, IO_IN_PROGRESS
+ * does not get set. Lastly, we support only default access strategy
+ * (hence, usage_count is always advanced).
+ */
+BufferDesc *
+LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum,
+ bool *foundPtr)
+{
+ BufferTag newTag; /* identity of requested block */
+ LocalBufferLookupEnt *hresult;
+ BufferDesc *bufHdr;
+ Buffer victim_buffer;
+ int bufid;
+ bool found;
+
+ InitBufferTag(&newTag, &smgr->smgr_rlocator.locator, forkNum, blockNum);
+
+ /* Initialize local buffers if first request in this session */
+ if (LocalBufHash == NULL)
+ InitLocalBuffers();
+
+ /* See if the desired buffer already exists */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &newTag, HASH_FIND, NULL);
+
+ if (hresult)
+ {
+ bufid = hresult->id;
+ bufHdr = GetLocalBufferDescriptor(bufid);
+ Assert(BufferTagsEqual(&bufHdr->tag, &newTag));
+
+ *foundPtr = PinLocalBuffer(bufHdr, true);
+ }
+ else
+ {
+ uint32 buf_state;
+
+ victim_buffer = GetLocalVictimBuffer();
+ bufid = -victim_buffer - 1;
+ bufHdr = GetLocalBufferDescriptor(bufid);
+
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &newTag, HASH_ENTER, &found);
+ if (found) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ hresult->id = bufid;
+
+ /*
+ * it's all ours now.
+ */
+ bufHdr->tag = newTag;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ *foundPtr = false;
+ }
+
+ return bufHdr;
+}
+
+static Buffer
+GetLocalVictimBuffer(void)
+{
+ int victim_bufid;
+ int trycounter;
+ uint32 buf_state;
+ BufferDesc *bufHdr;
+
+ ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
+
+ /*
+ * Need to get a new buffer. We use a clock sweep algorithm (essentially
+ * the same as what freelist.c does now...)
+ */
+ trycounter = NLocBuffer;
+ for (;;)
+ {
+ victim_bufid = nextFreeLocalBufId;
+
+ if (++nextFreeLocalBufId >= NLocBuffer)
+ nextFreeLocalBufId = 0;
+
+ bufHdr = GetLocalBufferDescriptor(victim_bufid);
+
+ if (LocalRefCount[victim_bufid] == 0)
+ {
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if (BUF_STATE_GET_USAGECOUNT(buf_state) > 0)
+ {
+ buf_state -= BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ trycounter = NLocBuffer;
+ }
+ else
+ {
+ /* Found a usable buffer */
+ PinLocalBuffer(bufHdr, false);
+ break;
+ }
+ }
+ else if (--trycounter == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("no empty local buffer available")));
+ }
+
+ /*
+ * lazy memory allocation: allocate space on first use of a buffer.
+ */
+ if (LocalBufHdrGetBlock(bufHdr) == NULL)
+ {
+ /* Set pointer for use by BufferGetBlock() macro */
+ LocalBufHdrGetBlock(bufHdr) = GetLocalBufferStorage();
+ }
+
+ /*
+ * This buffer is not referenced but it might still be dirty. If that's
+ * the case, write it out before reusing it!
+ */
+ if (buf_state & BM_DIRTY)
+ {
+ instr_time io_start;
+ SMgrRelation oreln;
+ Page localpage = (char *) LocalBufHdrGetBlock(bufHdr);
+
+ /* Find smgr relation for buffer */
+ oreln = smgropen(BufTagGetRelFileLocator(&bufHdr->tag), MyBackendId);
+
+ PageSetChecksumInplace(localpage, bufHdr->tag.blockNum);
+
+ io_start = pgstat_prepare_io_time();
+
+ /* And write... */
+ smgrwrite(oreln,
+ BufTagGetForkNum(&bufHdr->tag),
+ bufHdr->tag.blockNum,
+ localpage,
+ false);
+
+ /* Temporary table I/O does not use Buffer Access Strategies */
+ pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL,
+ IOOP_WRITE, io_start, 1);
+
+ /* Mark not-dirty now in case we error out below */
+ buf_state &= ~BM_DIRTY;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+
+ pgBufferUsage.local_blks_written++;
+ }
+
+ /*
+ * Remove the victim buffer from the hashtable and mark as invalid.
+ */
+ if (buf_state & BM_TAG_VALID)
+ {
+ LocalBufferLookupEnt *hresult;
+
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* mark buffer invalid just in case hash insert fails */
+ ClearBufferTag(&bufHdr->tag);
+ buf_state &= ~(BUF_FLAG_MASK | BUF_USAGECOUNT_MASK);
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ pgstat_count_io_op(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EVICT);
+ }
+
+ return BufferDescriptorGetBuffer(bufHdr);
+}
+
+/* see LimitAdditionalPins() */
+static void
+LimitAdditionalLocalPins(uint32 *additional_pins)
+{
+ uint32 max_pins;
+
+ if (*additional_pins <= 1)
+ return;
+
+ /*
+ * In contrast to LimitAdditionalPins() other backends don't play a role
+ * here. We can allow up to NLocBuffer pins in total.
+ */
+ max_pins = (NLocBuffer - NLocalPinnedBuffers);
+
+ if (*additional_pins >= max_pins)
+ *additional_pins = max_pins;
+}
+
+/*
+ * Implementation of ExtendBufferedRelBy() and ExtendBufferedRelTo() for
+ * temporary buffers.
+ */
+BlockNumber
+ExtendBufferedRelLocal(BufferManagerRelation bmr,
+ ForkNumber fork,
+ uint32 flags,
+ uint32 extend_by,
+ BlockNumber extend_upto,
+ Buffer *buffers,
+ uint32 *extended_by)
+{
+ BlockNumber first_block;
+ instr_time io_start;
+
+ /* Initialize local buffers if first request in this session */
+ if (LocalBufHash == NULL)
+ InitLocalBuffers();
+
+ LimitAdditionalLocalPins(&extend_by);
+
+ for (uint32 i = 0; i < extend_by; i++)
+ {
+ BufferDesc *buf_hdr;
+ Block buf_block;
+
+ buffers[i] = GetLocalVictimBuffer();
+ buf_hdr = GetLocalBufferDescriptor(-buffers[i] - 1);
+ buf_block = LocalBufHdrGetBlock(buf_hdr);
+
+ /* new buffers are zero-filled */
+ MemSet((char *) buf_block, 0, BLCKSZ);
+ }
+
+ first_block = smgrnblocks(bmr.smgr, fork);
+
+ if (extend_upto != InvalidBlockNumber)
+ {
+ /*
+ * In contrast to shared relations, nothing could change the relation
+ * size concurrently. Thus we shouldn't end up finding that we don't
+ * need to do anything.
+ */
+ Assert(first_block <= extend_upto);
+
+ Assert((uint64) first_block + extend_by <= extend_upto);
+ }
+
+ /* Fail if relation is already at maximum possible length */
+ if ((uint64) first_block + extend_by >= MaxBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend relation %s beyond %u blocks",
+ relpath(bmr.smgr->smgr_rlocator, fork),
+ MaxBlockNumber)));
+
+ for (int i = 0; i < extend_by; i++)
+ {
+ int victim_buf_id;
+ BufferDesc *victim_buf_hdr;
+ BufferTag tag;
+ LocalBufferLookupEnt *hresult;
+ bool found;
+
+ victim_buf_id = -buffers[i] - 1;
+ victim_buf_hdr = GetLocalBufferDescriptor(victim_buf_id);
+
+ InitBufferTag(&tag, &bmr.smgr->smgr_rlocator.locator, fork, first_block + i);
+
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, (void *) &tag, HASH_ENTER, &found);
+ if (found)
+ {
+ BufferDesc *existing_hdr;
+ uint32 buf_state;
+
+ UnpinLocalBuffer(BufferDescriptorGetBuffer(victim_buf_hdr));
+
+ existing_hdr = GetLocalBufferDescriptor(hresult->id);
+ PinLocalBuffer(existing_hdr, false);
+ buffers[i] = BufferDescriptorGetBuffer(existing_hdr);
+
+ buf_state = pg_atomic_read_u32(&existing_hdr->state);
+ Assert(buf_state & BM_TAG_VALID);
+ Assert(!(buf_state & BM_DIRTY));
+ buf_state &= ~BM_VALID;
+ pg_atomic_unlocked_write_u32(&existing_hdr->state, buf_state);
+ }
+ else
+ {
+ uint32 buf_state = pg_atomic_read_u32(&victim_buf_hdr->state);
+
+ Assert(!(buf_state & (BM_VALID | BM_TAG_VALID | BM_DIRTY | BM_JUST_DIRTIED)));
+
+ victim_buf_hdr->tag = tag;
+
+ buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
+
+ pg_atomic_unlocked_write_u32(&victim_buf_hdr->state, buf_state);
+
+ hresult->id = victim_buf_id;
+ }
+ }
+
+ io_start = pgstat_prepare_io_time();
+
+ /* actually extend relation */
+ smgrzeroextend(bmr.smgr, fork, first_block, extend_by, false);
+
+ pgstat_count_io_op_time(IOOBJECT_TEMP_RELATION, IOCONTEXT_NORMAL, IOOP_EXTEND,
+ io_start, extend_by);
+
+ for (int i = 0; i < extend_by; i++)
+ {
+ Buffer buf = buffers[i];
+ BufferDesc *buf_hdr;
+ uint32 buf_state;
+
+ buf_hdr = GetLocalBufferDescriptor(-buf - 1);
+
+ buf_state = pg_atomic_read_u32(&buf_hdr->state);
+ buf_state |= BM_VALID;
+ pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+ }
+
+ *extended_by = extend_by;
+
+ pgBufferUsage.local_blks_written += extend_by;
+
+ return first_block;
+}
+
+/*
+ * MarkLocalBufferDirty -
+ * mark a local buffer dirty
+ */
+void
+MarkLocalBufferDirty(Buffer buffer)
+{
+ int bufid;
+ BufferDesc *bufHdr;
+ uint32 buf_state;
+
+ Assert(BufferIsLocal(buffer));
+
+#ifdef LBDEBUG
+ fprintf(stderr, "LB DIRTY %d\n", buffer);
+#endif
+
+ bufid = -buffer - 1;
+
+ Assert(LocalRefCount[bufid] > 0);
+
+ bufHdr = GetLocalBufferDescriptor(bufid);
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if (!(buf_state & BM_DIRTY))
+ pgBufferUsage.local_blks_dirtied++;
+
+ buf_state |= BM_DIRTY;
+
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+}
+
+/*
+ * DropRelationLocalBuffers
+ * This function removes from the buffer pool all the pages of the
+ * specified relation that have block numbers >= firstDelBlock.
+ * (In particular, with firstDelBlock = 0, all pages are removed.)
+ * Dirty pages are simply dropped, without bothering to write them
+ * out first. Therefore, this is NOT rollback-able, and so should be
+ * used only with extreme caution!
+ *
+ * See DropRelationBuffers in bufmgr.c for more notes.
+ */
+void
+DropRelationLocalBuffers(RelFileLocator rlocator, ForkNumber forkNum,
+ BlockNumber firstDelBlock)
+{
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
+ LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if ((buf_state & BM_TAG_VALID) &&
+ BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator) &&
+ BufTagGetForkNum(&bufHdr->tag) == forkNum &&
+ bufHdr->tag.blockNum >= firstDelBlock)
+ {
+ if (LocalRefCount[i] != 0)
+ elog(ERROR, "block %u of %s is still referenced (local %u)",
+ bufHdr->tag.blockNum,
+ relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
+ MyBackendId,
+ BufTagGetForkNum(&bufHdr->tag)),
+ LocalRefCount[i]);
+
+ /* Remove entry from hashtable */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* Mark buffer invalid */
+ ClearBufferTag(&bufHdr->tag);
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+}
+
+/*
+ * DropRelationAllLocalBuffers
+ * This function removes from the buffer pool all pages of all forks
+ * of the specified relation.
+ *
+ * See DropRelationsAllBuffers in bufmgr.c for more notes.
+ */
+void
+DropRelationAllLocalBuffers(RelFileLocator rlocator)
+{
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ BufferDesc *bufHdr = GetLocalBufferDescriptor(i);
+ LocalBufferLookupEnt *hresult;
+ uint32 buf_state;
+
+ buf_state = pg_atomic_read_u32(&bufHdr->state);
+
+ if ((buf_state & BM_TAG_VALID) &&
+ BufTagMatchesRelFileLocator(&bufHdr->tag, &rlocator))
+ {
+ if (LocalRefCount[i] != 0)
+ elog(ERROR, "block %u of %s is still referenced (local %u)",
+ bufHdr->tag.blockNum,
+ relpathbackend(BufTagGetRelFileLocator(&bufHdr->tag),
+ MyBackendId,
+ BufTagGetForkNum(&bufHdr->tag)),
+ LocalRefCount[i]);
+ /* Remove entry from hashtable */
+ hresult = (LocalBufferLookupEnt *)
+ hash_search(LocalBufHash, &bufHdr->tag, HASH_REMOVE, NULL);
+ if (!hresult) /* shouldn't happen */
+ elog(ERROR, "local buffer hash table corrupted");
+ /* Mark buffer invalid */
+ ClearBufferTag(&bufHdr->tag);
+ buf_state &= ~BUF_FLAG_MASK;
+ buf_state &= ~BUF_USAGECOUNT_MASK;
+ pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
+ }
+ }
+}
+
+/*
+ * InitLocalBuffers -
+ * init the local buffer cache. Since most queries (esp. multi-user ones)
+ * don't involve local buffers, we delay allocating actual memory for the
+ * buffers until we need them; just make the buffer headers here.
+ */
+static void
+InitLocalBuffers(void)
+{
+ int nbufs = num_temp_buffers;
+ HASHCTL info;
+ int i;
+
+ /*
+ * Parallel workers can't access data in temporary tables, because they
+ * have no visibility into the local buffers of their leader. This is a
+ * convenient, low-cost place to provide a backstop check for that. Note
+ * that we don't wish to prevent a parallel worker from accessing catalog
+ * metadata about a temp table, so checks at higher levels would be
+ * inappropriate.
+ */
+ if (IsParallelWorker())
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TRANSACTION_STATE),
+ errmsg("cannot access temporary tables during a parallel operation")));
+
+ /* Allocate and zero buffer headers and auxiliary arrays */
+ LocalBufferDescriptors = (BufferDesc *) calloc(nbufs, sizeof(BufferDesc));
+ LocalBufferBlockPointers = (Block *) calloc(nbufs, sizeof(Block));
+ LocalRefCount = (int32 *) calloc(nbufs, sizeof(int32));
+ if (!LocalBufferDescriptors || !LocalBufferBlockPointers || !LocalRefCount)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ nextFreeLocalBufId = 0;
+
+ /* initialize fields that need to start off nonzero */
+ for (i = 0; i < nbufs; i++)
+ {
+ BufferDesc *buf = GetLocalBufferDescriptor(i);
+
+ /*
+ * negative to indicate local buffer. This is tricky: shared buffers
+ * start with 0. We have to start with -2. (Note that the routine
+ * BufferDescriptorGetBuffer adds 1 to buf_id so our first buffer id
+ * is -1.)
+ */
+ buf->buf_id = -i - 2;
+
+ /*
+ * Intentionally do not initialize the buffer's atomic variable
+ * (besides zeroing the underlying memory above). That way we get
+ * errors on platforms without atomics, if somebody (re-)introduces
+ * atomic operations for local buffers.
+ */
+ }
+
+ /* Create the lookup hash table */
+ info.keysize = sizeof(BufferTag);
+ info.entrysize = sizeof(LocalBufferLookupEnt);
+
+ LocalBufHash = hash_create("Local Buffer Lookup Table",
+ nbufs,
+ &info,
+ HASH_ELEM | HASH_BLOBS);
+
+ if (!LocalBufHash)
+ elog(ERROR, "could not initialize local buffer hash table");
+
+ /* Initialization done, mark buffers allocated */
+ NLocBuffer = nbufs;
+}
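
The numbering convention spelled out in the comment above (buf_id = -i - 2, with BufferDescriptorGetBuffer() adding 1, so the first local Buffer is -1) can be verified with a trivial standalone loop; this is purely illustrative and assumes nothing beyond integer arithmetic.

#include <assert.h>
#include <stdio.h>

int
main(void)
{
    for (int i = 0; i < 4; i++)
    {
        int buf_id = -i - 2;        /* as assigned in InitLocalBuffers() */
        int buffer = buf_id + 1;    /* net effect of BufferDescriptorGetBuffer() */

        assert(buffer == -i - 1);   /* local Buffer values are -1, -2, -3, ... */
        assert(-buffer - 1 == i);   /* and "-buffer - 1" recovers the array index */
        printf("index %d -> buf_id %d -> Buffer %d\n", i, buf_id, buffer);
    }
    return 0;
}
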
+
+/*
+ * XXX: We could have a slightly more efficient version of PinLocalBuffer()
+ * that does not support adjusting the usagecount - but so far it does not
+ * seem worth the trouble.
+ */
+bool
+PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount)
+{
+ uint32 buf_state;
+ Buffer buffer = BufferDescriptorGetBuffer(buf_hdr);
+ int bufid = -buffer - 1;
+
+ buf_state = pg_atomic_read_u32(&buf_hdr->state);
+
+ if (LocalRefCount[bufid] == 0)
+ {
+ NLocalPinnedBuffers++;
+ if (adjust_usagecount &&
+ BUF_STATE_GET_USAGECOUNT(buf_state) < BM_MAX_USAGE_COUNT)
+ {
+ buf_state += BUF_USAGECOUNT_ONE;
+ pg_atomic_unlocked_write_u32(&buf_hdr->state, buf_state);
+ }
+ }
+ LocalRefCount[bufid]++;
+ ResourceOwnerRememberBuffer(CurrentResourceOwner,
+ BufferDescriptorGetBuffer(buf_hdr));
+
+ return buf_state & BM_VALID;
+}
+
+void
+UnpinLocalBuffer(Buffer buffer)
+{
+ int buffid = -buffer - 1;
+
+ Assert(BufferIsLocal(buffer));
+ Assert(LocalRefCount[buffid] > 0);
+ Assert(NLocalPinnedBuffers > 0);
+
+ ResourceOwnerForgetBuffer(CurrentResourceOwner, buffer);
+ if (--LocalRefCount[buffid] == 0)
+ NLocalPinnedBuffers--;
+}
+
+/*
+ * GUC check_hook for temp_buffers
+ */
+bool
+check_temp_buffers(int *newval, void **extra, GucSource source)
+{
+ /*
+ * Once local buffers have been initialized, it's too late to change this.
+ * However, if this is only a test call, allow it.
+ */
+ if (source != PGC_S_TEST && NLocBuffer && NLocBuffer != *newval)
+ {
+ GUC_check_errdetail("\"temp_buffers\" cannot be changed after any temporary tables have been accessed in the session.");
+ return false;
+ }
+ return true;
+}
+
+/*
+ * GetLocalBufferStorage - allocate memory for a local buffer
+ *
+ * The idea of this function is to aggregate our requests for storage
+ * so that the memory manager doesn't see a whole lot of relatively small
+ * requests. Since we'll never give back a local buffer once it's created
+ * within a particular process, no point in burdening memmgr with separately
+ * managed chunks.
+ */
+static Block
+GetLocalBufferStorage(void)
+{
+ static char *cur_block = NULL;
+ static int next_buf_in_block = 0;
+ static int num_bufs_in_block = 0;
+ static int total_bufs_allocated = 0;
+ static MemoryContext LocalBufferContext = NULL;
+
+ char *this_buf;
+
+ Assert(total_bufs_allocated < NLocBuffer);
+
+ if (next_buf_in_block >= num_bufs_in_block)
+ {
+ /* Need to make a new request to memmgr */
+ int num_bufs;
+
+ /*
+ * We allocate local buffers in a context of their own, so that the
+ * space eaten for them is easily recognizable in MemoryContextStats
+ * output. Create the context on first use.
+ */
+ if (LocalBufferContext == NULL)
+ LocalBufferContext =
+ AllocSetContextCreate(TopMemoryContext,
+ "LocalBufferContext",
+ ALLOCSET_DEFAULT_SIZES);
+
+ /* Start with a 16-buffer request; subsequent ones double each time */
+ num_bufs = Max(num_bufs_in_block * 2, 16);
+ /* But not more than what we need for all remaining local bufs */
+ num_bufs = Min(num_bufs, NLocBuffer - total_bufs_allocated);
+ /* And don't overflow MaxAllocSize, either */
+ num_bufs = Min(num_bufs, MaxAllocSize / BLCKSZ);
+
+ /* Buffers should be I/O aligned. */
+ cur_block = (char *)
+ TYPEALIGN(PG_IO_ALIGN_SIZE,
+ MemoryContextAlloc(LocalBufferContext,
+ num_bufs * BLCKSZ + PG_IO_ALIGN_SIZE));
+ next_buf_in_block = 0;
+ num_bufs_in_block = num_bufs;
+ }
+
+ /* Allocate next buffer in current memory block */
+ this_buf = cur_block + next_buf_in_block * BLCKSZ;
+ next_buf_in_block++;
+ total_bufs_allocated++;
+
+ return (Block) this_buf;
+}
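
The request schedule used above (start at 16 buffers, double each time, but never more than what is still needed nor more than MaxAllocSize worth) can be reproduced with a standalone loop; BLCKSZ_DEMO, MAX_ALLOC_DEMO, and nloc_buffer are stand-ins chosen only for illustration.

#include <stdio.h>

#define BLCKSZ_DEMO     8192            /* stand-in for BLCKSZ */
#define MAX_ALLOC_DEMO  0x3fffffff      /* stand-in for MaxAllocSize */

static int Max(int a, int b) { return (a > b) ? a : b; }
static int Min(int a, int b) { return (a < b) ? a : b; }

int
main(void)
{
    int nloc_buffer = 100;              /* stand-in for NLocBuffer */
    int allocated = 0;                  /* stand-in for total_bufs_allocated */
    int bufs_in_block = 0;              /* stand-in for num_bufs_in_block */

    while (allocated < nloc_buffer)
    {
        int num_bufs = Max(bufs_in_block * 2, 16);

        num_bufs = Min(num_bufs, nloc_buffer - allocated);
        num_bufs = Min(num_bufs, MAX_ALLOC_DEMO / BLCKSZ_DEMO);

        printf("request %d buffers (%d bytes)\n", num_bufs, num_bufs * BLCKSZ_DEMO);
        allocated += num_bufs;
        bufs_in_block = num_bufs;
    }
    return 0;                           /* prints requests of 16, 32, then 52 buffers */
}
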
+
+/*
+ * CheckForLocalBufferLeaks - ensure this backend holds no local buffer pins
+ *
+ * This is just like CheckForBufferLeaks(), but for local buffers.
+ */
+static void
+CheckForLocalBufferLeaks(void)
+{
+#ifdef USE_ASSERT_CHECKING
+ if (LocalRefCount)
+ {
+ int RefCountErrors = 0;
+ int i;
+
+ for (i = 0; i < NLocBuffer; i++)
+ {
+ if (LocalRefCount[i] != 0)
+ {
+ Buffer b = -i - 1;
+
+ PrintBufferLeakWarning(b);
+ RefCountErrors++;
+ }
+ }
+ Assert(RefCountErrors == 0);
+ }
+#endif
+}
+
+/*
+ * AtEOXact_LocalBuffers - clean up at end of transaction.
+ *
+ * This is just like AtEOXact_Buffers, but for local buffers.
+ */
+void
+AtEOXact_LocalBuffers(bool isCommit)
+{
+ CheckForLocalBufferLeaks();
+}
+
+/*
+ * AtProcExit_LocalBuffers - ensure we have dropped pins during backend exit.
+ *
+ * This is just like AtProcExit_Buffers, but for local buffers.
+ */
+void
+AtProcExit_LocalBuffers(void)
+{
+ /*
+ * We shouldn't be holding any remaining pins; if we are, and assertions
+ * aren't enabled, we'll fail later in DropRelationBuffers while trying to
+ * drop the temp rels.
+ */
+ CheckForLocalBufferLeaks();
+}
diff --git a/src/backend/storage/buffer/meson.build b/src/backend/storage/buffer/meson.build
new file mode 100644
index 0000000..ea2f9c0
--- /dev/null
+++ b/src/backend/storage/buffer/meson.build
@@ -0,0 +1,9 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'buf_init.c',
+ 'buf_table.c',
+ 'bufmgr.c',
+ 'freelist.c',
+ 'localbuf.c',
+)
diff --git a/src/backend/storage/file/Makefile b/src/backend/storage/file/Makefile
new file mode 100644
index 0000000..660ac51
--- /dev/null
+++ b/src/backend/storage/file/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/file
+#
+# IDENTIFICATION
+# src/backend/storage/file/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/file
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ buffile.o \
+ copydir.o \
+ fd.o \
+ fileset.o \
+ reinit.o \
+ sharedfileset.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
new file mode 100644
index 0000000..41ab641
--- /dev/null
+++ b/src/backend/storage/file/buffile.c
@@ -0,0 +1,1039 @@
+/*-------------------------------------------------------------------------
+ *
+ * buffile.c
+ * Management of large buffered temporary files.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/buffile.c
+ *
+ * NOTES:
+ *
+ * BufFiles provide a very incomplete emulation of stdio atop virtual Files
+ * (as managed by fd.c). Currently, we only support the buffered-I/O
+ * aspect of stdio: a read or write of the low-level File occurs only
+ * when the buffer is filled or emptied. This is an even bigger win
+ * for virtual Files than for ordinary kernel files, since reducing the
+ * frequency with which a virtual File is touched reduces "thrashing"
+ * of opening/closing file descriptors.
+ *
+ * Note that BufFile structs are allocated with palloc(), and therefore
+ * will go away automatically at query/transaction end. Since the underlying
+ * virtual Files are made with OpenTemporaryFile, all resources for
+ * the file are certain to be cleaned up even if processing is aborted
+ * by ereport(ERROR). The data structures required are made in the
+ * palloc context that was current when the BufFile was created, and
+ * any external resources such as temp files are owned by the ResourceOwner
+ * that was current at that time.
+ *
+ * BufFile also supports temporary files that exceed the OS file size limit
+ * (by opening multiple fd.c temporary files). This is an essential feature
+ * for sorts and hashjoins on large amounts of data.
+ *
+ * BufFile supports temporary files that can be shared with other backends, as
+ * infrastructure for parallel execution. Such files need to be created as a
+ * member of a SharedFileSet that all participants are attached to.
+ *
+ * BufFile also supports temporary files that can be used by a single backend
+ * when the corresponding files need to survive across transactions and need
+ * to be opened and closed multiple times. Such files need to be created as a
+ * member of a FileSet.
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "commands/tablespace.h"
+#include "executor/instrument.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/buf_internals.h"
+#include "storage/buffile.h"
+#include "storage/fd.h"
+#include "utils/resowner.h"
+
+/*
+ * We break BufFiles into gigabyte-sized segments, regardless of RELSEG_SIZE.
+ * The reason is that we'd like large BufFiles to be spread across multiple
+ * tablespaces when available.
+ */
+#define MAX_PHYSICAL_FILESIZE 0x40000000
+#define BUFFILE_SEG_SIZE (MAX_PHYSICAL_FILESIZE / BLCKSZ)
+
+/*
+ * This data structure represents a buffered file that consists of one or
+ * more physical files (each accessed through a virtual file descriptor
+ * managed by fd.c).
+ */
+struct BufFile
+{
+ int numFiles; /* number of physical files in set */
+ /* all files except the last have length exactly MAX_PHYSICAL_FILESIZE */
+ File *files; /* palloc'd array with numFiles entries */
+
+ bool isInterXact; /* keep open over transactions? */
+ bool dirty; /* does buffer need to be written? */
+ bool readOnly; /* has the file been set to read only? */
+
+ FileSet *fileset; /* space for fileset based segment files */
+ const char *name; /* name of fileset based BufFile */
+
+ /*
+ * resowner is the ResourceOwner to use for underlying temp files. (We
+ * don't need to remember the memory context we're using explicitly,
+ * because after creation we only repalloc our arrays larger.)
+ */
+ ResourceOwner resowner;
+
+ /*
+ * "current pos" is position of start of buffer within the logical file.
+ * Position as seen by user of BufFile is (curFile, curOffset + pos).
+ */
+ int curFile; /* file index (0..n) part of current pos */
+ off_t curOffset; /* offset part of current pos */
+ int pos; /* next read/write position in buffer */
+ int nbytes; /* total # of valid bytes in buffer */
+
+ /*
+ * XXX Should ideally use PGIOAlignedBlock, but might need a way to avoid
+ * wasting per-file alignment padding when some users create many files.
+ */
+ PGAlignedBlock buffer;
+};
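
As the comments above note, each component file except the last holds exactly MAX_PHYSICAL_FILESIZE bytes, and the user-visible position is (curFile, curOffset + pos). The standalone sketch below shows the mapping between a single 64-bit logical offset and that (segment, offset) pair; SEG_SIZE is a stand-in for MAX_PHYSICAL_FILESIZE and the example values are arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define SEG_SIZE ((int64_t) 0x40000000)     /* stand-in for MAX_PHYSICAL_FILESIZE */

int
main(void)
{
    int64_t logical = 3 * SEG_SIZE + 12345;         /* an arbitrary logical byte position */
    int     file = (int) (logical / SEG_SIZE);      /* which 1 GB segment */
    int64_t offset = logical % SEG_SIZE;            /* offset within that segment */

    assert(file == 3 && offset == 12345);
    assert((int64_t) file * SEG_SIZE + offset == logical);
    printf("logical %lld -> segment %d, offset %lld\n",
           (long long) logical, file, (long long) offset);
    return 0;
}
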
+
+static BufFile *makeBufFileCommon(int nfiles);
+static BufFile *makeBufFile(File firstfile);
+static void extendBufFile(BufFile *file);
+static void BufFileLoadBuffer(BufFile *file);
+static void BufFileDumpBuffer(BufFile *file);
+static void BufFileFlush(BufFile *file);
+static File MakeNewFileSetSegment(BufFile *buffile, int segment);
+
+/*
+ * Create BufFile and perform the common initialization.
+ */
+static BufFile *
+makeBufFileCommon(int nfiles)
+{
+ BufFile *file = (BufFile *) palloc(sizeof(BufFile));
+
+ file->numFiles = nfiles;
+ file->isInterXact = false;
+ file->dirty = false;
+ file->resowner = CurrentResourceOwner;
+ file->curFile = 0;
+ file->curOffset = 0;
+ file->pos = 0;
+ file->nbytes = 0;
+
+ return file;
+}
+
+/*
+ * Create a BufFile given the first underlying physical file.
+ * NOTE: caller must set isInterXact if appropriate.
+ */
+static BufFile *
+makeBufFile(File firstfile)
+{
+ BufFile *file = makeBufFileCommon(1);
+
+ file->files = (File *) palloc(sizeof(File));
+ file->files[0] = firstfile;
+ file->readOnly = false;
+ file->fileset = NULL;
+ file->name = NULL;
+
+ return file;
+}
+
+/*
+ * Add another component temp file.
+ */
+static void
+extendBufFile(BufFile *file)
+{
+ File pfile;
+ ResourceOwner oldowner;
+
+ /* Be sure to associate the file with the BufFile's resource owner */
+ oldowner = CurrentResourceOwner;
+ CurrentResourceOwner = file->resowner;
+
+ if (file->fileset == NULL)
+ pfile = OpenTemporaryFile(file->isInterXact);
+ else
+ pfile = MakeNewFileSetSegment(file, file->numFiles);
+
+ Assert(pfile >= 0);
+
+ CurrentResourceOwner = oldowner;
+
+ file->files = (File *) repalloc(file->files,
+ (file->numFiles + 1) * sizeof(File));
+ file->files[file->numFiles] = pfile;
+ file->numFiles++;
+}
+
+/*
+ * Create a BufFile for a new temporary file (which will expand to become
+ * multiple temporary files if more than MAX_PHYSICAL_FILESIZE bytes are
+ * written to it).
+ *
+ * If interXact is true, the temp file will not be automatically deleted
+ * at end of transaction.
+ *
+ * Note: if interXact is true, the caller had better be calling us in a
+ * memory context, and with a resource owner, that will survive across
+ * transaction boundaries.
+ */
+BufFile *
+BufFileCreateTemp(bool interXact)
+{
+ BufFile *file;
+ File pfile;
+
+ /*
+ * Ensure that temp tablespaces are set up for OpenTemporaryFile to use.
+ * Possibly the caller will have done this already, but it seems useful to
+ * double-check here. Failure to do this at all would result in the temp
+ * files always getting placed in the default tablespace, which is a
+ * pretty hard-to-detect bug. Callers may prefer to do it earlier if they
+ * want to be sure that any required catalog access is done in some other
+ * resource context.
+ */
+ PrepareTempTablespaces();
+
+ pfile = OpenTemporaryFile(interXact);
+ Assert(pfile >= 0);
+
+ file = makeBufFile(pfile);
+ file->isInterXact = interXact;
+
+ return file;
+}
+
+/*
+ * Build the name for a given segment of a given BufFile.
+ */
+static void
+FileSetSegmentName(char *name, const char *buffile_name, int segment)
+{
+ snprintf(name, MAXPGPATH, "%s.%d", buffile_name, segment);
+}
+
+/*
+ * Create a new segment file backing a fileset based BufFile.
+ */
+static File
+MakeNewFileSetSegment(BufFile *buffile, int segment)
+{
+ char name[MAXPGPATH];
+ File file;
+
+ /*
+ * It is possible that there are files left over from before a crash
+ * restart with the same name. In order for BufFileOpenFileSet() not to
+ * get confused about how many segments there are, we'll unlink the next
+ * segment number if it already exists.
+ */
+ FileSetSegmentName(name, buffile->name, segment + 1);
+ FileSetDelete(buffile->fileset, name, true);
+
+ /* Create the new segment. */
+ FileSetSegmentName(name, buffile->name, segment);
+ file = FileSetCreate(buffile->fileset, name);
+
+ /* FileSetCreate would've errored out */
+ Assert(file > 0);
+
+ return file;
+}
+
+/*
+ * Create a BufFile that can be discovered and opened read-only by other
+ * backends that are attached to the same SharedFileSet using the same name.
+ *
+ * The naming scheme for fileset based BufFiles is left up to the calling code.
+ * The name will appear as part of one or more filenames on disk, and might
+ * provide clues to administrators about which subsystem is generating
+ * temporary file data. Since each SharedFileSet object is backed by one or
+ * more uniquely named temporary directories, names don't conflict with
+ * unrelated SharedFileSet objects.
+ */
+BufFile *
+BufFileCreateFileSet(FileSet *fileset, const char *name)
+{
+ BufFile *file;
+
+ file = makeBufFileCommon(1);
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+ file->files = (File *) palloc(sizeof(File));
+ file->files[0] = MakeNewFileSetSegment(file, 0);
+ file->readOnly = false;
+
+ return file;
+}
+
+/*
+ * Open a file that was previously created in another backend (or this one)
+ * with BufFileCreateFileSet in the same FileSet using the same name.
+ * The backend that created the file must have called BufFileClose() or
+ * BufFileExportFileSet() to make sure that it is ready to be opened by other
+ * backends and render it read-only. If missing_ok is true, missing files
+ * can be safely ignored and NULL is returned if no BufFile with the given
+ * name is found; otherwise, an error is thrown.
+ */
+BufFile *
+BufFileOpenFileSet(FileSet *fileset, const char *name, int mode,
+ bool missing_ok)
+{
+ BufFile *file;
+ char segment_name[MAXPGPATH];
+ Size capacity = 16;
+ File *files;
+ int nfiles = 0;
+
+ files = palloc(sizeof(File) * capacity);
+
+ /*
+ * We don't know how many segments there are, so we'll probe the
+ * filesystem to find out.
+ */
+ for (;;)
+ {
+ /* See if we need to expand our file segment array. */
+ if (nfiles + 1 > capacity)
+ {
+ capacity *= 2;
+ files = repalloc(files, sizeof(File) * capacity);
+ }
+ /* Try to load a segment. */
+ FileSetSegmentName(segment_name, name, nfiles);
+ files[nfiles] = FileSetOpen(fileset, segment_name, mode);
+ if (files[nfiles] <= 0)
+ break;
+ ++nfiles;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * If we didn't find any files at all, then no BufFile exists with this
+ * name.
+ */
+ if (nfiles == 0)
+ {
+ /* free the memory */
+ pfree(files);
+
+ if (missing_ok)
+ return NULL;
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary file \"%s\" from BufFile \"%s\": %m",
+ segment_name, name)));
+ }
+
+ file = makeBufFileCommon(nfiles);
+ file->files = files;
+ file->readOnly = (mode == O_RDONLY);
+ file->fileset = fileset;
+ file->name = pstrdup(name);
+
+ return file;
+}
+
+/*
+ * Delete a BufFile that was created by BufFileCreateFileSet in the given
+ * FileSet using the given name.
+ *
+ * It is not necessary to delete files explicitly with this function. It is
+ * provided only as a way to delete files proactively, rather than waiting for
+ * the FileSet to be cleaned up.
+ *
+ * Only one backend should attempt to delete a given name, and it should
+ * know that the file exists and has been exported or closed; otherwise,
+ * missing_ok should be passed as true.
+ */
+void
+BufFileDeleteFileSet(FileSet *fileset, const char *name, bool missing_ok)
+{
+ char segment_name[MAXPGPATH];
+ int segment = 0;
+ bool found = false;
+
+ /*
+ * We don't know how many segments the file has. We'll keep deleting
+ * until we run out. If we don't manage to find even an initial segment,
+ * raise an error.
+ */
+ for (;;)
+ {
+ FileSetSegmentName(segment_name, name, segment);
+ if (!FileSetDelete(fileset, segment_name, true))
+ break;
+ found = true;
+ ++segment;
+
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ if (!found && !missing_ok)
+ elog(ERROR, "could not delete unknown BufFile \"%s\"", name);
+}
+
+/*
+ * BufFileExportFileSet --- flush and make read-only, in preparation for sharing.
+ */
+void
+BufFileExportFileSet(BufFile *file)
+{
+ /* Must be a file belonging to a FileSet. */
+ Assert(file->fileset != NULL);
+
+ /* It's probably a bug if someone calls this twice. */
+ Assert(!file->readOnly);
+
+ BufFileFlush(file);
+ file->readOnly = true;
+}
+
+/*
+ * Close a BufFile
+ *
+ * Like fclose(), this also implicitly FileCloses the underlying File.
+ */
+void
+BufFileClose(BufFile *file)
+{
+ int i;
+
+ /* flush any unwritten data */
+ BufFileFlush(file);
+ /* close and delete the underlying file(s) */
+ for (i = 0; i < file->numFiles; i++)
+ FileClose(file->files[i]);
+ /* release the buffer space */
+ pfree(file->files);
+ pfree(file);
+}
+
+/*
+ * BufFileLoadBuffer
+ *
+ * Load some data into buffer, if possible, starting from curOffset.
+ * At call, must have dirty = false, pos and nbytes = 0.
+ * On exit, nbytes is number of bytes loaded.
+ */
+static void
+BufFileLoadBuffer(BufFile *file)
+{
+ File thisfile;
+ instr_time io_start;
+ instr_time io_time;
+
+ /*
+ * Advance to next component file if necessary and possible.
+ */
+ if (file->curOffset >= MAX_PHYSICAL_FILESIZE &&
+ file->curFile + 1 < file->numFiles)
+ {
+ file->curFile++;
+ file->curOffset = 0;
+ }
+
+ thisfile = file->files[file->curFile];
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+ else
+ INSTR_TIME_SET_ZERO(io_start);
+
+ /*
+ * Read whatever we can get, up to a full bufferload.
+ */
+ file->nbytes = FileRead(thisfile,
+ file->buffer.data,
+ sizeof(file->buffer),
+ file->curOffset,
+ WAIT_EVENT_BUFFILE_READ);
+ if (file->nbytes < 0)
+ {
+ file->nbytes = 0;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m",
+ FilePathName(thisfile))));
+ }
+
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_read_time, io_time, io_start);
+ }
+
+ /* we choose not to advance curOffset here */
+
+ if (file->nbytes > 0)
+ pgBufferUsage.temp_blks_read++;
+}
+
+/*
+ * BufFileDumpBuffer
+ *
+ * Dump buffer contents starting at curOffset.
+ * At call, should have dirty = true, nbytes > 0.
+ * On exit, dirty is cleared if successful write, and curOffset is advanced.
+ */
+static void
+BufFileDumpBuffer(BufFile *file)
+{
+ int wpos = 0;
+ int bytestowrite;
+ File thisfile;
+
+ /*
+ * Unlike BufFileLoadBuffer, we must dump the whole buffer even if it
+ * crosses a component-file boundary; so we need a loop.
+ */
+ while (wpos < file->nbytes)
+ {
+ off_t availbytes;
+ instr_time io_start;
+ instr_time io_time;
+
+ /*
+ * Advance to next component file if necessary and possible.
+ */
+ if (file->curOffset >= MAX_PHYSICAL_FILESIZE)
+ {
+ while (file->curFile + 1 >= file->numFiles)
+ extendBufFile(file);
+ file->curFile++;
+ file->curOffset = 0;
+ }
+
+ /*
+ * Determine how much we need to write into this file.
+ */
+ bytestowrite = file->nbytes - wpos;
+ availbytes = MAX_PHYSICAL_FILESIZE - file->curOffset;
+
+ if ((off_t) bytestowrite > availbytes)
+ bytestowrite = (int) availbytes;
+
+ thisfile = file->files[file->curFile];
+
+ if (track_io_timing)
+ INSTR_TIME_SET_CURRENT(io_start);
+ else
+ INSTR_TIME_SET_ZERO(io_start);
+
+ bytestowrite = FileWrite(thisfile,
+ file->buffer.data + wpos,
+ bytestowrite,
+ file->curOffset,
+ WAIT_EVENT_BUFFILE_WRITE);
+ if (bytestowrite <= 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m",
+ FilePathName(thisfile))));
+
+ if (track_io_timing)
+ {
+ INSTR_TIME_SET_CURRENT(io_time);
+ INSTR_TIME_ACCUM_DIFF(pgBufferUsage.temp_blk_write_time, io_time, io_start);
+ }
+
+ file->curOffset += bytestowrite;
+ wpos += bytestowrite;
+
+ pgBufferUsage.temp_blks_written++;
+ }
+ file->dirty = false;
+
+ /*
+ * At this point, curOffset has been advanced to the end of the buffer,
+ * ie, its original value + nbytes. We need to make it point to the
+ * logical file position, ie, original value + pos, in case that is less
+ * (as could happen due to a small backwards seek in a dirty buffer!)
+ */
+ file->curOffset -= (file->nbytes - file->pos);
+ if (file->curOffset < 0) /* handle possible segment crossing */
+ {
+ file->curFile--;
+ Assert(file->curFile >= 0);
+ file->curOffset += MAX_PHYSICAL_FILESIZE;
+ }
+
+ /*
+ * Now we can set the buffer empty without changing the logical position
+ */
+ file->pos = 0;
+ file->nbytes = 0;
+}
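
The position rewind at the end of BufFileDumpBuffer() (pull curOffset back from the end of the dumped buffer to the logical position, stepping back one segment if the result goes negative) is the subtle part, so here is a standalone sketch of the same arithmetic with made-up numbers; SEG_SIZE stands in for MAX_PHYSICAL_FILESIZE and the first statements simulate only the net effect of the write loop.

#include <assert.h>
#include <stdio.h>

#define SEG_SIZE 0x40000000L    /* stand-in for MAX_PHYSICAL_FILESIZE */

int
main(void)
{
    /* Suppose the buffer started 100 bytes before the end of segment 2 ... */
    int  curFile = 2;
    long curOffset = SEG_SIZE - 100;
    int  nbytes = 8192;         /* a full buffer was dumped */
    int  pos = 40;              /* but the logical position is only 40 bytes in */

    /* Net effect of the write loop: curOffset advanced by nbytes, crossing a segment. */
    curOffset += nbytes;
    curFile += (int) (curOffset / SEG_SIZE);
    curOffset %= SEG_SIZE;

    /* The rewind, exactly as in the code above. */
    curOffset -= (nbytes - pos);
    if (curOffset < 0)          /* handle possible segment crossing */
    {
        curFile--;
        curOffset += SEG_SIZE;
    }

    assert(curFile == 2 && curOffset == SEG_SIZE - 100 + pos);
    printf("logical position: segment %d, offset %ld\n", curFile, curOffset);
    return 0;
}
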
+
+/*
+ * BufFileRead variants
+ *
+ * Like fread() except we assume 1-byte element size and report I/O errors via
+ * ereport().
+ *
+ * If 'exact' is true, then an error is also raised if the number of bytes
+ * read is not exactly 'size' (no short reads). If 'exact' and 'eofOK' are
+ * true, then reading zero bytes is ok.
+ */
+static size_t
+BufFileReadCommon(BufFile *file, void *ptr, size_t size, bool exact, bool eofOK)
+{
+ size_t start_size = size;
+ size_t nread = 0;
+ size_t nthistime;
+
+ BufFileFlush(file);
+
+ while (size > 0)
+ {
+ if (file->pos >= file->nbytes)
+ {
+ /* Try to load more data into buffer. */
+ file->curOffset += file->pos;
+ file->pos = 0;
+ file->nbytes = 0;
+ BufFileLoadBuffer(file);
+ if (file->nbytes <= 0)
+ break; /* no more data available */
+ }
+
+ nthistime = file->nbytes - file->pos;
+ if (nthistime > size)
+ nthistime = size;
+ Assert(nthistime > 0);
+
+ memcpy(ptr, file->buffer.data + file->pos, nthistime);
+
+ file->pos += nthistime;
+ ptr = (char *) ptr + nthistime;
+ size -= nthistime;
+ nread += nthistime;
+ }
+
+ if (exact &&
+ (nread != start_size && !(nread == 0 && eofOK)))
+ ereport(ERROR,
+ errcode_for_file_access(),
+ file->name ?
+ errmsg("could not read from file set \"%s\": read only %zu of %zu bytes",
+ file->name, nread, start_size) :
+ errmsg("could not read from temporary file: read only %zu of %zu bytes",
+ nread, start_size));
+
+ return nread;
+}
+
+/*
+ * Legacy interface where the caller needs to check for end of file or short
+ * reads.
+ */
+size_t
+BufFileRead(BufFile *file, void *ptr, size_t size)
+{
+ return BufFileReadCommon(file, ptr, size, false, false);
+}
+
+/*
+ * Require read of exactly the specified size.
+ */
+void
+BufFileReadExact(BufFile *file, void *ptr, size_t size)
+{
+ BufFileReadCommon(file, ptr, size, true, false);
+}
+
+/*
+ * Require read of exactly the specified size, but optionally allow end of
+ * file (in which case 0 is returned).
+ */
+size_t
+BufFileReadMaybeEOF(BufFile *file, void *ptr, size_t size, bool eofOK)
+{
+ return BufFileReadCommon(file, ptr, size, true, eofOK);
+}
+
+/*
+ * BufFileWrite
+ *
+ * Like fwrite() except we assume 1-byte element size and report errors via
+ * ereport().
+ */
+void
+BufFileWrite(BufFile *file, const void *ptr, size_t size)
+{
+ size_t nthistime;
+
+ Assert(!file->readOnly);
+
+ while (size > 0)
+ {
+ if (file->pos >= BLCKSZ)
+ {
+ /* Buffer full, dump it out */
+ if (file->dirty)
+ BufFileDumpBuffer(file);
+ else
+ {
+ /* Hmm, went directly from reading to writing? */
+ file->curOffset += file->pos;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ }
+
+ nthistime = BLCKSZ - file->pos;
+ if (nthistime > size)
+ nthistime = size;
+ Assert(nthistime > 0);
+
+ memcpy(file->buffer.data + file->pos, ptr, nthistime);
+
+ file->dirty = true;
+ file->pos += nthistime;
+ if (file->nbytes < file->pos)
+ file->nbytes = file->pos;
+ ptr = (const char *) ptr + nthistime;
+ size -= nthistime;
+ }
+}
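
A hedged usage sketch of the read/write API in this file, assuming the usual backend environment (postgres.h and storage/buffile.h included, a live transaction and resource owner); it calls only functions defined in buffile.c and is not part of the patch.

static void
buffile_roundtrip_demo(void)
{
    BufFile    *f = BufFileCreateTemp(false);   /* deleted automatically at xact end */
    char        out[] = "hello, temp file";
    char        in[sizeof(out)];

    BufFileWrite(f, out, sizeof(out));          /* buffered; dumped to disk as needed */

    if (BufFileSeek(f, 0, 0, SEEK_SET) != 0)    /* rewind to the logical start */
        elog(ERROR, "unexpected BufFileSeek failure");

    BufFileReadExact(f, in, sizeof(in));        /* errors out on a short read */
    Assert(memcmp(in, out, sizeof(out)) == 0);

    BufFileClose(f);                            /* closes and deletes the segment(s) */
}
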
+
+/*
+ * BufFileFlush
+ *
+ * Like fflush(), except that I/O errors are reported with ereport().
+ */
+static void
+BufFileFlush(BufFile *file)
+{
+ if (file->dirty)
+ BufFileDumpBuffer(file);
+
+ Assert(!file->dirty);
+}
+
+/*
+ * BufFileSeek
+ *
+ * Like fseek(), except that target position needs two values in order to
+ * work when logical filesize exceeds maximum value representable by off_t.
+ * We do not support relative seeks across more than that, however.
+ * I/O errors are reported by ereport().
+ *
+ * Result is 0 if OK, EOF if not. Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeek(BufFile *file, int fileno, off_t offset, int whence)
+{
+ int newFile;
+ off_t newOffset;
+
+ switch (whence)
+ {
+ case SEEK_SET:
+ if (fileno < 0)
+ return EOF;
+ newFile = fileno;
+ newOffset = offset;
+ break;
+ case SEEK_CUR:
+
+ /*
+ * Relative seek considers only the signed offset, ignoring
+ * fileno. Note that large offsets (> 1 GB) risk overflow in this
+ * add, unless we have 64-bit off_t.
+ */
+ newFile = file->curFile;
+ newOffset = (file->curOffset + file->pos) + offset;
+ break;
+ case SEEK_END:
+
+ /*
+ * The file size of the last file gives us the end offset of that
+ * file.
+ */
+ newFile = file->numFiles - 1;
+ newOffset = FileSize(file->files[file->numFiles - 1]);
+ if (newOffset < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+ FilePathName(file->files[file->numFiles - 1]),
+ file->name)));
+ break;
+ default:
+ elog(ERROR, "invalid whence: %d", whence);
+ return EOF;
+ }
+ while (newOffset < 0)
+ {
+ if (--newFile < 0)
+ return EOF;
+ newOffset += MAX_PHYSICAL_FILESIZE;
+ }
+ if (newFile == file->curFile &&
+ newOffset >= file->curOffset &&
+ newOffset <= file->curOffset + file->nbytes)
+ {
+ /*
+ * Seek is to a point within existing buffer; we can just adjust
+ * pos-within-buffer, without flushing buffer. Note this is OK
+ * whether reading or writing, but buffer remains dirty if we were
+ * writing.
+ */
+ file->pos = (int) (newOffset - file->curOffset);
+ return 0;
+ }
+ /* Otherwise, must reposition buffer, so flush any dirty data */
+ BufFileFlush(file);
+
+ /*
+ * At this point and no sooner, check for seek past last segment. The
+ * above flush could have created a new segment, so checking sooner would
+ * not work (at least not with this code).
+ */
+
+ /* convert seek to "start of next seg" to "end of last seg" */
+ if (newFile == file->numFiles && newOffset == 0)
+ {
+ newFile--;
+ newOffset = MAX_PHYSICAL_FILESIZE;
+ }
+ while (newOffset > MAX_PHYSICAL_FILESIZE)
+ {
+ if (++newFile >= file->numFiles)
+ return EOF;
+ newOffset -= MAX_PHYSICAL_FILESIZE;
+ }
+ if (newFile >= file->numFiles)
+ return EOF;
+ /* Seek is OK! */
+ file->curFile = newFile;
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ return 0;
+}
+
+void
+BufFileTell(BufFile *file, int *fileno, off_t *offset)
+{
+ *fileno = file->curFile;
+ *offset = file->curOffset + file->pos;
+}
+
+/*
+ * BufFileSeekBlock --- block-oriented seek
+ *
+ * Performs absolute seek to the start of the n'th BLCKSZ-sized block of
+ * the file. Note that users of this interface will fail if their files
+ * exceed BLCKSZ * LONG_MAX bytes, but that is quite a lot; we don't work
+ * with tables bigger than that, either...
+ *
+ * Result is 0 if OK, EOF if not. Logical position is not moved if an
+ * impossible seek is attempted.
+ */
+int
+BufFileSeekBlock(BufFile *file, long blknum)
+{
+ return BufFileSeek(file,
+ (int) (blknum / BUFFILE_SEG_SIZE),
+ (off_t) (blknum % BUFFILE_SEG_SIZE) * BLCKSZ,
+ SEEK_SET);
+}
+
+#ifdef NOT_USED
+/*
+ * BufFileTellBlock --- block-oriented tell
+ *
+ * Any fractional part of a block in the current seek position is ignored.
+ */
+long
+BufFileTellBlock(BufFile *file)
+{
+ long blknum;
+
+ blknum = (file->curOffset + file->pos) / BLCKSZ;
+ blknum += file->curFile * BUFFILE_SEG_SIZE;
+ return blknum;
+}
+
+#endif
+
+/*
+ * Return the current fileset based BufFile size.
+ *
+ * Counts any holes left behind by BufFileAppend as part of the size.
+ * ereport()s on failure.
+ */
+int64
+BufFileSize(BufFile *file)
+{
+ int64 lastFileSize;
+
+ Assert(file->fileset != NULL);
+
+ /* Get the size of the last physical file. */
+ lastFileSize = FileSize(file->files[file->numFiles - 1]);
+ if (lastFileSize < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not determine size of temporary file \"%s\" from BufFile \"%s\": %m",
+ FilePathName(file->files[file->numFiles - 1]),
+ file->name)));
+
+ return ((file->numFiles - 1) * (int64) MAX_PHYSICAL_FILESIZE) +
+ lastFileSize;
+}
+
+/*
+ * Append the contents of source file (managed within fileset) to
+ * end of target file (managed within same fileset).
+ *
+ * Note that this operation subsumes ownership of the underlying resources
+ * of "source". The caller should never call BufFileClose against source
+ * after having called here. Resource owners for source and target must
+ * match, too.
+ *
+ * This operation works by manipulating lists of segment files, so the
+ * file content is always appended at a MAX_PHYSICAL_FILESIZE-aligned
+ * boundary, typically creating empty holes before the boundary. These
+ * areas do not contain any interesting data, and cannot be read by the
+ * caller.
+ *
+ * Returns the block number within target where the contents of source
+ * begins. Caller should apply this as an offset when working off block
+ * positions that are in terms of the original BufFile space.
+ */
+long
+BufFileAppend(BufFile *target, BufFile *source)
+{
+ long startBlock = target->numFiles * BUFFILE_SEG_SIZE;
+ int newNumFiles = target->numFiles + source->numFiles;
+ int i;
+
+ Assert(target->fileset != NULL);
+ Assert(source->readOnly);
+ Assert(!source->dirty);
+ Assert(source->fileset != NULL);
+
+ if (target->resowner != source->resowner)
+ elog(ERROR, "could not append BufFile with non-matching resource owner");
+
+ target->files = (File *)
+ repalloc(target->files, sizeof(File) * newNumFiles);
+ for (i = target->numFiles; i < newNumFiles; i++)
+ target->files[i] = source->files[i - target->numFiles];
+ target->numFiles = newNumFiles;
+
+ return startBlock;
+}
+
+/*
+ * Truncate a BufFile created by BufFileCreateFileSet up to the given fileno
+ * and the offset.
+ */
+void
+BufFileTruncateFileSet(BufFile *file, int fileno, off_t offset)
+{
+ int numFiles = file->numFiles;
+ int newFile = fileno;
+ off_t newOffset = file->curOffset;
+ char segment_name[MAXPGPATH];
+ int i;
+
+ /*
+ * Loop backwards over the files down to the given fileno, removing the
+ * files whose index is greater than fileno and truncating the given file
+ * to the offset. Note that we also remove the file at fileno itself if
+ * the offset is 0, provided it is not the first file, in which case we
+ * truncate it instead.
+ */
+ for (i = file->numFiles - 1; i >= fileno; i--)
+ {
+ if ((i != fileno || offset == 0) && i != 0)
+ {
+ FileSetSegmentName(segment_name, file->name, i);
+ FileClose(file->files[i]);
+ if (!FileSetDelete(file->fileset, segment_name, true))
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not delete fileset \"%s\": %m",
+ segment_name)));
+ numFiles--;
+ newOffset = MAX_PHYSICAL_FILESIZE;
+
+ /*
+ * This is required to indicate that we have deleted the given
+ * fileno.
+ */
+ if (i == fileno)
+ newFile--;
+ }
+ else
+ {
+ if (FileTruncate(file->files[i], offset,
+ WAIT_EVENT_BUFFILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(file->files[i]))));
+ newOffset = offset;
+ }
+ }
+
+ file->numFiles = numFiles;
+
+ /*
+ * If the truncate point is within the existing buffer, we can just adjust
+ * pos within the buffer.
+ */
+ if (newFile == file->curFile &&
+ newOffset >= file->curOffset &&
+ newOffset <= file->curOffset + file->nbytes)
+ {
+ /* No need to reset the current pos if the new pos is greater. */
+ if (newOffset <= file->curOffset + file->pos)
+ file->pos = (int) (newOffset - file->curOffset);
+
+ /* Adjust the nbytes for the current buffer. */
+ file->nbytes = (int) (newOffset - file->curOffset);
+ }
+ else if (newFile == file->curFile &&
+ newOffset < file->curOffset)
+ {
+ /*
+ * The truncate point is within the existing file but prior to the
+ * current position, so we can forget the current buffer and reset the
+ * current position.
+ */
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ else if (newFile < file->curFile)
+ {
+ /*
+ * The truncate point is prior to the current file, so need to reset
+ * the current position accordingly.
+ */
+ file->curFile = newFile;
+ file->curOffset = newOffset;
+ file->pos = 0;
+ file->nbytes = 0;
+ }
+ /* Nothing to do, if the truncate point is beyond current file. */
+}
diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c
new file mode 100644
index 0000000..e04bc39
--- /dev/null
+++ b/src/backend/storage/file/copydir.c
@@ -0,0 +1,216 @@
+/*-------------------------------------------------------------------------
+ *
+ * copydir.c
+ * copies a directory
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * While "xcopy /e /i /q" works fine for copying directories, on Windows XP
+ * it requires a window handle, which prevents it from working when invoked
+ * as a service.
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/copydir.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "common/file_utils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+
+/*
+ * copydir: copy a directory
+ *
+ * If recurse is false, subdirectories are ignored. Anything that's not
+ * a directory or a regular file is ignored.
+ */
+void
+copydir(const char *fromdir, const char *todir, bool recurse)
+{
+ DIR *xldir;
+ struct dirent *xlde;
+ char fromfile[MAXPGPATH * 2];
+ char tofile[MAXPGPATH * 2];
+
+ if (MakePGDirectory(todir) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create directory \"%s\": %m", todir)));
+
+ xldir = AllocateDir(fromdir);
+
+ while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+ {
+ PGFileType xlde_type;
+
+ /* If we got a cancel signal during the copy of the directory, quit */
+ CHECK_FOR_INTERRUPTS();
+
+ if (strcmp(xlde->d_name, ".") == 0 ||
+ strcmp(xlde->d_name, "..") == 0)
+ continue;
+
+ snprintf(fromfile, sizeof(fromfile), "%s/%s", fromdir, xlde->d_name);
+ snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+ xlde_type = get_dirent_type(fromfile, xlde, false, ERROR);
+
+ if (xlde_type == PGFILETYPE_DIR)
+ {
+ /* recurse to handle subdirectories */
+ if (recurse)
+ copydir(fromfile, tofile, true);
+ }
+ else if (xlde_type == PGFILETYPE_REG)
+ copy_file(fromfile, tofile);
+ }
+ FreeDir(xldir);
+
+ /*
+ * Be paranoid here and fsync all files to ensure the copy is really done.
+ * But if fsync is disabled, we're done.
+ */
+ if (!enableFsync)
+ return;
+
+ xldir = AllocateDir(todir);
+
+ while ((xlde = ReadDir(xldir, todir)) != NULL)
+ {
+ if (strcmp(xlde->d_name, ".") == 0 ||
+ strcmp(xlde->d_name, "..") == 0)
+ continue;
+
+ snprintf(tofile, sizeof(tofile), "%s/%s", todir, xlde->d_name);
+
+ /*
+ * We don't need to sync subdirectories here since the recursive
+ * copydir will do it before it returns
+ */
+ if (get_dirent_type(tofile, xlde, false, ERROR) == PGFILETYPE_REG)
+ fsync_fname(tofile, false);
+ }
+ FreeDir(xldir);
+
+ /*
+ * It's important to fsync the destination directory itself as individual
+ * file fsyncs don't guarantee that the directory entry for the file is
+ * synced. Recent versions of ext4 have made the window much wider but
+ * it's been true for ext3 and other filesystems in the past.
+ */
+ fsync_fname(todir, true);
+}
+
+/*
+ * copy one file
+ */
+void
+copy_file(const char *fromfile, const char *tofile)
+{
+ char *buffer;
+ int srcfd;
+ int dstfd;
+ int nbytes;
+ off_t offset;
+ off_t flush_offset;
+
+ /* Size of copy buffer (read and write requests) */
+#define COPY_BUF_SIZE (8 * BLCKSZ)
+
+ /*
+ * Size of data flush requests. It seems beneficial on most platforms to
+ * do this every 1MB or so. But macOS, at least with early releases of
+ * APFS, is really unfriendly to small mmap/msync requests, so there we do
+ * it only every 32MB.
+ */
+#if defined(__darwin__)
+#define FLUSH_DISTANCE (32 * 1024 * 1024)
+#else
+#define FLUSH_DISTANCE (1024 * 1024)
+#endif
+
+ /* Use palloc to ensure we get a maxaligned buffer */
+ buffer = palloc(COPY_BUF_SIZE);
+
+ /*
+ * Open the files
+ */
+ srcfd = OpenTransientFile(fromfile, O_RDONLY | PG_BINARY);
+ if (srcfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fromfile)));
+
+ dstfd = OpenTransientFile(tofile, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ if (dstfd < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", tofile)));
+
+ /*
+ * Do the data copying.
+ */
+ flush_offset = 0;
+ for (offset = 0;; offset += nbytes)
+ {
+ /* If we got a cancel signal during the copy of the file, quit */
+ CHECK_FOR_INTERRUPTS();
+
+ /*
+ * We fsync the files later, but during the copy, flush them every so
+ * often to avoid spamming the cache and hopefully get the kernel to
+ * start writing them out before the fsync comes.
+ */
+ if (offset - flush_offset >= FLUSH_DISTANCE)
+ {
+ pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+ flush_offset = offset;
+ }
+
+ pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_READ);
+ nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
+ pgstat_report_wait_end();
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read file \"%s\": %m", fromfile)));
+ if (nbytes == 0)
+ break;
+ errno = 0;
+ pgstat_report_wait_start(WAIT_EVENT_COPY_FILE_WRITE);
+ if ((int) write(dstfd, buffer, nbytes) != nbytes)
+ {
+ /* if write didn't set errno, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write to file \"%s\": %m", tofile)));
+ }
+ pgstat_report_wait_end();
+ }
+
+ if (offset > flush_offset)
+ pg_flush_data(dstfd, flush_offset, offset - flush_offset);
+
+ if (CloseTransientFile(dstfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", tofile)));
+
+ if (CloseTransientFile(srcfd) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fromfile)));
+
+ pfree(buffer);
+}
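
The flush-every-FLUSH_DISTANCE bookkeeping in copy_file() can be shown standalone; flush_hint() below is a hypothetical stand-in for pg_flush_data(), and the actual read/write step is elided since only the offset tracking is of interest here.

#include <stdio.h>

#define COPY_BUF_DEMO        (8 * 8192)
#define FLUSH_DISTANCE_DEMO  (1024 * 1024)

static void
flush_hint(long offset, long nbytes)        /* stand-in for pg_flush_data() */
{
    printf("flush hint: offset %ld, %ld bytes\n", offset, nbytes);
}

int
main(void)
{
    long total = 5 * 1024 * 1024 + 1234;    /* pretend this is the source file size */
    long offset = 0;
    long flush_offset = 0;

    while (offset < total)
    {
        long nbytes = (total - offset < COPY_BUF_DEMO) ? total - offset : COPY_BUF_DEMO;

        /* ... read nbytes from the source and write them to the target ... */

        if (offset - flush_offset >= FLUSH_DISTANCE_DEMO)
        {
            flush_hint(flush_offset, offset - flush_offset);
            flush_offset = offset;
        }
        offset += nbytes;
    }
    if (offset > flush_offset)              /* final partial flush, as in copy_file() */
        flush_hint(flush_offset, offset - flush_offset);
    return 0;
}
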
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
new file mode 100644
index 0000000..16b3e8f
--- /dev/null
+++ b/src/backend/storage/file/fd.c
@@ -0,0 +1,3976 @@
+/*-------------------------------------------------------------------------
+ *
+ * fd.c
+ * Virtual file descriptor code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/fd.c
+ *
+ * NOTES:
+ *
+ * This code manages a cache of 'virtual' file descriptors (VFDs).
+ * The server opens many file descriptors for a variety of reasons,
+ * including base tables, scratch files (e.g., sort and hash spool
+ * files), and random calls to C library routines like system(3); it
+ * is quite easy to exceed system limits on the number of open files a
+ * single process can have. (This is around 1024 on many modern
+ * operating systems, but may be lower on others.)
+ *
+ * VFDs are managed as an LRU pool, with actual OS file descriptors
+ * being opened and closed as needed. Obviously, if a file is opened
+ * using these interfaces, all subsequent operations on it must also go
+ * through these interfaces (the File type is not a real file
+ * descriptor).
+ *
+ * For this scheme to work, most (if not all) routines throughout the
+ * server should use these interfaces instead of calling the C library
+ * routines (e.g., open(2) and fopen(3)) themselves. Otherwise, we
+ * may find ourselves short of real file descriptors anyway.
+ *
+ * INTERFACE ROUTINES
+ *
+ * PathNameOpenFile and OpenTemporaryFile are used to open virtual files.
+ * A File opened with OpenTemporaryFile is automatically deleted when the
+ * File is closed, either explicitly or implicitly at end of transaction or
+ * process exit. PathNameOpenFile is intended for files that are held open
+ * for a long time, like relation files. It is the caller's responsibility
+ * to close them, there is no automatic mechanism in fd.c for that.
+ *
+ * PathName(Create|Open|Delete)Temporary(File|Dir) are used to manage
+ * temporary files that have names so that they can be shared between
+ * backends. Such files are automatically closed and count against the
+ * temporary file limit of the backend that creates them, but unlike anonymous
+ * files they are not automatically deleted. See sharedfileset.c for a shared
+ * ownership mechanism that provides automatic cleanup for shared files when
+ * the last of a group of backends detaches.
+ *
+ * AllocateFile, AllocateDir, OpenPipeStream and OpenTransientFile are
+ * wrappers around fopen(3), opendir(3), popen(3) and open(2), respectively.
+ * They behave like the corresponding native functions, except that the handle
+ * is registered with the current subtransaction, and will be automatically
+ * closed at abort. These are intended mainly for short operations like
+ * reading a configuration file; there is a limit on the number of files that
+ * can be opened using these functions at any one time.
+ *
+ * Finally, BasicOpenFile is just a thin wrapper around open() that can
+ * release file descriptors in use by the virtual file descriptors if
+ * necessary. There is no automatic cleanup of file descriptors returned by
+ * BasicOpenFile, it is solely the caller's responsibility to close the file
+ * descriptor by calling close(2).
+ *
+ * If a non-virtual file descriptor needs to be held open for any length of
+ * time, report it to fd.c by calling AcquireExternalFD or ReserveExternalFD
+ * (and eventually ReleaseExternalFD), so that we can take it into account
+ * while deciding how many VFDs can be open. This applies to FDs obtained
+ * with BasicOpenFile as well as those obtained without use of any fd.c API.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <dirent.h>
+#include <sys/file.h>
+#include <sys/param.h>
+#include <sys/resource.h> /* for getrlimit */
+#include <sys/stat.h>
+#include <sys/types.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <limits.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/pg_tablespace.h"
+#include "common/file_perm.h"
+#include "common/file_utils.h"
+#include "common/pg_prng.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/mem.h"
+#include "postmaster/startup.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "utils/guc.h"
+#include "utils/guc_hooks.h"
+#include "utils/resowner_private.h"
+#include "utils/varlena.h"
+
+/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
+#if defined(HAVE_SYNC_FILE_RANGE)
+#define PG_FLUSH_DATA_WORKS 1
+#elif !defined(WIN32) && defined(MS_ASYNC)
+#define PG_FLUSH_DATA_WORKS 1
+#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+#define PG_FLUSH_DATA_WORKS 1
+#endif
+
+/*
+ * We must leave some file descriptors free for system(), the dynamic loader,
+ * and other code that tries to open files without consulting fd.c. This
+ * is the number left free. (While we try fairly hard to prevent EMFILE
+ * errors, there's never any guarantee that we won't get ENFILE due to
+ * other processes chewing up FDs. So it's a bad idea to try to open files
+ * without consulting fd.c. Nonetheless we cannot control all code.)
+ *
+ * Because this is just a fixed setting, we are effectively assuming that
+ * no such code will leave FDs open over the long term; otherwise the slop
+ * is likely to be insufficient. Note in particular that we expect that
+ * loading a shared library does not result in any permanent increase in
+ * the number of open files. (This appears to be true on most if not
+ * all platforms as of Feb 2004.)
+ */
+#define NUM_RESERVED_FDS 10
+
+/*
+ * If we have fewer than this many usable FDs after allowing for the reserved
+ * ones, choke. (This value is chosen to work with "ulimit -n 64", but not
+ * much less than that. Note that this value ensures numExternalFDs can be
+ * at least 16; as of this writing, the contrib/postgres_fdw regression tests
+ * will not pass unless that can grow to at least 14.)
+ */
+#define FD_MINFREE 48
+
+/*
+ * A number of platforms allow individual processes to open many more files
+ * than they can really support when *many* processes do the same thing.
+ * This GUC parameter lets the DBA limit max_safe_fds to something less than
+ * what the postmaster's initial probe suggests will work.
+ */
+int max_files_per_process = 1000;
+
+/*
+ * Maximum number of file descriptors to open for operations that fd.c knows
+ * about (VFDs, AllocateFile etc, or "external" FDs). This is initialized
+ * to a conservative value, and remains that way indefinitely in bootstrap or
+ * standalone-backend cases. In normal postmaster operation, the postmaster
+ * calls set_max_safe_fds() late in initialization to update the value, and
+ * that value is then inherited by forked subprocesses.
+ *
+ * Note: the value of max_files_per_process is taken into account while
+ * setting this variable, and so need not be tested separately.
+ */
+int max_safe_fds = FD_MINFREE; /* default if not changed */
+
+/* Whether it is safe to continue running after fsync() fails. */
+bool data_sync_retry = false;
+
+/* How SyncDataDirectory() should do its job. */
+int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+
+/* Which kinds of files should be opened with PG_O_DIRECT. */
+int io_direct_flags;
+
+/* Debugging.... */
+
+#ifdef FDDEBUG
+#define DO_DB(A) \
+ do { \
+ int _do_db_save_errno = errno; \
+ A; \
+ errno = _do_db_save_errno; \
+ } while (0)
+#else
+#define DO_DB(A) \
+ ((void) 0)
+#endif
+
+#define VFD_CLOSED (-1)
+
+#define FileIsValid(file) \
+ ((file) > 0 && (file) < (int) SizeVfdCache && VfdCache[file].fileName != NULL)
+
+#define FileIsNotOpen(file) (VfdCache[file].fd == VFD_CLOSED)
+
+/* these are the assigned bits in fdstate below: */
+#define FD_DELETE_AT_CLOSE (1 << 0) /* T = delete when closed */
+#define FD_CLOSE_AT_EOXACT (1 << 1) /* T = close at eoXact */
+#define FD_TEMP_FILE_LIMIT (1 << 2) /* T = respect temp_file_limit */
+
+typedef struct vfd
+{
+ int fd; /* current FD, or VFD_CLOSED if none */
+ unsigned short fdstate; /* bitflags for VFD's state */
+ ResourceOwner resowner; /* owner, for automatic cleanup */
+ File nextFree; /* link to next free VFD, if in freelist */
+ File lruMoreRecently; /* doubly linked recency-of-use list */
+ File lruLessRecently;
+ off_t fileSize; /* current size of file (0 if not temporary) */
+ char *fileName; /* name of file, or NULL for unused VFD */
+ /* NB: fileName is malloc'd, and must be free'd when closing the VFD */
+ int fileFlags; /* open(2) flags for (re)opening the file */
+ mode_t fileMode; /* mode to pass to open(2) */
+} Vfd;
+
+/*
+ * Virtual File Descriptor array pointer and size. This grows as
+ * needed. 'File' values are indexes into this array.
+ * Note that VfdCache[0] is not a usable VFD, just a list header.
+ */
+static Vfd *VfdCache;
+static Size SizeVfdCache = 0;
+
+/*
+ * Number of file descriptors known to be in use by VFD entries.
+ */
+static int nfile = 0;
+
+/*
+ * Flag to tell whether it's worth scanning VfdCache looking for temp files
+ * to close
+ */
+static bool have_xact_temporary_files = false;
+
+/*
+ * Tracks the total size of all temporary files. Note: when temp_file_limit
+ * is being enforced, this cannot overflow since the limit cannot be more
+ * than INT_MAX kilobytes. When not enforcing, it could theoretically
+ * overflow, but we don't care.
+ */
+static uint64 temporary_files_size = 0;
+
+/* Temporary file access initialized and not yet shut down? */
+#ifdef USE_ASSERT_CHECKING
+static bool temporary_files_allowed = false;
+#endif
+
+/*
+ * List of OS handles opened with AllocateFile, AllocateDir and
+ * OpenTransientFile.
+ */
+typedef enum
+{
+ AllocateDescFile,
+ AllocateDescPipe,
+ AllocateDescDir,
+ AllocateDescRawFD
+} AllocateDescKind;
+
+typedef struct
+{
+ AllocateDescKind kind;
+ SubTransactionId create_subid;
+ union
+ {
+ FILE *file;
+ DIR *dir;
+ int fd;
+ } desc;
+} AllocateDesc;
+
+static int numAllocatedDescs = 0;
+static int maxAllocatedDescs = 0;
+static AllocateDesc *allocatedDescs = NULL;
+
+/*
+ * Number of open "external" FDs reported to Reserve/ReleaseExternalFD.
+ */
+static int numExternalFDs = 0;
+
+/*
+ * Number of temporary files opened during the current session;
+ * this is used in generation of tempfile names.
+ */
+static long tempFileCounter = 0;
+
+/*
+ * Array of OIDs of temp tablespaces. (Some entries may be InvalidOid,
+ * indicating that the current database's default tablespace should be used.)
+ * When numTempTableSpaces is -1, this has not been set in the current
+ * transaction.
+ */
+static Oid *tempTableSpaces = NULL;
+static int numTempTableSpaces = -1;
+static int nextTempTableSpace = 0;
+
+
+/*--------------------
+ *
+ * Private Routines
+ *
+ * Delete - delete a file from the Lru ring
+ * LruDelete - remove a file from the Lru ring and close its FD
+ * Insert - put a file at the front of the Lru ring
+ * LruInsert - put a file at the front of the Lru ring and open it
+ * ReleaseLruFile - Release an fd by closing the last entry in the Lru ring
+ * ReleaseLruFiles - Release fd(s) until we're under the max_safe_fds limit
+ * AllocateVfd - grab a free (or new) file record (from VfdCache)
+ * FreeVfd - free a file record
+ *
+ * The Least Recently Used ring is a doubly linked list that begins and
+ * ends on element zero. Element zero is special -- it doesn't represent
+ * a file and its "fd" field always == VFD_CLOSED. Element zero is just an
+ * anchor that shows us the beginning/end of the ring.
+ * Only VFD elements that are currently really open (have an FD assigned) are
+ * in the Lru ring. Elements that are "virtually" open can be recognized
+ * by having a non-null fileName field.
+ *
+ * example:
+ *
+ * /--less----\ /---------\
+ * v \ v \
+ * #0 --more---> LeastRecentlyUsed --more-\ \
+ * ^\ | |
+ * \\less--> MostRecentlyUsedFile <---/ |
+ * \more---/ \--less--/
+ *
+ *--------------------
+ */
+static void Delete(File file);
+static void LruDelete(File file);
+static void Insert(File file);
+static int LruInsert(File file);
+static bool ReleaseLruFile(void);
+static void ReleaseLruFiles(void);
+static File AllocateVfd(void);
+static void FreeVfd(File file);
+
+static int FileAccess(File file);
+static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError);
+static bool reserveAllocatedDesc(void);
+static int FreeDesc(AllocateDesc *desc);
+
+static void BeforeShmemExit_Files(int code, Datum arg);
+static void CleanupTempFiles(bool isCommit, bool isProcExit);
+static void RemovePgTempRelationFiles(const char *tsdirname);
+static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname);
+
+static void walkdir(const char *path,
+ void (*action) (const char *fname, bool isdir, int elevel),
+ bool process_symlinks,
+ int elevel);
+#ifdef PG_FLUSH_DATA_WORKS
+static void pre_sync_fname(const char *fname, bool isdir, int elevel);
+#endif
+static void datadir_fsync_fname(const char *fname, bool isdir, int elevel);
+static void unlink_if_exists_fname(const char *fname, bool isdir, int elevel);
+
+static int fsync_parent_path(const char *fname, int elevel);
+
+
+/*
+ * pg_fsync --- do fsync with or without writethrough
+ */
+int
+pg_fsync(int fd)
+{
+#if !defined(WIN32) && defined(USE_ASSERT_CHECKING)
+ struct stat st;
+
+ /*
+ * Some operating system implementations of fsync() have requirements
+ * about the file access modes that were used when their file descriptor
+ * argument was opened, and these requirements differ depending on whether
+ * the file descriptor is for a directory.
+ *
+ * For any file descriptor that may eventually be handed to fsync(), we
+ * should have opened it with access modes that are compatible with
+ * fsync() on all supported systems, otherwise the code may not be
+ * portable, even if it runs ok on the current system.
+ *
+ * We assert here that a descriptor for a file was opened with write
+ * permissions (either O_RDWR or O_WRONLY) and for a directory without
+ * write permissions (O_RDONLY).
+ *
+ * Ignore any fstat errors and let the follow-up fsync() do its work.
+	 * Doing the check here, rather than relying on fsync() itself to fail,
+	 * also covers the case where fsync() is disabled.
+ */
+ if (fstat(fd, &st) == 0)
+ {
+ int desc_flags = fcntl(fd, F_GETFL);
+
+ /*
+ * O_RDONLY is historically 0, so just make sure that for directories
+ * no write flags are used.
+ */
+ if (S_ISDIR(st.st_mode))
+ Assert((desc_flags & (O_RDWR | O_WRONLY)) == 0);
+ else
+ Assert((desc_flags & (O_RDWR | O_WRONLY)) != 0);
+ }
+ errno = 0;
+#endif
+
+ /* #if is to skip the sync_method test if there's no need for it */
+#if defined(HAVE_FSYNC_WRITETHROUGH) && !defined(FSYNC_WRITETHROUGH_IS_FSYNC)
+ if (sync_method == SYNC_METHOD_FSYNC_WRITETHROUGH)
+ return pg_fsync_writethrough(fd);
+ else
+#endif
+ return pg_fsync_no_writethrough(fd);
+}
+
+
+/*
+ * pg_fsync_no_writethrough --- same as fsync except does nothing if
+ * enableFsync is off
+ */
+int
+pg_fsync_no_writethrough(int fd)
+{
+ int rc;
+
+ if (!enableFsync)
+ return 0;
+
+retry:
+ rc = fsync(fd);
+
+ if (rc == -1 && errno == EINTR)
+ goto retry;
+
+ return rc;
+}
+
+/*
+ * pg_fsync_writethrough
+ */
+int
+pg_fsync_writethrough(int fd)
+{
+ if (enableFsync)
+ {
+#ifdef WIN32
+ return _commit(fd);
+#elif defined(F_FULLFSYNC)
+ return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
+#else
+ errno = ENOSYS;
+ return -1;
+#endif
+ }
+ else
+ return 0;
+}
+
+/*
+ * pg_fdatasync --- same as fdatasync except does nothing if enableFsync is off
+ */
+int
+pg_fdatasync(int fd)
+{
+ int rc;
+
+ if (!enableFsync)
+ return 0;
+
+retry:
+ rc = fdatasync(fd);
+
+ if (rc == -1 && errno == EINTR)
+ goto retry;
+
+ return rc;
+}
+
+/*
+ * pg_flush_data --- advise OS that the described dirty data should be flushed
+ *
+ * offset of 0 with nbytes 0 means that the entire file should be flushed
+ */
+void
+pg_flush_data(int fd, off_t offset, off_t nbytes)
+{
+ /*
+	 * Right now file flushing is primarily used to reduce the impact of
+	 * later fsync()/fdatasync() calls.  Thus don't trigger flushes if fsyncs
+	 * are disabled - that's a decision we might want to make configurable at
+	 * some point.
+ */
+ if (!enableFsync)
+ return;
+
+ /*
+ * We compile all alternatives that are supported on the current platform,
+ * to find portability problems more easily.
+ */
+#if defined(HAVE_SYNC_FILE_RANGE)
+ {
+ int rc;
+ static bool not_implemented_by_kernel = false;
+
+ if (not_implemented_by_kernel)
+ return;
+
+retry:
+
+ /*
+ * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific,
+ * tells the OS that writeback for the specified blocks should be
+ * started, but that we don't want to wait for completion. Note that
+ * this call might block if too much dirty data exists in the range.
+ * This is the preferable method on OSs supporting it, as it works
+		 * reliably when available (in contrast to msync()) and doesn't flush
+		 * out clean data (as FADV_DONTNEED does).
+ */
+ rc = sync_file_range(fd, offset, nbytes,
+ SYNC_FILE_RANGE_WRITE);
+ if (rc != 0)
+ {
+ int elevel;
+
+			if (errno == EINTR)
+ goto retry;
+
+ /*
+ * For systems that don't have an implementation of
+ * sync_file_range() such as Windows WSL, generate only one
+ * warning and then suppress all further attempts by this process.
+ */
+ if (errno == ENOSYS)
+ {
+ elevel = WARNING;
+ not_implemented_by_kernel = true;
+ }
+ else
+ elevel = data_sync_elevel(WARNING);
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ }
+
+ return;
+ }
+#endif
+#if !defined(WIN32) && defined(MS_ASYNC)
+ {
+ void *p;
+ static int pagesize = 0;
+
+ /*
+ * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers
+ * writeback. On linux it only does so if MS_SYNC is specified, but
+ * then it does the writeback synchronously. Luckily all common linux
+ * systems have sync_file_range(). This is preferable over
+ * FADV_DONTNEED because it doesn't flush out clean data.
+ *
+ * We map the file (mmap()), tell the kernel to sync back the contents
+ * (msync()), and then remove the mapping again (munmap()).
+ */
+
+ /* mmap() needs actual length if we want to map whole file */
+ if (offset == 0 && nbytes == 0)
+ {
+ nbytes = lseek(fd, 0, SEEK_END);
+ if (nbytes < 0)
+ {
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not determine dirty data size: %m")));
+ return;
+ }
+ }
+
+ /*
+ * Some platforms reject partial-page mmap() attempts. To deal with
+ * that, just truncate the request to a page boundary. If any extra
+ * bytes don't get flushed, well, it's only a hint anyway.
+ */
+
+ /* fetch pagesize only once */
+ if (pagesize == 0)
+ pagesize = sysconf(_SC_PAGESIZE);
+
+ /* align length to pagesize, dropping any fractional page */
+ if (pagesize > 0)
+ nbytes = (nbytes / pagesize) * pagesize;
+
+ /* fractional-page request is a no-op */
+ if (nbytes <= 0)
+ return;
+
+ /*
+ * mmap could well fail, particularly on 32-bit platforms where there
+ * may simply not be enough address space. If so, silently fall
+ * through to the next implementation.
+ */
+ if (nbytes <= (off_t) SSIZE_MAX)
+ p = mmap(NULL, nbytes, PROT_READ, MAP_SHARED, fd, offset);
+ else
+ p = MAP_FAILED;
+
+ if (p != MAP_FAILED)
+ {
+ int rc;
+
+ rc = msync(p, (size_t) nbytes, MS_ASYNC);
+ if (rc != 0)
+ {
+ ereport(data_sync_elevel(WARNING),
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ /* NB: need to fall through to munmap()! */
+ }
+
+ rc = munmap(p, (size_t) nbytes);
+ if (rc != 0)
+ {
+ /* FATAL error because mapping would remain */
+ ereport(FATAL,
+ (errcode_for_file_access(),
+ errmsg("could not munmap() while flushing data: %m")));
+ }
+
+ return;
+ }
+ }
+#endif
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+ {
+ int rc;
+
+ /*
+		 * Signal the kernel that the passed-in range should not be cached
+		 * anymore.  This has the desired side effect of writing out dirty
+		 * data, and the undesired side effect of likely discarding useful
+		 * clean cached blocks.  For the latter reason this is the least
+		 * preferable method.
+ */
+
+ rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED);
+
+ if (rc != 0)
+ {
+ /* don't error out, this is just a performance optimization */
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not flush dirty data: %m")));
+ }
+
+ return;
+ }
+#endif
+}
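+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of the intended calling pattern for pg_flush_data(): write
+ * with a plain kernel FD, hint the kernel to start writeback, and only later
+ * pay for a full pg_fsync().  This is a hypothetical caller kept purely for
+ * illustration; real callers (checkpointer, bulk writes) drive this through
+ * higher-level machinery.
+ */
+static void
+example_write_hint_then_sync(int fd, const char *buf, size_t len)
+{
+	/* write the data */
+	if (pg_pwrite(fd, buf, len, 0) != (ssize_t) len)
+		elog(ERROR, "could not write data: %m");
+
+	/* ask the OS to start writeback now, without waiting for it */
+	pg_flush_data(fd, 0, (off_t) len);
+
+	/* ... do other work while writeback proceeds ... */
+
+	/* later, make the data durable */
+	if (pg_fsync(fd) != 0)
+		elog(ERROR, "could not fsync data: %m");
+}
+#endif							/* NOT_USED */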
+
+/*
+ * Truncate an open file to a given length.
+ */
+static int
+pg_ftruncate(int fd, off_t length)
+{
+ int ret;
+
+retry:
+ ret = ftruncate(fd, length);
+
+ if (ret == -1 && errno == EINTR)
+ goto retry;
+
+ return ret;
+}
+
+/*
+ * Truncate a file to a given length by name.
+ */
+int
+pg_truncate(const char *path, off_t length)
+{
+ int ret;
+#ifdef WIN32
+ int save_errno;
+ int fd;
+
+ fd = OpenTransientFile(path, O_RDWR | PG_BINARY);
+ if (fd >= 0)
+ {
+ ret = pg_ftruncate(fd, length);
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+ }
+ else
+ ret = -1;
+#else
+
+retry:
+ ret = truncate(path, length);
+
+ if (ret == -1 && errno == EINTR)
+ goto retry;
+#endif
+
+ return ret;
+}
+
+/*
+ * fsync_fname -- fsync a file or directory, handling errors properly
+ *
+ * Try to fsync a file or directory. When doing the latter, ignore errors that
+ * indicate the OS just doesn't allow/require fsyncing directories.
+ */
+void
+fsync_fname(const char *fname, bool isdir)
+{
+ fsync_fname_ext(fname, isdir, false, data_sync_elevel(ERROR));
+}
+
+/*
+ * durable_rename -- rename(2) wrapper, issuing fsyncs required for durability
+ *
+ * This routine ensures that, after returning, the effect of renaming the file
+ * persists in case of a crash. A crash while this routine is running will
+ * leave you with either the pre-existing or the moved file in place of the
+ * new file; no mixed state or truncated files are possible.
+ *
+ * It does so by using fsync on the old filename and the possibly existing
+ * target filename before the rename, and the target file and directory after.
+ *
+ * Note that rename() cannot be used across arbitrary directories, as they
+ * might not be on the same filesystem. Therefore this routine does not
+ * support renaming across directories.
+ *
+ * Log errors with the caller-specified severity.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_rename(const char *oldfile, const char *newfile, int elevel)
+{
+ int fd;
+
+ /*
+ * First fsync the old and target path (if it exists), to ensure that they
+ * are properly persistent on disk. Syncing the target file is not
+	 * strictly necessary, but it makes it easier to reason about crashes,
+	 * because it's then guaranteed that either the source or the target file
+	 * exists after a crash.
+ */
+ if (fsync_fname_ext(oldfile, false, false, elevel) != 0)
+ return -1;
+
+ fd = OpenTransientFile(newfile, PG_BINARY | O_RDWR);
+ if (fd < 0)
+ {
+ if (errno != ENOENT)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
+ else
+ {
+ if (pg_fsync(fd) != 0)
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", newfile)));
+ return -1;
+ }
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", newfile)));
+ return -1;
+ }
+ }
+
+ /* Time to do the real deal... */
+ if (rename(oldfile, newfile) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not rename file \"%s\" to \"%s\": %m",
+ oldfile, newfile)));
+ return -1;
+ }
+
+ /*
+ * To guarantee renaming the file is persistent, fsync the file with its
+ * new name, and its containing directory.
+ */
+ if (fsync_fname_ext(newfile, false, false, elevel) != 0)
+ return -1;
+
+ if (fsync_parent_path(newfile, elevel) != 0)
+ return -1;
+
+ return 0;
+}
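+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of replacing a file crash-safely with durable_rename().
+ * The file names are placeholders; the point is that the new contents are
+ * written under a sibling temporary name first, so that after a crash either
+ * the complete old file or the complete new file is found.
+ */
+static void
+example_durable_replace(const char *contents, size_t len)
+{
+	int			fd;
+
+	fd = OpenTransientFile("myfile.tmp", O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+	if (fd < 0)
+		elog(ERROR, "could not create file: %m");
+	if (write(fd, contents, len) != (ssize_t) len)
+		elog(ERROR, "could not write file: %m");
+	if (CloseTransientFile(fd) != 0)
+		elog(ERROR, "could not close file: %m");
+
+	/*
+	 * durable_rename() fsyncs both names and the parent directory; with
+	 * elevel ERROR it throws rather than returning -1.
+	 */
+	(void) durable_rename("myfile.tmp", "myfile", ERROR);
+}
+#endif							/* NOT_USED */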
+
+/*
+ * durable_unlink -- remove a file in a durable manner
+ *
+ * This routine ensures that, after returning, the effect of removing the file
+ * persists in case of a crash.  A crash while this routine is running will
+ * not leave the system in a mixed state.
+ *
+ * It does so by using fsync on the parent directory of the file after the
+ * actual removal is done.
+ *
+ * Log errors with the severity specified by the caller.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise. Note that errno is not
+ * valid upon return.
+ */
+int
+durable_unlink(const char *fname, int elevel)
+{
+ if (unlink(fname) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ fname)));
+ return -1;
+ }
+
+ /*
+ * To guarantee that the removal of the file is persistent, fsync its
+ * parent directory.
+ */
+ if (fsync_parent_path(fname, elevel) != 0)
+ return -1;
+
+ return 0;
+}
+
+/*
+ * InitFileAccess --- initialize this module during backend startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ *
+ * Note that this does not initialize temporary file access, that is
+ * separately initialized via InitTemporaryFileAccess().
+ */
+void
+InitFileAccess(void)
+{
+ Assert(SizeVfdCache == 0); /* call me only once */
+
+ /* initialize cache header entry */
+ VfdCache = (Vfd *) malloc(sizeof(Vfd));
+ if (VfdCache == NULL)
+ ereport(FATAL,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ MemSet((char *) &(VfdCache[0]), 0, sizeof(Vfd));
+ VfdCache->fd = VFD_CLOSED;
+
+ SizeVfdCache = 1;
+}
+
+/*
+ * InitTemporaryFileAccess --- initialize temporary file access during startup
+ *
+ * This is called during either normal or standalone backend start.
+ * It is *not* called in the postmaster.
+ *
+ * This is separate from InitFileAccess() because temporary file cleanup can
+ * cause pgstat reporting. As pgstat is shut down during before_shmem_exit(),
+ * our reporting has to happen before that. Low level file access should be
+ * available for longer, hence the separate initialization / shutdown of
+ * temporary file handling.
+ */
+void
+InitTemporaryFileAccess(void)
+{
+ Assert(SizeVfdCache != 0); /* InitFileAccess() needs to have run */
+ Assert(!temporary_files_allowed); /* call me only once */
+
+ /*
+ * Register before-shmem-exit hook to ensure temp files are dropped while
+ * we can still report stats.
+ */
+ before_shmem_exit(BeforeShmemExit_Files, 0);
+
+#ifdef USE_ASSERT_CHECKING
+ temporary_files_allowed = true;
+#endif
+}
+
+/*
+ * count_usable_fds --- count how many FDs the system will let us open,
+ * and estimate how many are already open.
+ *
+ * We stop counting if usable_fds reaches max_to_probe. Note: a small
+ * value of max_to_probe might result in an underestimate of already_open;
+ * we must fill in any "gaps" in the set of used FDs before the calculation
+ * of already_open will give the right answer. In practice, max_to_probe
+ * of a couple of dozen should be enough to ensure good results.
+ *
+ * We assume stderr (FD 2) is available for dup'ing. While the calling
+ * script could theoretically close that, it would be a really bad idea,
+ * since then one risks loss of error messages from, e.g., libc.
+ */
+static void
+count_usable_fds(int max_to_probe, int *usable_fds, int *already_open)
+{
+ int *fd;
+ int size;
+ int used = 0;
+ int highestfd = 0;
+ int j;
+
+#ifdef HAVE_GETRLIMIT
+ struct rlimit rlim;
+ int getrlimit_status;
+#endif
+
+ size = 1024;
+ fd = (int *) palloc(size * sizeof(int));
+
+#ifdef HAVE_GETRLIMIT
+ getrlimit_status = getrlimit(RLIMIT_NOFILE, &rlim);
+ if (getrlimit_status != 0)
+ ereport(WARNING, (errmsg("getrlimit failed: %m")));
+#endif /* HAVE_GETRLIMIT */
+
+ /* dup until failure or probe limit reached */
+ for (;;)
+ {
+ int thisfd;
+
+#ifdef HAVE_GETRLIMIT
+
+ /*
+ * don't go beyond RLIMIT_NOFILE; causes irritating kernel logs on
+ * some platforms
+ */
+ if (getrlimit_status == 0 && highestfd >= rlim.rlim_cur - 1)
+ break;
+#endif
+
+ thisfd = dup(2);
+ if (thisfd < 0)
+ {
+ /* Expect EMFILE or ENFILE, else it's fishy */
+ if (errno != EMFILE && errno != ENFILE)
+ elog(WARNING, "duplicating stderr file descriptor failed after %d successes: %m", used);
+ break;
+ }
+
+ if (used >= size)
+ {
+ size *= 2;
+ fd = (int *) repalloc(fd, size * sizeof(int));
+ }
+ fd[used++] = thisfd;
+
+ if (highestfd < thisfd)
+ highestfd = thisfd;
+
+ if (used >= max_to_probe)
+ break;
+ }
+
+ /* release the files we opened */
+ for (j = 0; j < used; j++)
+ close(fd[j]);
+
+ pfree(fd);
+
+ /*
+ * Return results. usable_fds is just the number of successful dups. We
+ * assume that the system limit is highestfd+1 (remember 0 is a legal FD
+ * number) and so already_open is highestfd+1 - usable_fds.
+ */
+ *usable_fds = used;
+ *already_open = highestfd + 1 - used;
+}
+
+/*
+ * set_max_safe_fds
+ * Determine number of file descriptors that fd.c is allowed to use
+ */
+void
+set_max_safe_fds(void)
+{
+ int usable_fds;
+ int already_open;
+
+ /*----------
+ * We want to set max_safe_fds to
+ * MIN(usable_fds, max_files_per_process - already_open)
+ * less the slop factor for files that are opened without consulting
+ * fd.c. This ensures that we won't exceed either max_files_per_process
+ * or the experimentally-determined EMFILE limit.
+ *----------
+ */
+ count_usable_fds(max_files_per_process,
+ &usable_fds, &already_open);
+
+ max_safe_fds = Min(usable_fds, max_files_per_process - already_open);
+
+ /*
+ * Take off the FDs reserved for system() etc.
+ */
+ max_safe_fds -= NUM_RESERVED_FDS;
+
+ /*
+ * Make sure we still have enough to get by.
+ */
+ if (max_safe_fds < FD_MINFREE)
+ ereport(FATAL,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("insufficient file descriptors available to start server process"),
+ errdetail("System allows %d, server needs at least %d.",
+ max_safe_fds + NUM_RESERVED_FDS,
+ FD_MINFREE + NUM_RESERVED_FDS)));
+
+ elog(DEBUG2, "max_safe_fds = %d, usable_fds = %d, already_open = %d",
+ max_safe_fds, usable_fds, already_open);
+}
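+
+/*
+ * Worked example (illustrative numbers): with "ulimit -n 1024" and
+ * max_files_per_process = 1000, count_usable_fds() might report
+ * usable_fds = 970 and already_open = 30.  Then
+ *		max_safe_fds = Min(970, 1000 - 30) - NUM_RESERVED_FDS = 960
+ * which comfortably exceeds FD_MINFREE, so startup proceeds.
+ */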
+
+/*
+ * Open a file with BasicOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+int
+BasicOpenFile(const char *fileName, int fileFlags)
+{
+ return BasicOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * BasicOpenFilePerm --- same as open(2) except can free other FDs if needed
+ *
+ * This is exported for use by places that really want a plain kernel FD,
+ * but need to be proof against running out of FDs. Once an FD has been
+ * successfully returned, it is the caller's responsibility to ensure that
+ * it will not be leaked on ereport()! Most users should *not* call this
+ * routine directly, but instead use the VFD abstraction level, which
+ * provides protection against descriptor leaks as well as management of
+ * files that need to be open for more than a short period of time.
+ *
+ * Ideally this should be the *only* direct call of open() in the backend.
+ * In practice, the postmaster calls open() directly, and there are some
+ * direct open() calls done early in backend startup. Those are OK since
+ * this module wouldn't have any open files to close at that point anyway.
+ */
+int
+BasicOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ int fd;
+
+tryAgain:
+#ifdef PG_O_DIRECT_USE_F_NOCACHE
+
+ /*
+ * The value we defined to stand in for O_DIRECT when simulating it with
+ * F_NOCACHE had better not collide with any of the standard flags.
+ */
+ StaticAssertStmt((PG_O_DIRECT &
+ (O_APPEND |
+ O_CLOEXEC |
+ O_CREAT |
+ O_DSYNC |
+ O_EXCL |
+ O_RDWR |
+ O_RDONLY |
+ O_SYNC |
+ O_TRUNC |
+ O_WRONLY)) == 0,
+ "PG_O_DIRECT value collides with standard flag");
+ fd = open(fileName, fileFlags & ~PG_O_DIRECT, fileMode);
+#else
+ fd = open(fileName, fileFlags, fileMode);
+#endif
+
+ if (fd >= 0)
+ {
+#ifdef PG_O_DIRECT_USE_F_NOCACHE
+ if (fileFlags & PG_O_DIRECT)
+ {
+ if (fcntl(fd, F_NOCACHE, 1) < 0)
+ {
+ int save_errno = errno;
+
+ close(fd);
+ errno = save_errno;
+ return -1;
+ }
+ }
+#endif
+
+ return fd; /* success! */
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto tryAgain;
+ errno = save_errno;
+ }
+
+ return -1; /* failure */
+}
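+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of using BasicOpenFile() directly.  Because the caller, not
+ * fd.c, owns the descriptor, it must be closed before anything that can
+ * ereport(), or it will be leaked.  The path is a placeholder.
+ */
+static void
+example_basic_open(void)
+{
+	int			fd;
+
+	fd = BasicOpenFile("some_control_file", O_RDONLY | PG_BINARY);
+	if (fd < 0)
+		elog(ERROR, "could not open file: %m");
+
+	/* ... read from fd, avoiding anything that can throw ... */
+
+	close(fd);
+}
+#endif							/* NOT_USED */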
+
+/*
+ * AcquireExternalFD - attempt to reserve an external file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module.
+ *
+ * The difference between this and the underlying ReserveExternalFD function
+ * is that this will report failure (by setting errno and returning false)
+ * if "too many" external FDs are already reserved. This should be used in
+ * any code where the total number of FDs to be reserved is not predictable
+ * and small.
+ */
+bool
+AcquireExternalFD(void)
+{
+ /*
+ * We don't want more than max_safe_fds / 3 FDs to be consumed for
+ * "external" FDs.
+ */
+ if (numExternalFDs < max_safe_fds / 3)
+ {
+ ReserveExternalFD();
+ return true;
+ }
+ errno = EMFILE;
+ return false;
+}
+
+/*
+ * ReserveExternalFD - report external consumption of a file descriptor
+ *
+ * This should be used by callers that need to hold a file descriptor open
+ * over more than a short interval, but cannot use any of the other facilities
+ * provided by this module. This just tracks the use of the FD and closes
+ * VFDs if needed to ensure we keep NUM_RESERVED_FDS FDs available.
+ *
+ * Call this directly only in code where failure to reserve the FD would be
+ * fatal; for example, the WAL-writing code does so, since the alternative is
+ * session failure. Also, it's very unwise to do so in code that could
+ * consume more than one FD per process.
+ *
+ * Note: as long as everybody plays nice so that NUM_RESERVED_FDS FDs remain
+ * available, it doesn't matter too much whether this is called before or
+ * after actually opening the FD; but doing so beforehand reduces the risk of
+ * an EMFILE failure if not everybody played nice.  In any case, it's solely
+ * the caller's responsibility to keep the external-FD count in sync with
+ * reality.
+ */
+void
+ReserveExternalFD(void)
+{
+ /*
+ * Release VFDs if needed to stay safe. Because we do this before
+ * incrementing numExternalFDs, the final state will be as desired, i.e.,
+ * nfile + numAllocatedDescs + numExternalFDs <= max_safe_fds.
+ */
+ ReleaseLruFiles();
+
+ numExternalFDs++;
+}
+
+/*
+ * ReleaseExternalFD - report release of an external file descriptor
+ *
+ * This is guaranteed not to change errno, so it can be used in failure paths.
+ */
+void
+ReleaseExternalFD(void)
+{
+ Assert(numExternalFDs > 0);
+ numExternalFDs--;
+}
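+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of the AcquireExternalFD()/ReleaseExternalFD() protocol for
+ * code that holds a long-lived kernel FD that fd.c cannot manage.  The
+ * socket() call is only a stand-in for whatever actually consumes the
+ * descriptor (its headers are not included here).
+ */
+static int
+example_external_fd_bracket(void)
+{
+	int			extfd;
+
+	if (!AcquireExternalFD())
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("too many external file descriptors in use")));
+
+	extfd = socket(AF_UNIX, SOCK_STREAM, 0);
+	if (extfd < 0)
+	{
+		ReleaseExternalFD();	/* keep numExternalFDs in sync with reality */
+		elog(ERROR, "could not create socket: %m");
+	}
+
+	/* caller must call ReleaseExternalFD() when it eventually closes extfd */
+	return extfd;
+}
+#endif							/* NOT_USED */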
+
+
+#if defined(FDDEBUG)
+
+static void
+_dump_lru(void)
+{
+ int mru = VfdCache[0].lruLessRecently;
+ Vfd *vfdP = &VfdCache[mru];
+ char buf[2048];
+
+ snprintf(buf, sizeof(buf), "LRU: MOST %d ", mru);
+ while (mru != 0)
+ {
+ mru = vfdP->lruLessRecently;
+ vfdP = &VfdCache[mru];
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "%d ", mru);
+ }
+ snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), "LEAST");
+ elog(LOG, "%s", buf);
+}
+#endif /* FDDEBUG */
+
+static void
+Delete(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "Delete %d (%s)",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ vfdP = &VfdCache[file];
+
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = vfdP->lruMoreRecently;
+ VfdCache[vfdP->lruMoreRecently].lruLessRecently = vfdP->lruLessRecently;
+
+ DO_DB(_dump_lru());
+}
+
+static void
+LruDelete(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "LruDelete %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ /*
+ * Close the file. We aren't expecting this to fail; if it does, better
+ * to leak the FD than to mess up our internal state.
+ */
+ if (close(vfdP->fd) != 0)
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
+ vfdP->fd = VFD_CLOSED;
+ --nfile;
+
+ /* delete the vfd record from the LRU ring */
+ Delete(file);
+}
+
+static void
+Insert(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "Insert %d (%s)",
+ file, VfdCache[file].fileName));
+ DO_DB(_dump_lru());
+
+ vfdP = &VfdCache[file];
+
+ vfdP->lruMoreRecently = 0;
+ vfdP->lruLessRecently = VfdCache[0].lruLessRecently;
+ VfdCache[0].lruLessRecently = file;
+ VfdCache[vfdP->lruLessRecently].lruMoreRecently = file;
+
+ DO_DB(_dump_lru());
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+LruInsert(File file)
+{
+ Vfd *vfdP;
+
+ Assert(file != 0);
+
+ DO_DB(elog(LOG, "LruInsert %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ if (FileIsNotOpen(file))
+ {
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ /*
+ * The open could still fail for lack of file descriptors, eg due to
+ * overall system file table being full. So, be prepared to release
+ * another FD if necessary...
+ */
+ vfdP->fd = BasicOpenFilePerm(vfdP->fileName, vfdP->fileFlags,
+ vfdP->fileMode);
+ if (vfdP->fd < 0)
+ {
+ DO_DB(elog(LOG, "re-open failed: %m"));
+ return -1;
+ }
+ else
+ {
+ ++nfile;
+ }
+ }
+
+ /*
+ * put it at the head of the Lru ring
+ */
+
+ Insert(file);
+
+ return 0;
+}
+
+/*
+ * Release one kernel FD by closing the least-recently-used VFD.
+ */
+static bool
+ReleaseLruFile(void)
+{
+ DO_DB(elog(LOG, "ReleaseLruFile. Opened %d", nfile));
+
+ if (nfile > 0)
+ {
+ /*
+		 * There are open files, so there should be at least one used VFD in
+		 * the ring.
+ */
+ Assert(VfdCache[0].lruMoreRecently != 0);
+ LruDelete(VfdCache[0].lruMoreRecently);
+ return true; /* freed a file */
+ }
+ return false; /* no files available to free */
+}
+
+/*
+ * Release kernel FDs as needed to get under the max_safe_fds limit.
+ * After calling this, it's OK to try to open another file.
+ */
+static void
+ReleaseLruFiles(void)
+{
+ while (nfile + numAllocatedDescs + numExternalFDs >= max_safe_fds)
+ {
+ if (!ReleaseLruFile())
+ break;
+ }
+}
+
+static File
+AllocateVfd(void)
+{
+ Index i;
+ File file;
+
+ DO_DB(elog(LOG, "AllocateVfd. Size %zu", SizeVfdCache));
+
+ Assert(SizeVfdCache > 0); /* InitFileAccess not called? */
+
+ if (VfdCache[0].nextFree == 0)
+ {
+ /*
+ * The free list is empty so it is time to increase the size of the
+ * array. We choose to double it each time this happens. However,
+ * there's not much point in starting *real* small.
+ */
+ Size newCacheSize = SizeVfdCache * 2;
+ Vfd *newVfdCache;
+
+ if (newCacheSize < 32)
+ newCacheSize = 32;
+
+ /*
+ * Be careful not to clobber VfdCache ptr if realloc fails.
+ */
+ newVfdCache = (Vfd *) realloc(VfdCache, sizeof(Vfd) * newCacheSize);
+ if (newVfdCache == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ VfdCache = newVfdCache;
+
+ /*
+ * Initialize the new entries and link them into the free list.
+ */
+ for (i = SizeVfdCache; i < newCacheSize; i++)
+ {
+ MemSet((char *) &(VfdCache[i]), 0, sizeof(Vfd));
+ VfdCache[i].nextFree = i + 1;
+ VfdCache[i].fd = VFD_CLOSED;
+ }
+ VfdCache[newCacheSize - 1].nextFree = 0;
+ VfdCache[0].nextFree = SizeVfdCache;
+
+ /*
+ * Record the new size
+ */
+ SizeVfdCache = newCacheSize;
+ }
+
+ file = VfdCache[0].nextFree;
+
+ VfdCache[0].nextFree = VfdCache[file].nextFree;
+
+ return file;
+}
+
+static void
+FreeVfd(File file)
+{
+ Vfd *vfdP = &VfdCache[file];
+
+ DO_DB(elog(LOG, "FreeVfd: %d (%s)",
+ file, vfdP->fileName ? vfdP->fileName : ""));
+
+ if (vfdP->fileName != NULL)
+ {
+ free(vfdP->fileName);
+ vfdP->fileName = NULL;
+ }
+ vfdP->fdstate = 0x0;
+
+ vfdP->nextFree = VfdCache[0].nextFree;
+ VfdCache[0].nextFree = file;
+}
+
+/* returns 0 on success, -1 on re-open failure (with errno set) */
+static int
+FileAccess(File file)
+{
+ int returnValue;
+
+ DO_DB(elog(LOG, "FileAccess %d (%s)",
+ file, VfdCache[file].fileName));
+
+ /*
+ * Is the file open? If not, open it and put it at the head of the LRU
+ * ring (possibly closing the least recently used file to get an FD).
+ */
+
+ if (FileIsNotOpen(file))
+ {
+ returnValue = LruInsert(file);
+ if (returnValue != 0)
+ return returnValue;
+ }
+ else if (VfdCache[0].lruLessRecently != file)
+ {
+ /*
+ * We now know that the file is open and that it is not the last one
+ * accessed, so we need to move it to the head of the Lru ring.
+ */
+
+ Delete(file);
+ Insert(file);
+ }
+
+ return 0;
+}
+
+/*
+ * Called whenever a temporary file is deleted to report its size.
+ */
+static void
+ReportTemporaryFileUsage(const char *path, off_t size)
+{
+ pgstat_report_tempfile(size);
+
+ if (log_temp_files >= 0)
+ {
+ if ((size / 1024) >= log_temp_files)
+ ereport(LOG,
+ (errmsg("temporary file: path \"%s\", size %lu",
+ path, (unsigned long) size)));
+ }
+}
+
+/*
+ * Called to register a temporary file for automatic close.
+ * ResourceOwnerEnlargeFiles(CurrentResourceOwner) must have been called
+ * before the file was opened.
+ */
+static void
+RegisterTemporaryFile(File file)
+{
+ ResourceOwnerRememberFile(CurrentResourceOwner, file);
+ VfdCache[file].resowner = CurrentResourceOwner;
+
+ /* Backup mechanism for closing at end of xact. */
+ VfdCache[file].fdstate |= FD_CLOSE_AT_EOXACT;
+ have_xact_temporary_files = true;
+}
+
+/*
+ * Called when we get a shared invalidation message on some relation.
+ */
+#ifdef NOT_USED
+void
+FileInvalidate(File file)
+{
+ Assert(FileIsValid(file));
+ if (!FileIsNotOpen(file))
+ LruDelete(file);
+}
+#endif
+
+/*
+ * Open a file with PathNameOpenFilePerm() and pass default file mode for the
+ * fileMode parameter.
+ */
+File
+PathNameOpenFile(const char *fileName, int fileFlags)
+{
+ return PathNameOpenFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * open a file in an arbitrary directory
+ *
+ * NB: if the passed pathname is relative (which it usually is),
+ * it will be interpreted relative to the process' working directory
+ * (which should always be $PGDATA when this code is running).
+ */
+File
+PathNameOpenFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ char *fnamecopy;
+ File file;
+ Vfd *vfdP;
+
+ DO_DB(elog(LOG, "PathNameOpenFilePerm: %s %x %o",
+ fileName, fileFlags, fileMode));
+
+ /*
+ * We need a malloc'd copy of the file name; fail cleanly if no room.
+ */
+ fnamecopy = strdup(fileName);
+ if (fnamecopy == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+
+ file = AllocateVfd();
+ vfdP = &VfdCache[file];
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ /*
+ * Descriptors managed by VFDs are implicitly marked O_CLOEXEC. The
+ * client shouldn't be expected to know which kernel descriptors are
+ * currently open, so it wouldn't make sense for them to be inherited by
+ * executed subprograms.
+ */
+ fileFlags |= O_CLOEXEC;
+
+ vfdP->fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+ if (vfdP->fd < 0)
+ {
+ int save_errno = errno;
+
+ FreeVfd(file);
+ free(fnamecopy);
+ errno = save_errno;
+ return -1;
+ }
+ ++nfile;
+ DO_DB(elog(LOG, "PathNameOpenFile: success %d",
+ vfdP->fd));
+
+ vfdP->fileName = fnamecopy;
+ /* Saved flags are adjusted to be OK for re-opening file */
+ vfdP->fileFlags = fileFlags & ~(O_CREAT | O_TRUNC | O_EXCL);
+ vfdP->fileMode = fileMode;
+ vfdP->fileSize = 0;
+ vfdP->fdstate = 0x0;
+ vfdP->resowner = NULL;
+
+ Insert(file);
+
+ return file;
+}
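+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of the VFD lifecycle: open by path, read through the File
+ * handle, close.  The path is a placeholder and 0 is passed for the wait
+ * event since this example does not define one.
+ */
+static void
+example_vfd_read(void)
+{
+	File		file;
+	char		buf[BLCKSZ];
+
+	file = PathNameOpenFile("some/relative/path", O_RDONLY | PG_BINARY);
+	if (file < 0)
+		elog(ERROR, "could not open file: %m");
+
+	if (FileRead(file, buf, sizeof(buf), 0, 0) < 0)
+		elog(ERROR, "could not read file \"%s\": %m", FilePathName(file));
+
+	FileClose(file);
+}
+#endif							/* NOT_USED */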
+
+/*
+ * Create directory 'directory'. If necessary, create 'basedir', which must
+ * be the directory above it. This is designed for creating the top-level
+ * temporary directory on demand before creating a directory underneath it.
+ * Do nothing if the directory already exists.
+ *
+ * Directories created within the top-level temporary directory should begin
+ * with PG_TEMP_FILE_PREFIX, so that they can be identified as temporary and
+ * deleted at startup by RemovePgTempFiles(). Further subdirectories below
+ * that do not need any particular prefix.
+ */
+void
+PathNameCreateTemporaryDir(const char *basedir, const char *directory)
+{
+ if (MakePGDirectory(directory) < 0)
+ {
+ if (errno == EEXIST)
+ return;
+
+ /*
+ * Failed. Try to create basedir first in case it's missing. Tolerate
+ * EEXIST to close a race against another process following the same
+ * algorithm.
+ */
+ if (MakePGDirectory(basedir) < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary directory \"%s\": %m",
+ basedir)));
+
+ /* Try again. */
+ if (MakePGDirectory(directory) < 0 && errno != EEXIST)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("cannot create temporary subdirectory \"%s\": %m",
+ directory)));
+ }
+}
+
+/*
+ * Delete a directory and everything in it, if it exists.
+ */
+void
+PathNameDeleteTemporaryDir(const char *dirname)
+{
+ struct stat statbuf;
+
+ /* Silently ignore missing directory. */
+ if (stat(dirname, &statbuf) != 0 && errno == ENOENT)
+ return;
+
+ /*
+ * Currently, walkdir doesn't offer a way for our passed in function to
+ * maintain state. Perhaps it should, so that we could tell the caller
+ * whether this operation succeeded or failed. Since this operation is
+ * used in a cleanup path, we wouldn't actually behave differently: we'll
+ * just log failures.
+ */
+ walkdir(dirname, unlink_if_exists_fname, false, LOG);
+}
+
+/*
+ * Open a temporary file that will disappear when we close it.
+ *
+ * This routine takes care of generating an appropriate tempfile name.
+ * There's no need to pass in fileFlags or fileMode either, since only
+ * one setting makes any sense for a temp file.
+ *
+ * Unless interXact is true, the file is remembered by CurrentResourceOwner
+ * to ensure it's closed and deleted when it's no longer needed, typically at
+ * the end-of-transaction. In most cases, you don't want temporary files to
+ * outlive the transaction that created them, so this should be false -- but
+ * if you need "somewhat" temporary storage, this might be useful. In either
+ * case, the file is removed when the File is explicitly closed.
+ */
+File
+OpenTemporaryFile(bool interXact)
+{
+ File file = 0;
+
+ Assert(temporary_files_allowed); /* check temp file access is up */
+
+ /*
+ * Make sure the current resource owner has space for this File before we
+ * open it, if we'll be registering it below.
+ */
+ if (!interXact)
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /*
+ * If some temp tablespace(s) have been given to us, try to use the next
+ * one. If a given tablespace can't be found, we silently fall back to
+ * the database's default tablespace.
+ *
+ * BUT: if the temp file is slated to outlive the current transaction,
+ * force it into the database's default tablespace, so that it will not
+ * pose a threat to possible tablespace drop attempts.
+ */
+ if (numTempTableSpaces > 0 && !interXact)
+ {
+ Oid tblspcOid = GetNextTempTableSpace();
+
+ if (OidIsValid(tblspcOid))
+ file = OpenTemporaryFileInTablespace(tblspcOid, false);
+ }
+
+ /*
+ * If not, or if tablespace is bad, create in database's default
+ * tablespace. MyDatabaseTableSpace should normally be set before we get
+ * here, but just in case it isn't, fall back to pg_default tablespace.
+ */
+ if (file <= 0)
+ file = OpenTemporaryFileInTablespace(MyDatabaseTableSpace ?
+ MyDatabaseTableSpace :
+ DEFAULTTABLESPACE_OID,
+ true);
+
+ /* Mark it for deletion at close and temporary file size limit */
+ VfdCache[file].fdstate |= FD_DELETE_AT_CLOSE | FD_TEMP_FILE_LIMIT;
+
+ /* Register it with the current resource owner */
+ if (!interXact)
+ RegisterTemporaryFile(file);
+
+ return file;
+}
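+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of a transaction-scoped temporary file: it is registered
+ * with the current resource owner, counted against temp_file_limit, and both
+ * closed and deleted by FileClose().  The payload and wait event (0) are
+ * placeholders.
+ */
+static void
+example_temp_file(void)
+{
+	File		file;
+	char		data[] = "spill data";
+
+	file = OpenTemporaryFile(false);	/* false: don't outlive the xact */
+
+	if (FileWrite(file, data, sizeof(data), 0, 0) != (int) sizeof(data))
+		elog(ERROR, "could not write temporary file: %m");
+
+	/* ... read it back with FileRead() as needed ... */
+
+	FileClose(file);			/* also unlinks it (FD_DELETE_AT_CLOSE) */
+}
+#endif							/* NOT_USED */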
+
+/*
+ * Return the path of the temp directory in a given tablespace.
+ */
+void
+TempTablespacePath(char *path, Oid tablespace)
+{
+ /*
+ * Identify the tempfile directory for this tablespace.
+ *
+ * If someone tries to specify pg_global, use pg_default instead.
+ */
+ if (tablespace == InvalidOid ||
+ tablespace == DEFAULTTABLESPACE_OID ||
+ tablespace == GLOBALTABLESPACE_OID)
+ snprintf(path, MAXPGPATH, "base/%s", PG_TEMP_FILES_DIR);
+ else
+ {
+ /* All other tablespaces are accessed via symlinks */
+ snprintf(path, MAXPGPATH, "pg_tblspc/%u/%s/%s",
+ tablespace, TABLESPACE_VERSION_DIRECTORY,
+ PG_TEMP_FILES_DIR);
+ }
+}
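+
+/*
+ * For example, a (hypothetical) user tablespace with OID 16385 maps to
+ * "pg_tblspc/16385/<TABLESPACE_VERSION_DIRECTORY>/<PG_TEMP_FILES_DIR>",
+ * while InvalidOid, pg_default and pg_global all map to
+ * "base/<PG_TEMP_FILES_DIR>".
+ */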
+
+/*
+ * Open a temporary file in a specific tablespace.
+ * Subroutine for OpenTemporaryFile, which see for details.
+ */
+static File
+OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError)
+{
+ char tempdirpath[MAXPGPATH];
+ char tempfilepath[MAXPGPATH];
+ File file;
+
+ TempTablespacePath(tempdirpath, tblspcOid);
+
+ /*
+ * Generate a tempfile name that should be unique within the current
+ * database instance.
+ */
+ snprintf(tempfilepath, sizeof(tempfilepath), "%s/%s%d.%ld",
+ tempdirpath, PG_TEMP_FILE_PREFIX, MyProcPid, tempFileCounter++);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = PathNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0)
+ {
+ /*
+ * We might need to create the tablespace's tempfile directory, if no
+ * one has yet done so.
+ *
+ * Don't check for an error from MakePGDirectory; it could fail if
+ * someone else just did the same thing. If it doesn't work then
+ * we'll bomb out on the second create attempt, instead.
+ */
+ (void) MakePGDirectory(tempdirpath);
+
+ file = PathNameOpenFile(tempfilepath,
+ O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0 && rejectError)
+ elog(ERROR, "could not create temporary file \"%s\": %m",
+ tempfilepath);
+ }
+
+ return file;
+}
+
+
+/*
+ * Create a new file. The directory containing it must already exist. Files
+ * created this way are subject to temp_file_limit and are automatically
+ * closed at end of transaction, but are not automatically deleted on close
+ * because they are intended to be shared between cooperating backends.
+ *
+ * If the file is inside the top-level temporary directory, its name should
+ * begin with PG_TEMP_FILE_PREFIX so that it can be identified as temporary
+ * and deleted at startup by RemovePgTempFiles(). Alternatively, it can be
+ * inside a directory created with PathNameCreateTemporaryDir(), in which case
+ * the prefix isn't needed.
+ */
+File
+PathNameCreateTemporaryFile(const char *path, bool error_on_failure)
+{
+ File file;
+
+ Assert(temporary_files_allowed); /* check temp file access is up */
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ /*
+ * Open the file. Note: we don't use O_EXCL, in case there is an orphaned
+ * temp file that can be reused.
+ */
+ file = PathNameOpenFile(path, O_RDWR | O_CREAT | O_TRUNC | PG_BINARY);
+ if (file <= 0)
+ {
+ if (error_on_failure)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create temporary file \"%s\": %m",
+ path)));
+ else
+ return file;
+ }
+
+ /* Mark it for temp_file_limit accounting. */
+ VfdCache[file].fdstate |= FD_TEMP_FILE_LIMIT;
+
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+
+ return file;
+}
+
+/*
+ * Open a file that was created with PathNameCreateTemporaryFile, possibly in
+ * another backend. Files opened this way don't count against the
+ * temp_file_limit of the caller, are automatically closed at the end of the
+ * transaction but are not deleted on close.
+ */
+File
+PathNameOpenTemporaryFile(const char *path, int mode)
+{
+ File file;
+
+ Assert(temporary_files_allowed); /* check temp file access is up */
+
+ ResourceOwnerEnlargeFiles(CurrentResourceOwner);
+
+ file = PathNameOpenFile(path, mode | PG_BINARY);
+
+ /* If no such file, then we don't raise an error. */
+ if (file <= 0 && errno != ENOENT)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open temporary file \"%s\": %m",
+ path)));
+
+ if (file > 0)
+ {
+ /* Register it for automatic close. */
+ RegisterTemporaryFile(file);
+ }
+
+ return file;
+}
+
+/*
+ * Delete a file by pathname.  Return true if the file existed, false if it
+ * didn't.
+ */
+bool
+PathNameDeleteTemporaryFile(const char *path, bool error_on_failure)
+{
+ struct stat filestats;
+ int stat_errno;
+
+ /* Get the final size for pgstat reporting. */
+ if (stat(path, &filestats) != 0)
+ stat_errno = errno;
+ else
+ stat_errno = 0;
+
+ /*
+ * Unlike FileClose's automatic file deletion code, we tolerate
+ * non-existence to support BufFileDeleteFileSet which doesn't know how
+ * many segments it has to delete until it runs out.
+ */
+ if (stat_errno == ENOENT)
+ return false;
+
+ if (unlink(path) < 0)
+ {
+ if (errno != ENOENT)
+ ereport(error_on_failure ? ERROR : LOG,
+ (errcode_for_file_access(),
+ errmsg("could not unlink temporary file \"%s\": %m",
+ path)));
+ return false;
+ }
+
+ if (stat_errno == 0)
+ ReportTemporaryFileUsage(path, filestats.st_size);
+ else
+ {
+ errno = stat_errno;
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", path)));
+ }
+
+ return true;
+}
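+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of the shared-temporary-file calls above, for files that
+ * several cooperating backends access (fileset.c wraps this for real use).
+ * The directory and file names are placeholders.
+ */
+static void
+example_shared_temp_file(void)
+{
+	File		file;
+
+	/* creator side */
+	PathNameCreateTemporaryDir("base/pgsql_tmp",
+							   "base/pgsql_tmp/pgsql_tmp_example");
+	file = PathNameCreateTemporaryFile("base/pgsql_tmp/pgsql_tmp_example/chunk.0",
+									   true);
+	/* ... FileWrite() the shared data ... */
+	FileClose(file);
+
+	/* reader side, possibly in another backend */
+	file = PathNameOpenTemporaryFile("base/pgsql_tmp/pgsql_tmp_example/chunk.0",
+									 O_RDONLY);
+	if (file > 0)
+		FileClose(file);
+
+	/* whoever cleans up last removes the file and then the directory */
+	PathNameDeleteTemporaryFile("base/pgsql_tmp/pgsql_tmp_example/chunk.0", false);
+	PathNameDeleteTemporaryDir("base/pgsql_tmp/pgsql_tmp_example");
+}
+#endif							/* NOT_USED */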
+
+/*
+ * close a file when done with it
+ */
+void
+FileClose(File file)
+{
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileClose: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ vfdP = &VfdCache[file];
+
+ if (!FileIsNotOpen(file))
+ {
+ /* close the file */
+ if (close(vfdP->fd) != 0)
+ {
+ /*
+ * We may need to panic on failure to close non-temporary files;
+ * see LruDelete.
+ */
+ elog(vfdP->fdstate & FD_TEMP_FILE_LIMIT ? LOG : data_sync_elevel(LOG),
+ "could not close file \"%s\": %m", vfdP->fileName);
+ }
+
+ --nfile;
+ vfdP->fd = VFD_CLOSED;
+
+ /* remove the file from the lru ring */
+ Delete(file);
+ }
+
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ /* Subtract its size from current usage (do first in case of error) */
+ temporary_files_size -= vfdP->fileSize;
+ vfdP->fileSize = 0;
+ }
+
+ /*
+ * Delete the file if it was temporary, and make a log entry if wanted
+ */
+ if (vfdP->fdstate & FD_DELETE_AT_CLOSE)
+ {
+ struct stat filestats;
+ int stat_errno;
+
+ /*
+ * If we get an error, as could happen within the ereport/elog calls,
+ * we'll come right back here during transaction abort. Reset the
+ * flag to ensure that we can't get into an infinite loop. This code
+ * is arranged to ensure that the worst-case consequence is failing to
+ * emit log message(s), not failing to attempt the unlink.
+ */
+ vfdP->fdstate &= ~FD_DELETE_AT_CLOSE;
+
+
+ /* first try the stat() */
+ if (stat(vfdP->fileName, &filestats))
+ stat_errno = errno;
+ else
+ stat_errno = 0;
+
+ /* in any case do the unlink */
+ if (unlink(vfdP->fileName))
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not delete file \"%s\": %m", vfdP->fileName)));
+
+ /* and last report the stat results */
+ if (stat_errno == 0)
+ ReportTemporaryFileUsage(vfdP->fileName, filestats.st_size);
+ else
+ {
+ errno = stat_errno;
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m", vfdP->fileName)));
+ }
+ }
+
+ /* Unregister it from the resource owner */
+ if (vfdP->resowner)
+ ResourceOwnerForgetFile(vfdP->resowner, file);
+
+ /*
+ * Return the Vfd slot to the free list
+ */
+ FreeVfd(file);
+}
+
+/*
+ * FilePrefetch - initiate asynchronous read of a given range of the file.
+ *
+ * Currently the only implementation of this function is using posix_fadvise
+ * which is the simplest standardized interface that accomplishes this.
+ * We could add an implementation using libaio in the future; but note that
+ * this API is inappropriate for libaio, which wants to have a buffer provided
+ * to read into.
+ */
+int
+FilePrefetch(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED)
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+retry:
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = posix_fadvise(VfdCache[file].fd, offset, amount,
+ POSIX_FADV_WILLNEED);
+ pgstat_report_wait_end();
+
+ if (returnCode == EINTR)
+ goto retry;
+
+ return returnCode;
+#else
+ Assert(FileIsValid(file));
+ return 0;
+#endif
+}
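+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of pairing FilePrefetch() with a later FileRead(): hint the
+ * kernel early, do other work, then perform the read that hopefully no
+ * longer has to wait for I/O.  The wait events (0) are placeholders.
+ */
+static void
+example_prefetch_then_read(File file, char *buf, off_t offset, size_t len)
+{
+	(void) FilePrefetch(file, offset, (off_t) len, 0);
+
+	/* ... useful work that overlaps with the read-ahead ... */
+
+	if (FileRead(file, buf, len, offset, 0) < 0)
+		elog(ERROR, "could not read file \"%s\": %m", FilePathName(file));
+}
+#endif							/* NOT_USED */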
+
+void
+FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) nbytes));
+
+ if (nbytes <= 0)
+ return;
+
+ if (VfdCache[file].fileFlags & PG_O_DIRECT)
+ return;
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return;
+
+ pgstat_report_wait_start(wait_event_info);
+ pg_flush_data(VfdCache[file].fd, offset, nbytes);
+ pgstat_report_wait_end();
+}
+
+int
+FileRead(File file, void *buffer, size_t amount, off_t offset,
+ uint32 wait_event_info)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileRead: %d (%s) " INT64_FORMAT " %zu %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ vfdP = &VfdCache[file];
+
+retry:
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
+ pgstat_report_wait_end();
+
+ if (returnCode < 0)
+ {
+ /*
+		 * Windows may run out of kernel buffers and return an "Insufficient
+		 * system resources" error.  Wait a bit and retry.
+ *
+ * It is rumored that EINTR is also possible on some Unix filesystems,
+ * in which case immediate retry is indicated.
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
+
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+ }
+
+ return returnCode;
+}
+
+int
+FileWrite(File file, const void *buffer, size_t amount, off_t offset,
+ uint32 wait_event_info)
+{
+ int returnCode;
+ Vfd *vfdP;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileWrite: %d (%s) " INT64_FORMAT " %zu %p",
+ file, VfdCache[file].fileName,
+ (int64) offset,
+ amount, buffer));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ vfdP = &VfdCache[file];
+
+ /*
+ * If enforcing temp_file_limit and it's a temp file, check to see if the
+ * write would overrun temp_file_limit, and throw error if so. Note: it's
+ * really a modularity violation to throw error here; we should set errno
+ * and return -1. However, there's no way to report a suitable error
+ * message if we do that. All current callers would just throw error
+ * immediately anyway, so this is safe at present.
+ */
+ if (temp_file_limit >= 0 && (vfdP->fdstate & FD_TEMP_FILE_LIMIT))
+ {
+ off_t past_write = offset + amount;
+
+ if (past_write > vfdP->fileSize)
+ {
+ uint64 newTotal = temporary_files_size;
+
+ newTotal += past_write - vfdP->fileSize;
+ if (newTotal > (uint64) temp_file_limit * (uint64) 1024)
+ ereport(ERROR,
+ (errcode(ERRCODE_CONFIGURATION_LIMIT_EXCEEDED),
+ errmsg("temporary file size exceeds temp_file_limit (%dkB)",
+ temp_file_limit)));
+ }
+ }
+
+retry:
+ errno = 0;
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
+ pgstat_report_wait_end();
+
+ /* if write didn't set errno, assume problem is no disk space */
+ if (returnCode != amount && errno == 0)
+ errno = ENOSPC;
+
+ if (returnCode >= 0)
+ {
+ /*
+ * Maintain fileSize and temporary_files_size if it's a temp file.
+ */
+ if (vfdP->fdstate & FD_TEMP_FILE_LIMIT)
+ {
+ off_t past_write = offset + amount;
+
+ if (past_write > vfdP->fileSize)
+ {
+ temporary_files_size += past_write - vfdP->fileSize;
+ vfdP->fileSize = past_write;
+ }
+ }
+ }
+ else
+ {
+ /*
+ * See comments in FileRead()
+ */
+#ifdef WIN32
+ DWORD error = GetLastError();
+
+ switch (error)
+ {
+ case ERROR_NO_SYSTEM_RESOURCES:
+ pg_usleep(1000L);
+ errno = EINTR;
+ break;
+ default:
+ _dosmaperr(error);
+ break;
+ }
+#endif
+ /* OK to retry if interrupted */
+ if (errno == EINTR)
+ goto retry;
+ }
+
+ return returnCode;
+}
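+
+/*
+ * Worked example of the limit check above (illustrative numbers, in kB for
+ * brevity): with temp_file_limit = 1024, temporary_files_size currently at
+ * 1000 (of which this file accounts for 900), a write extending this file to
+ * 950 makes the projected total 1000 + (950 - 900) = 1050 > 1024, so the
+ * ERROR is raised before any data is written.
+ */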
+
+int
+FileSync(File file, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSync: %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_fsync(VfdCache[file].fd);
+ pgstat_report_wait_end();
+
+ return returnCode;
+}
+
+/*
+ * Zero a region of the file.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+ int returnCode;
+ ssize_t written;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
+ pgstat_report_wait_end();
+
+ if (written < 0)
+ return -1;
+ else if (written != amount)
+ {
+ /* if errno is unset, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc implements posix_fallocate() in userspace if it is
+ * not implemented by the filesystem; that's not the case for all
+ * environments, though.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#ifdef HAVE_POSIX_FALLOCATE
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return -1;
+
+retry:
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
+ pgstat_report_wait_end();
+
+ if (returnCode == 0)
+ return 0;
+ else if (returnCode == EINTR)
+ goto retry;
+
+ /* for compatibility with %m printing etc */
+ errno = returnCode;
+
+ /*
+	 * Return in case of a "real" failure; if fallocate is not supported,
+	 * fall through to the FileZero()-backed implementation.
+ */
+ if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+ return -1;
+#endif
+
+ return FileZero(file, offset, amount, wait_event_info);
+}
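+
+#ifdef NOT_USED
+/*
+ * Minimal sketch of pre-extending a file with FileFallocate(): reserve the
+ * space (or, where fallocate is unavailable, write zeros via the FileZero()
+ * fallback) before the data is actually needed.  The wait event (0) is a
+ * placeholder.
+ */
+static void
+example_preallocate(File file, off_t new_size)
+{
+	off_t		cur_size = FileSize(file);
+
+	if (cur_size < 0)
+		elog(ERROR, "could not get size of file \"%s\": %m",
+			 FilePathName(file));
+
+	if (cur_size < new_size &&
+		FileFallocate(file, cur_size, new_size - cur_size, 0) != 0)
+		elog(ERROR, "could not extend file \"%s\": %m",
+			 FilePathName(file));
+}
+#endif							/* NOT_USED */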
+
+off_t
+FileSize(File file)
+{
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileSize %d (%s)",
+ file, VfdCache[file].fileName));
+
+ if (FileIsNotOpen(file))
+ {
+ if (FileAccess(file) < 0)
+ return (off_t) -1;
+ }
+
+ return lseek(VfdCache[file].fd, 0, SEEK_END);
+}
+
+int
+FileTruncate(File file, off_t offset, uint32 wait_event_info)
+{
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileTruncate %d (%s)",
+ file, VfdCache[file].fileName));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = pg_ftruncate(VfdCache[file].fd, offset);
+ pgstat_report_wait_end();
+
+ if (returnCode == 0 && VfdCache[file].fileSize > offset)
+ {
+ /* adjust our state for truncation of a temp file */
+ Assert(VfdCache[file].fdstate & FD_TEMP_FILE_LIMIT);
+ temporary_files_size -= VfdCache[file].fileSize - offset;
+ VfdCache[file].fileSize = offset;
+ }
+
+ return returnCode;
+}
+
+/*
+ * Return the pathname associated with an open file.
+ *
+ * The returned string points to an internal buffer, which is valid until
+ * the file is closed.
+ */
+char *
+FilePathName(File file)
+{
+ Assert(FileIsValid(file));
+
+ return VfdCache[file].fileName;
+}
+
+/*
+ * Return the raw file descriptor of an opened file.
+ *
+ * The returned file descriptor will be valid until the file is closed, but
+ * there are a lot of things that can make that happen. So the caller should
+ * be careful not to do much of anything else before it finishes using the
+ * returned file descriptor.
+ */
+int
+FileGetRawDesc(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fd;
+}
+
+/*
+ * FileGetRawFlags - returns the file flags on open(2)
+ */
+int
+FileGetRawFlags(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fileFlags;
+}
+
+/*
+ * FileGetRawMode - returns the mode bitmask passed to open(2)
+ */
+mode_t
+FileGetRawMode(File file)
+{
+ Assert(FileIsValid(file));
+ return VfdCache[file].fileMode;
+}
+
+/*
+ * Make room for another allocatedDescs[] array entry if needed and possible.
+ * Returns true if an array element is available.
+ */
+static bool
+reserveAllocatedDesc(void)
+{
+ AllocateDesc *newDescs;
+ int newMax;
+
+ /* Quick out if array already has a free slot. */
+ if (numAllocatedDescs < maxAllocatedDescs)
+ return true;
+
+ /*
+ * If the array hasn't yet been created in the current process, initialize
+ * it with FD_MINFREE / 3 elements. In many scenarios this is as many as
+ * we will ever need, anyway. We don't want to look at max_safe_fds
+ * immediately because set_max_safe_fds() may not have run yet.
+ */
+ if (allocatedDescs == NULL)
+ {
+ newMax = FD_MINFREE / 3;
+ newDescs = (AllocateDesc *) malloc(newMax * sizeof(AllocateDesc));
+ /* Out of memory already? Treat as fatal error. */
+ if (newDescs == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ allocatedDescs = newDescs;
+ maxAllocatedDescs = newMax;
+ return true;
+ }
+
+ /*
+ * Consider enlarging the array beyond the initial allocation used above.
+ * By the time this happens, max_safe_fds should be known accurately.
+ *
+ * We mustn't let allocated descriptors hog all the available FDs, and in
+ * practice we'd better leave a reasonable number of FDs for VFD use. So
+ * set the maximum to max_safe_fds / 3. (This should certainly be at
+ * least as large as the initial size, FD_MINFREE / 3, so we aren't
+ * tightening the restriction here.) Recall that "external" FDs are
+ * allowed to consume another third of max_safe_fds.
+ */
+ newMax = max_safe_fds / 3;
+ if (newMax > maxAllocatedDescs)
+ {
+ newDescs = (AllocateDesc *) realloc(allocatedDescs,
+ newMax * sizeof(AllocateDesc));
+ /* Treat out-of-memory as a non-fatal error. */
+ if (newDescs == NULL)
+ return false;
+ allocatedDescs = newDescs;
+ maxAllocatedDescs = newMax;
+ return true;
+ }
+
+ /* Can't enlarge allocatedDescs[] any more. */
+ return false;
+}
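+
+/*
+ * Worked example (numbers are illustrative and assume FD_MINFREE is 48):
+ * the array starts out with FD_MINFREE / 3 = 16 slots; if set_max_safe_fds()
+ * later arrives at max_safe_fds = 984, one realloc can grow it to
+ * 984 / 3 = 328 slots, after which reserveAllocatedDesc() reports failure.
+ */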
+
+/*
+ * Routines that want to use stdio (ie, FILE*) should use AllocateFile
+ * rather than plain fopen(). This lets fd.c deal with freeing FDs if
+ * necessary to open the file. When done, call FreeFile rather than fclose.
+ *
+ * Note that files that will be open for any significant length of time
+ * should NOT be handled this way, since they cannot share kernel file
+ * descriptors with other files; there is grave risk of running out of FDs
+ * if anyone locks down too many FDs. Most callers of this routine are
+ * simply reading a config file that they will read and close immediately.
+ *
+ * fd.c will automatically close all files opened with AllocateFile at
+ * transaction commit or abort; this prevents FD leakage if a routine
+ * that calls AllocateFile is terminated prematurely by ereport(ERROR).
+ *
+ * Ideally this should be the *only* direct call of fopen() in the backend.
+ */
+FILE *
+AllocateFile(const char *name, const char *mode)
+{
+ FILE *file;
+
+ DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+ numAllocatedDescs, name));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+ maxAllocatedDescs, name)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ if ((file = fopen(name, mode)) != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescFile;
+ desc->desc.file = file;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.file;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
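+
+/*
+ * Minimal usage sketch (the file name is hypothetical): read a small
+ * configuration file and release the stream immediately afterwards.
+ *
+ *	FILE	   *fp = AllocateFile("my_extension.conf", "r");
+ *
+ *	if (fp == NULL)
+ *		ereport(ERROR,
+ *				(errcode_for_file_access(),
+ *				 errmsg("could not open file \"%s\": %m",
+ *						"my_extension.conf")));
+ *	... parse the contents ...
+ *	FreeFile(fp);
+ */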
+
+/*
+ * Open a file with OpenTransientFilePerm() and pass default file mode for
+ * the fileMode parameter.
+ */
+int
+OpenTransientFile(const char *fileName, int fileFlags)
+{
+ return OpenTransientFilePerm(fileName, fileFlags, pg_file_create_mode);
+}
+
+/*
+ * Like AllocateFile, but returns an unbuffered fd like open(2)
+ */
+int
+OpenTransientFilePerm(const char *fileName, int fileFlags, mode_t fileMode)
+{
+ int fd;
+
+ DO_DB(elog(LOG, "OpenTransientFile: Allocated %d (%s)",
+ numAllocatedDescs, fileName));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open file \"%s\"",
+ maxAllocatedDescs, fileName)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+ fd = BasicOpenFilePerm(fileName, fileFlags, fileMode);
+
+ if (fd >= 0)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescRawFD;
+ desc->desc.fd = fd;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+
+ return fd;
+ }
+
+ return -1; /* failure */
+}
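+
+/*
+ * Minimal usage sketch (path and flags are hypothetical): open a file for a
+ * short read and close it with CloseTransientFile() rather than plain
+ * close(), so fd.c's bookkeeping stays consistent.
+ *
+ *	int			fd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
+ *
+ *	if (fd < 0)
+ *		ereport(ERROR,
+ *				(errcode_for_file_access(),
+ *				 errmsg("could not open file \"%s\": %m", path)));
+ *	... read() from fd ...
+ *	if (CloseTransientFile(fd) != 0)
+ *		ereport(ERROR,
+ *				(errcode_for_file_access(),
+ *				 errmsg("could not close file \"%s\": %m", path)));
+ */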
+
+/*
+ * Routines that want to initiate a pipe stream should use OpenPipeStream
+ * rather than plain popen(). This lets fd.c deal with freeing FDs if
+ * necessary. When done, call ClosePipeStream rather than pclose.
+ *
+ * This function also ensures that the popen'd program is run with default
+ * SIGPIPE processing, rather than the SIG_IGN setting the backend normally
+ * uses. This ensures desirable response to, e.g., closing a read pipe early.
+ */
+FILE *
+OpenPipeStream(const char *command, const char *mode)
+{
+ FILE *file;
+ int save_errno;
+
+ DO_DB(elog(LOG, "OpenPipeStream: Allocated %d (%s)",
+ numAllocatedDescs, command));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to execute command \"%s\"",
+ maxAllocatedDescs, command)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ fflush(NULL);
+ pqsignal(SIGPIPE, SIG_DFL);
+ errno = 0;
+ file = popen(command, mode);
+ save_errno = errno;
+ pqsignal(SIGPIPE, SIG_IGN);
+ errno = save_errno;
+ if (file != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescPipe;
+ desc->desc.file = file;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.file;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
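+
+/*
+ * Minimal usage sketch (the command string is hypothetical): run a program,
+ * consume its output, and close the pipe with ClosePipeStream().
+ *
+ *	FILE	   *pipe = OpenPipeStream(cmd, "r");
+ *
+ *	if (pipe == NULL)
+ *		ereport(ERROR,
+ *				(errcode_for_file_access(),
+ *				 errmsg("could not execute command \"%s\": %m", cmd)));
+ *	... fgets() from pipe ...
+ *	(void) ClosePipeStream(pipe);
+ */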
+
+/*
+ * Free an AllocateDesc of any type.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+ int result;
+
+ /* Close the underlying object */
+ switch (desc->kind)
+ {
+ case AllocateDescFile:
+ result = fclose(desc->desc.file);
+ break;
+ case AllocateDescPipe:
+ result = pclose(desc->desc.file);
+ break;
+ case AllocateDescDir:
+ result = closedir(desc->desc.dir);
+ break;
+ case AllocateDescRawFD:
+ result = close(desc->desc.fd);
+ break;
+ default:
+ elog(ERROR, "AllocateDesc kind not recognized");
+ result = 0; /* keep compiler quiet */
+ break;
+ }
+
+ /* Compact storage in the allocatedDescs array */
+ numAllocatedDescs--;
+ *desc = allocatedDescs[numAllocatedDescs];
+
+ return result;
+}
+
+/*
+ * Close a file returned by AllocateFile.
+ *
+ * Note we do not check fclose's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+FreeFile(FILE *file)
+{
+ int i;
+
+ DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
+
+ /* Remove file from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescFile && desc->desc.file == file)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+ return fclose(file);
+}
+
+/*
+ * Close a file returned by OpenTransientFile.
+ *
+ * Note we do not check close's return value --- it is up to the caller
+ * to handle close errors.
+ */
+int
+CloseTransientFile(int fd)
+{
+ int i;
+
+ DO_DB(elog(LOG, "CloseTransientFile: Allocated %d", numAllocatedDescs));
+
+ /* Remove fd from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescRawFD && desc->desc.fd == fd)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "fd passed to CloseTransientFile was not obtained from OpenTransientFile");
+
+ return close(fd);
+}
+
+/*
+ * Routines that want to use <dirent.h> (ie, DIR*) should use AllocateDir
+ * rather than plain opendir(). This lets fd.c deal with freeing FDs if
+ * necessary to open the directory, and with closing it after an elog.
+ * When done, call FreeDir rather than closedir.
+ *
+ * Returns NULL, with errno set, on failure. Note that failure detection
+ * is commonly left to the following call of ReadDir or ReadDirExtended;
+ * see the comments for ReadDir.
+ *
+ * Ideally this should be the *only* direct call of opendir() in the backend.
+ */
+DIR *
+AllocateDir(const char *dirname)
+{
+ DIR *dir;
+
+ DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+ numAllocatedDescs, dirname));
+
+ /* Can we allocate another non-virtual FD? */
+ if (!reserveAllocatedDesc())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("exceeded maxAllocatedDescs (%d) while trying to open directory \"%s\"",
+ maxAllocatedDescs, dirname)));
+
+ /* Close excess kernel FDs. */
+ ReleaseLruFiles();
+
+TryAgain:
+ if ((dir = opendir(dirname)) != NULL)
+ {
+ AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+ desc->kind = AllocateDescDir;
+ desc->desc.dir = dir;
+ desc->create_subid = GetCurrentSubTransactionId();
+ numAllocatedDescs++;
+ return desc->desc.dir;
+ }
+
+ if (errno == EMFILE || errno == ENFILE)
+ {
+ int save_errno = errno;
+
+ ereport(LOG,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("out of file descriptors: %m; release and retry")));
+ errno = 0;
+ if (ReleaseLruFile())
+ goto TryAgain;
+ errno = save_errno;
+ }
+
+ return NULL;
+}
+
+/*
+ * Read a directory opened with AllocateDir, ereport'ing any error.
+ *
+ * This is easier to use than raw readdir() since it takes care of some
+ * otherwise rather tedious and error-prone manipulation of errno. Also,
+ * if you are happy with a generic error message for AllocateDir failure,
+ * you can just do
+ *
+ * dir = AllocateDir(path);
+ * while ((dirent = ReadDir(dir, path)) != NULL)
+ * process dirent;
+ * FreeDir(dir);
+ *
+ * since a NULL dir parameter is taken as indicating AllocateDir failed.
+ * (Make sure errno isn't changed between AllocateDir and ReadDir if you
+ * use this shortcut.)
+ *
+ * The pathname passed to AllocateDir must be passed to this routine too,
+ * but it is only used for error reporting.
+ */
+struct dirent *
+ReadDir(DIR *dir, const char *dirname)
+{
+ return ReadDirExtended(dir, dirname, ERROR);
+}
+
+/*
+ * Alternate version of ReadDir that allows caller to specify the elevel
+ * for any error report (whether it's reporting an initial failure of
+ * AllocateDir or a subsequent directory read failure).
+ *
+ * If elevel < ERROR, returns NULL after any error. With the normal coding
+ * pattern, this will result in falling out of the loop immediately as
+ * though the directory contained no (more) entries.
+ */
+struct dirent *
+ReadDirExtended(DIR *dir, const char *dirname, int elevel)
+{
+ struct dirent *dent;
+
+ /* Give a generic message for AllocateDir failure, if caller didn't */
+ if (dir == NULL)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ dirname)));
+ return NULL;
+ }
+
+ errno = 0;
+ if ((dent = readdir(dir)) != NULL)
+ return dent;
+
+ if (errno)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not read directory \"%s\": %m",
+ dirname)));
+ return NULL;
+}
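+
+/*
+ * Usage sketch for the LOG-level variant (the path variable is
+ * hypothetical): if the directory could not be opened or read, the loop
+ * simply sees no entries, so a missing directory is tolerated.
+ *
+ *	dir = AllocateDir(path);
+ *	while ((de = ReadDirExtended(dir, path, LOG)) != NULL)
+ *		process de;
+ *	FreeDir(dir);
+ */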
+
+/*
+ * Close a directory opened with AllocateDir.
+ *
+ * Returns closedir's return value (with errno set if it's not 0).
+ * Note we do not check the return value --- it is up to the caller
+ * to handle close errors if wanted.
+ *
+ * Does nothing if dir == NULL; we assume that directory open failure was
+ * already reported if desired.
+ */
+int
+FreeDir(DIR *dir)
+{
+ int i;
+
+ /* Nothing to do if AllocateDir failed */
+ if (dir == NULL)
+ return 0;
+
+ DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
+
+ /* Remove dir from list of allocated dirs, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a dir not in allocatedDescs */
+ elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+ return closedir(dir);
+}
+
+
+/*
+ * Close a pipe stream returned by OpenPipeStream.
+ */
+int
+ClosePipeStream(FILE *file)
+{
+ int i;
+
+ DO_DB(elog(LOG, "ClosePipeStream: Allocated %d", numAllocatedDescs));
+
+ /* Remove file from list of allocated files, if it's present */
+ for (i = numAllocatedDescs; --i >= 0;)
+ {
+ AllocateDesc *desc = &allocatedDescs[i];
+
+ if (desc->kind == AllocateDescPipe && desc->desc.file == file)
+ return FreeDesc(desc);
+ }
+
+ /* Only get here if someone passes us a file not in allocatedDescs */
+ elog(WARNING, "file passed to ClosePipeStream was not obtained from OpenPipeStream");
+
+ return pclose(file);
+}
+
+/*
+ * closeAllVfds
+ *
+ * Force all VFDs into the physically-closed state, so that the fewest
+ * possible number of kernel file descriptors are in use. There is no
+ * change in the logical state of the VFDs.
+ */
+void
+closeAllVfds(void)
+{
+ Index i;
+
+ if (SizeVfdCache > 0)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ if (!FileIsNotOpen(i))
+ LruDelete(i);
+ }
+ }
+}
+
+
+/*
+ * SetTempTablespaces
+ *
+ * Define a list (actually an array) of OIDs of tablespaces to use for
+ * temporary files. This list will be used until end of transaction,
+ * unless this function is called again before then. It is caller's
+ * responsibility that the passed-in array has adequate lifespan (typically
+ * it'd be allocated in TopTransactionContext).
+ *
+ * Some entries of the array may be InvalidOid, indicating that the current
+ * database's default tablespace should be used.
+ */
+void
+SetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+ Assert(numSpaces >= 0);
+ tempTableSpaces = tableSpaces;
+ numTempTableSpaces = numSpaces;
+
+ /*
+ * Select a random starting point in the list. This is to minimize
+ * conflicts between backends that are most likely sharing the same list
+ * of temp tablespaces. Note that if we create multiple temp files in the
+ * same transaction, we'll advance circularly through the list --- this
+ * ensures that large temporary sort files are nicely spread across all
+ * available tablespaces.
+ */
+ if (numSpaces > 1)
+ nextTempTableSpace = pg_prng_uint64_range(&pg_global_prng_state,
+ 0, numSpaces - 1);
+ else
+ nextTempTableSpace = 0;
+}
+
+/*
+ * TempTablespacesAreSet
+ *
+ * Returns true if SetTempTablespaces has been called in current transaction.
+ * (This is just so that tablespaces.c doesn't need its own per-transaction
+ * state.)
+ */
+bool
+TempTablespacesAreSet(void)
+{
+ return (numTempTableSpaces >= 0);
+}
+
+/*
+ * GetTempTablespaces
+ *
+ * Populate an array with the OIDs of the tablespaces that should be used for
+ * temporary files. (Some entries may be InvalidOid, indicating that the
+ * current database's default tablespace should be used.) At most numSpaces
+ * entries will be filled.
+ * Returns the number of OIDs that were copied into the output array.
+ */
+int
+GetTempTablespaces(Oid *tableSpaces, int numSpaces)
+{
+ int i;
+
+ Assert(TempTablespacesAreSet());
+ for (i = 0; i < numTempTableSpaces && i < numSpaces; ++i)
+ tableSpaces[i] = tempTableSpaces[i];
+
+ return i;
+}
+
+/*
+ * GetNextTempTableSpace
+ *
+ * Select the next temp tablespace to use. A result of InvalidOid means
+ * to use the current database's default tablespace.
+ */
+Oid
+GetNextTempTableSpace(void)
+{
+ if (numTempTableSpaces > 0)
+ {
+ /* Advance nextTempTableSpace counter with wraparound */
+ if (++nextTempTableSpace >= numTempTableSpaces)
+ nextTempTableSpace = 0;
+ return tempTableSpaces[nextTempTableSpace];
+ }
+ return InvalidOid;
+}
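+
+/*
+ * Caller sketch (illustrative): choose a tablespace for a new temp file,
+ * falling back to the database default when no temp tablespace list is set.
+ *
+ *	if (TempTablespacesAreSet())
+ *		tblspcOid = GetNextTempTableSpace();
+ *	else
+ *		tblspcOid = InvalidOid;
+ */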
+
+
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort. At abort, we close temp files
+ * that the subtransaction may have opened. At commit, we reassign the
+ * files that were opened to the parent subtransaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, SubTransactionId mySubid,
+ SubTransactionId parentSubid)
+{
+ Index i;
+
+ for (i = 0; i < numAllocatedDescs; i++)
+ {
+ if (allocatedDescs[i].create_subid == mySubid)
+ {
+ if (isCommit)
+ allocatedDescs[i].create_subid = parentSubid;
+ else
+ {
+ /* have to recheck the item after FreeDesc (ugly) */
+ FreeDesc(&allocatedDescs[i--]);
+ }
+ }
+ }
+}
+
+/*
+ * AtEOXact_Files
+ *
+ * This routine is called during transaction commit or abort. All still-open
+ * per-transaction temporary file VFDs are closed, which also causes the
+ * underlying files to be deleted (although they should've been closed already
+ * by the ResourceOwner cleanup). Furthermore, all "allocated" stdio files are
+ * closed. We also forget any transaction-local temp tablespace list.
+ *
+ * The isCommit flag is used only to decide whether to emit warnings about
+ * unclosed files.
+ */
+void
+AtEOXact_Files(bool isCommit)
+{
+ CleanupTempFiles(isCommit, false);
+ tempTableSpaces = NULL;
+ numTempTableSpaces = -1;
+}
+
+/*
+ * BeforeShmemExit_Files
+ *
+ * before_shmem_exit hook to clean up temp files during backend shutdown.
+ * Here, we want to clean up *all* temp files including interXact ones.
+ */
+static void
+BeforeShmemExit_Files(int code, Datum arg)
+{
+ CleanupTempFiles(false, true);
+
+ /* prevent further temp files from being created */
+#ifdef USE_ASSERT_CHECKING
+ temporary_files_allowed = false;
+#endif
+}
+
+/*
+ * Close temporary files and delete their underlying files.
+ *
+ * isCommit: if true, this is normal transaction commit, and we don't
+ * expect any remaining files; warn if there are some.
+ *
+ * isProcExit: if true, this is being called as the backend process is
+ * exiting. If that's the case, we should remove all temporary files; if
+ * that's not the case, we are being called for transaction commit/abort
+ * and should only remove transaction-local temp files. In either case,
+ * also clean up "allocated" stdio files, dirs and fds.
+ */
+static void
+CleanupTempFiles(bool isCommit, bool isProcExit)
+{
+ Index i;
+
+ /*
+ * Careful here: at proc_exit we need extra cleanup, not just
+ * xact_temporary files.
+ */
+ if (isProcExit || have_xact_temporary_files)
+ {
+ Assert(FileIsNotOpen(0)); /* Make sure ring not corrupted */
+ for (i = 1; i < SizeVfdCache; i++)
+ {
+ unsigned short fdstate = VfdCache[i].fdstate;
+
+ if (((fdstate & FD_DELETE_AT_CLOSE) || (fdstate & FD_CLOSE_AT_EOXACT)) &&
+ VfdCache[i].fileName != NULL)
+ {
+ /*
+ * If we're in the process of exiting a backend process, close
+ * all temporary files. Otherwise, only close temporary files
+ * local to the current transaction. They should be closed by
+ * the ResourceOwner mechanism already, so this is just a
+ * debugging cross-check.
+ */
+ if (isProcExit)
+ FileClose(i);
+ else if (fdstate & FD_CLOSE_AT_EOXACT)
+ {
+ elog(WARNING,
+ "temporary file %s not closed at end-of-transaction",
+ VfdCache[i].fileName);
+ FileClose(i);
+ }
+ }
+ }
+
+ have_xact_temporary_files = false;
+ }
+
+ /* Complain if any allocated files remain open at commit. */
+ if (isCommit && numAllocatedDescs > 0)
+ elog(WARNING, "%d temporary files and directories not closed at end-of-transaction",
+ numAllocatedDescs);
+
+ /* Clean up "allocated" stdio files, dirs and fds. */
+ while (numAllocatedDescs > 0)
+ FreeDesc(&allocatedDescs[0]);
+}
+
+
+/*
+ * Remove temporary and temporary relation files left over from a prior
+ * postmaster session
+ *
+ * This should be called during postmaster startup. It will forcibly
+ * remove any leftover files created by OpenTemporaryFile and any leftover
+ * temporary relation files created by mdcreate.
+ *
+ * During post-backend-crash restart cycle, this routine is called when
+ * remove_temp_files_after_crash GUC is enabled. Multiple crashes while
+ * queries are using temp files could result in useless storage usage that can
+ * only be reclaimed by a service restart. The argument against enabling it is
+ * that someone might want to examine the temporary files for debugging
+ * purposes. This does, however, mean that OpenTemporaryFile had better allow for
+ * collision with an existing temp file name.
+ *
+ * NOTE: this function and its subroutines generally report syscall failures
+ * with ereport(LOG) and keep going. Removing temp files is not so critical
+ * that we should fail to start the database when we can't do it.
+ */
+void
+RemovePgTempFiles(void)
+{
+ char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY) + sizeof(PG_TEMP_FILES_DIR)];
+ DIR *spc_dir;
+ struct dirent *spc_de;
+
+ /*
+ * First process temp files in pg_default ($PGDATA/base)
+ */
+ snprintf(temp_path, sizeof(temp_path), "base/%s", PG_TEMP_FILES_DIR);
+ RemovePgTempFilesInDir(temp_path, true, false);
+ RemovePgTempRelationFiles("base");
+
+ /*
+ * Cycle through temp directories for all non-default tablespaces.
+ */
+ spc_dir = AllocateDir("pg_tblspc");
+
+ while ((spc_de = ReadDirExtended(spc_dir, "pg_tblspc", LOG)) != NULL)
+ {
+ if (strcmp(spc_de->d_name, ".") == 0 ||
+ strcmp(spc_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY, PG_TEMP_FILES_DIR);
+ RemovePgTempFilesInDir(temp_path, true, false);
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+ RemovePgTempRelationFiles(temp_path);
+ }
+
+ FreeDir(spc_dir);
+
+ /*
+ * In EXEC_BACKEND case there is a pgsql_tmp directory at the top level of
+ * DataDir as well. However, that is *not* cleaned here because doing so
+ * would create a race condition. It's done separately, earlier in
+ * postmaster startup.
+ */
+}
+
+/*
+ * Process one pgsql_tmp directory for RemovePgTempFiles.
+ *
+ * If missing_ok is true, it's all right for the named directory to not exist.
+ * Any other problem results in a LOG message. (missing_ok should be true at
+ * the top level, since pgsql_tmp directories are not created until needed.)
+ *
+ * At the top level, this should be called with unlink_all = false, so that
+ * only files matching the temporary name prefix will be unlinked. When
+ * recursing it will be called with unlink_all = true to unlink everything
+ * under a top-level temporary directory.
+ *
+ * (These two flags could be replaced by one, but it seems clearer to keep
+ * them separate.)
+ */
+void
+RemovePgTempFilesInDir(const char *tmpdirname, bool missing_ok, bool unlink_all)
+{
+ DIR *temp_dir;
+ struct dirent *temp_de;
+ char rm_path[MAXPGPATH * 2];
+
+ temp_dir = AllocateDir(tmpdirname);
+
+ if (temp_dir == NULL && errno == ENOENT && missing_ok)
+ return;
+
+ while ((temp_de = ReadDirExtended(temp_dir, tmpdirname, LOG)) != NULL)
+ {
+ if (strcmp(temp_de->d_name, ".") == 0 ||
+ strcmp(temp_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ tmpdirname, temp_de->d_name);
+
+ if (unlink_all ||
+ strncmp(temp_de->d_name,
+ PG_TEMP_FILE_PREFIX,
+ strlen(PG_TEMP_FILE_PREFIX)) == 0)
+ {
+ PGFileType type = get_dirent_type(rm_path, temp_de, false, LOG);
+
+ if (type == PGFILETYPE_ERROR)
+ continue;
+ else if (type == PGFILETYPE_DIR)
+ {
+ /* recursively remove contents, then directory itself */
+ RemovePgTempFilesInDir(rm_path, false, true);
+
+ if (rmdir(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove directory \"%s\": %m",
+ rm_path)));
+ }
+ else
+ {
+ if (unlink(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ }
+ }
+ else
+ ereport(LOG,
+ (errmsg("unexpected file found in temporary-files directory: \"%s\"",
+ rm_path)));
+ }
+
+ FreeDir(temp_dir);
+}
+
+/* Process one tablespace directory, look for per-DB subdirectories */
+static void
+RemovePgTempRelationFiles(const char *tsdirname)
+{
+ DIR *ts_dir;
+ struct dirent *de;
+ char dbspace_path[MAXPGPATH * 2];
+
+ ts_dir = AllocateDir(tsdirname);
+
+ while ((de = ReadDirExtended(ts_dir, tsdirname, LOG)) != NULL)
+ {
+ /*
+ * We're only interested in the per-database directories, which have
+ * numeric names. Note that this code will also (properly) ignore "."
+ * and "..".
+ */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+ tsdirname, de->d_name);
+ RemovePgTempRelationFilesInDbspace(dbspace_path);
+ }
+
+ FreeDir(ts_dir);
+}
+
+/* Process one per-dbspace directory for RemovePgTempRelationFiles */
+static void
+RemovePgTempRelationFilesInDbspace(const char *dbspacedirname)
+{
+ DIR *dbspace_dir;
+ struct dirent *de;
+ char rm_path[MAXPGPATH * 2];
+
+ dbspace_dir = AllocateDir(dbspacedirname);
+
+ while ((de = ReadDirExtended(dbspace_dir, dbspacedirname, LOG)) != NULL)
+ {
+ if (!looks_like_temp_rel_name(de->d_name))
+ continue;
+
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ dbspacedirname, de->d_name);
+
+ if (unlink(rm_path) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ }
+
+ FreeDir(dbspace_dir);
+}
+
+/* t<digits>_<digits>, or t<digits>_<digits>_<forkname> */
+bool
+looks_like_temp_rel_name(const char *name)
+{
+ int pos;
+ int savepos;
+
+ /* Must start with "t". */
+ if (name[0] != 't')
+ return false;
+
+ /* Followed by a non-empty string of digits and then an underscore. */
+ for (pos = 1; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (pos == 1 || name[pos] != '_')
+ return false;
+
+ /* Followed by another nonempty string of digits. */
+ for (savepos = ++pos; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (savepos == pos)
+ return false;
+
+ /* We might have _forkname or .segment or both. */
+ if (name[pos] == '_')
+ {
+ int forkchar = forkname_chars(&name[pos + 1], NULL);
+
+ if (forkchar <= 0)
+ return false;
+ pos += forkchar + 1;
+ }
+ if (name[pos] == '.')
+ {
+ int segchar;
+
+ for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+ ;
+ if (segchar <= 1)
+ return false;
+ pos += segchar;
+ }
+
+ /* Now we should be at the end. */
+ if (name[pos] != '\0')
+ return false;
+ return true;
+}
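+
+/*
+ * Examples (illustrative): "t3_16450", "t3_16450_fsm" and "t3_16450.1" all
+ * match this pattern, while "16450", "t3_" and "t3_16450_bogus" do not.
+ */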
+
+#ifdef HAVE_SYNCFS
+static void
+do_syncfs(const char *path)
+{
+ int fd;
+
+ ereport_startup_progress("syncing data directory (syncfs), elapsed time: %ld.%02d s, current path: %s",
+ path);
+
+ fd = OpenTransientFile(path, O_RDONLY);
+ if (fd < 0)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ return;
+ }
+ if (syncfs(fd) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not synchronize file system for file \"%s\": %m", path)));
+ CloseTransientFile(fd);
+}
+#endif
+
+/*
+ * Issue fsync recursively on PGDATA and all its contents, or issue syncfs for
+ * all potential filesystems, depending on the recovery_init_sync_method setting.
+ *
+ * We fsync regular files and directories wherever they are, but we
+ * follow symlinks only for pg_wal and immediately under pg_tblspc.
+ * Other symlinks are presumed to point at files we're not responsible
+ * for fsyncing, and we might not have privileges to write to them at all.
+ *
+ * Errors are logged but not considered fatal; that's because this is used
+ * only during database startup, to deal with the possibility that there are
+ * issued-but-unsynced writes pending against the data directory. We want to
+ * ensure that such writes reach disk before anything that's done in the new
+ * run. However, aborting on error would result in failure to start for
+ * harmless cases such as read-only files in the data directory, and that's
+ * not good either.
+ *
+ * Note that if we previously crashed due to a PANIC on fsync(), we'll be
+ * rewriting all changes again during recovery.
+ *
+ * Note we assume we're chdir'd into PGDATA to begin with.
+ */
+void
+SyncDataDirectory(void)
+{
+ bool xlog_is_symlink;
+
+ /* We can skip this whole thing if fsync is disabled. */
+ if (!enableFsync)
+ return;
+
+ /*
+ * If pg_wal is a symlink, we'll need to recurse into it separately,
+ * because the first walkdir below will ignore it.
+ */
+ xlog_is_symlink = false;
+
+ {
+ struct stat st;
+
+ if (lstat("pg_wal", &st) < 0)
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not stat file \"%s\": %m",
+ "pg_wal")));
+ else if (S_ISLNK(st.st_mode))
+ xlog_is_symlink = true;
+ }
+
+#ifdef HAVE_SYNCFS
+ if (recovery_init_sync_method == RECOVERY_INIT_SYNC_METHOD_SYNCFS)
+ {
+ DIR *dir;
+ struct dirent *de;
+
+ /*
+ * On Linux, we don't have to open every single file one by one. We
+ * can use syncfs() to sync whole filesystems. We only expect
+ * filesystem boundaries to exist where we tolerate symlinks, namely
+ * pg_wal and the tablespaces, so we call syncfs() for each of those
+ * directories.
+ */
+
+ /* Prepare to report progress syncing the data directory via syncfs. */
+ begin_startup_progress_phase();
+
+ /* Sync the top level pgdata directory. */
+ do_syncfs(".");
+ /* If any tablespaces are configured, sync each of those. */
+ dir = AllocateDir("pg_tblspc");
+ while ((de = ReadDirExtended(dir, "pg_tblspc", LOG)))
+ {
+ char path[MAXPGPATH];
+
+ if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(path, MAXPGPATH, "pg_tblspc/%s", de->d_name);
+ do_syncfs(path);
+ }
+ FreeDir(dir);
+ /* If pg_wal is a symlink, process that too. */
+ if (xlog_is_symlink)
+ do_syncfs("pg_wal");
+ return;
+ }
+#endif /* HAVE_SYNCFS */
+
+#ifdef PG_FLUSH_DATA_WORKS
+ /* Prepare to report progress of the pre-fsync phase. */
+ begin_startup_progress_phase();
+
+ /*
+ * If possible, hint to the kernel that we're soon going to fsync the data
+ * directory and its contents. Errors in this step are even less
+ * interesting than normal, so log them only at DEBUG1.
+ */
+ walkdir(".", pre_sync_fname, false, DEBUG1);
+ if (xlog_is_symlink)
+ walkdir("pg_wal", pre_sync_fname, false, DEBUG1);
+ walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1);
+#endif
+
+ /* Prepare to report progress syncing the data directory via fsync. */
+ begin_startup_progress_phase();
+
+ /*
+ * Now we do the fsync()s in the same order.
+ *
+ * The main call ignores symlinks, so in addition to specially processing
+ * pg_wal if it's a symlink, pg_tblspc has to be visited separately with
+ * process_symlinks = true. Note that if there are any plain directories
+ * in pg_tblspc, they'll get fsync'd twice. That's not an expected case
+ * so we don't worry about optimizing it.
+ */
+ walkdir(".", datadir_fsync_fname, false, LOG);
+ if (xlog_is_symlink)
+ walkdir("pg_wal", datadir_fsync_fname, false, LOG);
+ walkdir("pg_tblspc", datadir_fsync_fname, true, LOG);
+}
+
+/*
+ * walkdir: recursively walk a directory, applying the action to each
+ * regular file and directory (including the named directory itself).
+ *
+ * If process_symlinks is true, the action and recursion are also applied
+ * to regular files and directories that are pointed to by symlinks in the
+ * given directory; otherwise symlinks are ignored. Symlinks are always
+ * ignored in subdirectories, ie we intentionally don't pass down the
+ * process_symlinks flag to recursive calls.
+ *
+ * Errors are reported at level elevel, which might be ERROR or less.
+ *
+ * See also walkdir in file_utils.c, which is a frontend version of this
+ * logic.
+ */
+static void
+walkdir(const char *path,
+ void (*action) (const char *fname, bool isdir, int elevel),
+ bool process_symlinks,
+ int elevel)
+{
+ DIR *dir;
+ struct dirent *de;
+
+ dir = AllocateDir(path);
+
+ while ((de = ReadDirExtended(dir, path, elevel)) != NULL)
+ {
+ char subpath[MAXPGPATH * 2];
+
+ CHECK_FOR_INTERRUPTS();
+
+ if (strcmp(de->d_name, ".") == 0 ||
+ strcmp(de->d_name, "..") == 0)
+ continue;
+
+ snprintf(subpath, sizeof(subpath), "%s/%s", path, de->d_name);
+
+ switch (get_dirent_type(subpath, de, process_symlinks, elevel))
+ {
+ case PGFILETYPE_REG:
+ (*action) (subpath, false, elevel);
+ break;
+ case PGFILETYPE_DIR:
+ walkdir(subpath, action, false, elevel);
+ break;
+ default:
+
+ /*
+ * Errors are already reported directly by get_dirent_type(),
+ * and any remaining symlinks and unknown file types are
+ * ignored.
+ */
+ break;
+ }
+ }
+
+ FreeDir(dir); /* we ignore any error here */
+
+ /*
+ * It's important to fsync the destination directory itself as individual
+ * file fsyncs don't guarantee that the directory entry for the file is
+ * synced. However, skip this if AllocateDir failed; the action function
+ * might not be robust against that.
+ */
+ if (dir)
+ (*action) (path, true, elevel);
+}
+
+
+/*
+ * Hint to the OS that it should get ready to fsync() this file.
+ *
+ * Ignores errors trying to open unreadable files, and logs other errors at a
+ * caller-specified level.
+ */
+#ifdef PG_FLUSH_DATA_WORKS
+
+static void
+pre_sync_fname(const char *fname, bool isdir, int elevel)
+{
+ int fd;
+
+ /* Don't try to flush directories, it'll likely just fail */
+ if (isdir)
+ return;
+
+ ereport_startup_progress("syncing data directory (pre-fsync), elapsed time: %ld.%02d s, current path: %s",
+ fname);
+
+ fd = OpenTransientFile(fname, O_RDONLY | PG_BINARY);
+
+ if (fd < 0)
+ {
+ if (errno == EACCES)
+ return;
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fname)));
+ return;
+ }
+
+ /*
+ * pg_flush_data() ignores errors, which is ok because this is only a
+ * hint.
+ */
+ pg_flush_data(fd, 0, 0);
+
+ if (CloseTransientFile(fd) != 0)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fname)));
+}
+
+#endif /* PG_FLUSH_DATA_WORKS */
+
+static void
+datadir_fsync_fname(const char *fname, bool isdir, int elevel)
+{
+ ereport_startup_progress("syncing data directory (fsync), elapsed time: %ld.%02d s, current path: %s",
+ fname);
+
+ /*
+ * We want to silently ignore errors about unreadable files. Pass that
+ * desire on to fsync_fname_ext().
+ */
+ fsync_fname_ext(fname, isdir, true, elevel);
+}
+
+static void
+unlink_if_exists_fname(const char *fname, bool isdir, int elevel)
+{
+ if (isdir)
+ {
+ if (rmdir(fname) != 0 && errno != ENOENT)
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not remove directory \"%s\": %m", fname)));
+ }
+ else
+ {
+ /* Use PathNameDeleteTemporaryFile to report filesize */
+ PathNameDeleteTemporaryFile(fname, false);
+ }
+}
+
+/*
+ * fsync_fname_ext -- Try to fsync a file or directory
+ *
+ * If ignore_perm is true, ignore errors upon trying to open unreadable
+ * files. Logs other errors at a caller-specified level.
+ *
+ * Returns 0 if the operation succeeded, -1 otherwise.
+ */
+int
+fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel)
+{
+ int fd;
+ int flags;
+ int returncode;
+
+ /*
+ * Some OSs require directories to be opened read-only whereas other
+ * systems don't allow us to fsync files opened read-only; so we need both
+ * cases here. Using O_RDWR will cause us to fail to fsync files that are
+ * not writable by our userid, but we assume that's OK.
+ */
+ flags = PG_BINARY;
+ if (!isdir)
+ flags |= O_RDWR;
+ else
+ flags |= O_RDONLY;
+
+ fd = OpenTransientFile(fname, flags);
+
+ /*
+ * Some OSs don't allow us to open directories at all (Windows returns
+ * EACCES); just ignore the error in that case. If desired, also silently
+ * ignore errors about unreadable files. Log others.
+ */
+ if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES))
+ return 0;
+ else if (fd < 0 && ignore_perm && errno == EACCES)
+ return 0;
+ else if (fd < 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ returncode = pg_fsync(fd);
+
+ /*
+ * Some OSes don't allow us to fsync directories at all, so we can ignore
+ * those errors. Anything else needs to be logged.
+ */
+ if (returncode != 0 && !(isdir && (errno == EBADF || errno == EINVAL)))
+ {
+ int save_errno;
+
+ /* close file upon error, might not be in transaction context */
+ save_errno = errno;
+ (void) CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close file \"%s\": %m", fname)));
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * fsync_parent_path -- fsync the parent path of a file or directory
+ *
+ * This is aimed at making file operations persistent on disk in case of
+ * an OS crash or power failure.
+ */
+static int
+fsync_parent_path(const char *fname, int elevel)
+{
+ char parentpath[MAXPGPATH];
+
+ strlcpy(parentpath, fname, MAXPGPATH);
+ get_parent_directory(parentpath);
+
+ /*
+ * get_parent_directory() returns an empty string if the input argument is
+ * just a file name (see comments in path.c), so handle that as being the
+ * current directory.
+ */
+ if (strlen(parentpath) == 0)
+ strlcpy(parentpath, ".", MAXPGPATH);
+
+ if (fsync_fname_ext(parentpath, true, false, elevel) != 0)
+ return -1;
+
+ return 0;
+}
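+
+/*
+ * Illustrative pattern (the path variable is hypothetical): to make a newly
+ * created file durable, fsync the file itself and then its containing
+ * directory, so the directory entry survives a crash as well.
+ *
+ *	(void) fsync_fname_ext(path, false, false, LOG);
+ *	(void) fsync_parent_path(path, LOG);
+ */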
+
+/*
+ * Create a PostgreSQL data sub-directory
+ *
+ * The data directory itself, and most of its sub-directories, are created at
+ * initdb time, but we do have some occasions when we create directories in
+ * the backend (CREATE TABLESPACE, for example). In those cases, we want to
+ * make sure that those directories are created consistently. Today, that means
+ * making sure that the created directory has the correct permissions, which is
+ * what pg_dir_create_mode tracks for us.
+ *
+ * Note that we also set the umask() based on what we understand the correct
+ * permissions to be (see file_perm.c).
+ *
+ * For permissions other than the default, mkdir() can be used directly, but
+ * be sure to consider carefully such cases -- a sub-directory with incorrect
+ * permissions in a PostgreSQL data directory could cause backups and other
+ * processes to fail.
+ */
+int
+MakePGDirectory(const char *directoryName)
+{
+ return mkdir(directoryName, pg_dir_create_mode);
+}
+
+/*
+ * Return the passed-in error level, or PANIC if data_sync_retry is off.
+ *
+ * Failure to fsync any data file is cause for immediate panic, unless
+ * data_sync_retry is enabled. Data may have been written to the operating
+ * system and removed from our buffer pool already, and if we are running on
+ * an operating system that forgets dirty data on write-back failure, there
+ * may be only one copy of the data remaining: in the WAL. A later attempt to
+ * fsync again might falsely report success. Therefore we must not allow any
+ * further checkpoints to be attempted. data_sync_retry can in theory be
+ * enabled on systems known not to drop dirty buffered data on write-back
+ * failure (with the likely outcome that checkpoints will continue to fail
+ * until the underlying problem is fixed).
+ *
+ * Any code that reports a failure from fsync() or related functions should
+ * filter the error level with this function.
+ */
+int
+data_sync_elevel(int elevel)
+{
+ return data_sync_retry ? elevel : PANIC;
+}
+
+bool
+check_debug_io_direct(char **newval, void **extra, GucSource source)
+{
+ bool result = true;
+ int flags;
+
+#if PG_O_DIRECT == 0
+ if (strcmp(*newval, "") != 0)
+ {
+ GUC_check_errdetail("debug_io_direct is not supported on this platform.");
+ result = false;
+ }
+ flags = 0;
+#else
+ List *elemlist;
+ ListCell *l;
+ char *rawstring;
+
+ /* Need a modifiable copy of string */
+ rawstring = pstrdup(*newval);
+
+ if (!SplitGUCList(rawstring, ',', &elemlist))
+ {
+ GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
+ "debug_io_direct");
+ pfree(rawstring);
+ list_free(elemlist);
+ return false;
+ }
+
+ flags = 0;
+ foreach(l, elemlist)
+ {
+ char *item = (char *) lfirst(l);
+
+ if (pg_strcasecmp(item, "data") == 0)
+ flags |= IO_DIRECT_DATA;
+ else if (pg_strcasecmp(item, "wal") == 0)
+ flags |= IO_DIRECT_WAL;
+ else if (pg_strcasecmp(item, "wal_init") == 0)
+ flags |= IO_DIRECT_WAL_INIT;
+ else
+ {
+ GUC_check_errdetail("invalid option \"%s\"", item);
+ result = false;
+ break;
+ }
+ }
+
+ /*
+ * It's possible to configure block sizes smaller than our assumed I/O
+ * alignment size, which could result in invalid I/O requests.
+ */
+#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
+ if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
+ {
+ GUC_check_errdetail("debug_io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
+ result = false;
+ }
+#endif
+#if BLCKSZ < PG_IO_ALIGN_SIZE
+ if (result && (flags & IO_DIRECT_DATA))
+ {
+ GUC_check_errdetail("debug_io_direct is not supported for data because BLCKSZ is too small");
+ result = false;
+ }
+#endif
+
+ pfree(rawstring);
+ list_free(elemlist);
+#endif
+
+ if (!result)
+ return result;
+
+ /* Save the flags in *extra, for use by assign_debug_io_direct */
+ *extra = guc_malloc(ERROR, sizeof(int));
+ *((int *) *extra) = flags;
+
+ return result;
+}
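+
+/*
+ * Example (illustrative): setting debug_io_direct = 'data, wal' makes this
+ * check hook store IO_DIRECT_DATA | IO_DIRECT_WAL in *extra, which
+ * assign_debug_io_direct() below copies into io_direct_flags.
+ */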
+
+extern void
+assign_debug_io_direct(const char *newval, void *extra)
+{
+ int *flags = (int *) extra;
+
+ io_direct_flags = *flags;
+}
diff --git a/src/backend/storage/file/fileset.c b/src/backend/storage/file/fileset.c
new file mode 100644
index 0000000..e9951b0
--- /dev/null
+++ b/src/backend/storage/file/fileset.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * fileset.c
+ * Management of named temporary files.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/fileset.c
+ *
+ * FileSets provide a temporary namespace (think directory) so that files can
+ * be discovered by name.
+ *
+ * FileSets can be used by backends when the temporary files need to be
+ * opened/closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/fileset.h"
+#include "utils/builtins.h"
+
+static void FileSetPath(char *path, FileSet *fileset, Oid tablespace);
+static void FilePath(char *path, FileSet *fileset, const char *name);
+static Oid ChooseTablespace(const FileSet *fileset, const char *name);
+
+/*
+ * Initialize a space for temporary files. This API can be used by a shared
+ * fileset, or by a single backend when the temporary files need to be opened
+ * and closed multiple times and the underlying files need to survive across
+ * transactions.
+ *
+ * The callers are expected to explicitly remove such files by using
+ * FileSetDelete/FileSetDeleteAll.
+ *
+ * Files will be distributed over the tablespaces configured in
+ * temp_tablespaces.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted.
+ */
+void
+FileSetInit(FileSet *fileset)
+{
+ static uint32 counter = 0;
+
+ fileset->creator_pid = MyProcPid;
+ fileset->number = counter;
+ counter = (counter + 1) % INT_MAX;
+
+ /* Capture the tablespace OIDs so that all backends agree on them. */
+ PrepareTempTablespaces();
+ fileset->ntablespaces =
+ GetTempTablespaces(&fileset->tablespaces[0],
+ lengthof(fileset->tablespaces));
+ if (fileset->ntablespaces == 0)
+ {
+ /* If the GUC is empty, use current database's default tablespace */
+ fileset->tablespaces[0] = MyDatabaseTableSpace;
+ fileset->ntablespaces = 1;
+ }
+ else
+ {
+ int i;
+
+ /*
+ * An entry of InvalidOid means use the default tablespace for the
+ * current database. Replace that now, to be sure that all users of
+ * the FileSet agree on what to do.
+ */
+ for (i = 0; i < fileset->ntablespaces; i++)
+ {
+ if (fileset->tablespaces[i] == InvalidOid)
+ fileset->tablespaces[i] = MyDatabaseTableSpace;
+ }
+ }
+}
+
+/*
+ * Create a new file in the given set.
+ */
+File
+FileSetCreate(FileSet *fileset, const char *name)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ FilePath(path, fileset, name);
+ file = PathNameCreateTemporaryFile(path, false);
+
+ /* If we failed, see if we need to create the directory on demand. */
+ if (file <= 0)
+ {
+ char tempdirpath[MAXPGPATH];
+ char filesetpath[MAXPGPATH];
+ Oid tablespace = ChooseTablespace(fileset, name);
+
+ TempTablespacePath(tempdirpath, tablespace);
+ FileSetPath(filesetpath, fileset, tablespace);
+ PathNameCreateTemporaryDir(tempdirpath, filesetpath);
+ file = PathNameCreateTemporaryFile(path, true);
+ }
+
+ return file;
+}
+
+/*
+ * Open a file that was created with FileSetCreate().
+ */
+File
+FileSetOpen(FileSet *fileset, const char *name, int mode)
+{
+ char path[MAXPGPATH];
+ File file;
+
+ FilePath(path, fileset, name);
+ file = PathNameOpenTemporaryFile(path, mode);
+
+ return file;
+}
+
+/*
+ * Delete a file that was created with FileSetCreate().
+ *
+ * Return true if the file existed, false if it didn't.
+ */
+bool
+FileSetDelete(FileSet *fileset, const char *name,
+ bool error_on_failure)
+{
+ char path[MAXPGPATH];
+
+ FilePath(path, fileset, name);
+
+ return PathNameDeleteTemporaryFile(path, error_on_failure);
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+FileSetDeleteAll(FileSet *fileset)
+{
+ char dirpath[MAXPGPATH];
+ int i;
+
+ /*
+ * Delete the directory we created in each tablespace. Doesn't fail
+ * because we use this in error cleanup paths, but can generate a LOG
+ * message on I/O error.
+ */
+ for (i = 0; i < fileset->ntablespaces; ++i)
+ {
+ FileSetPath(dirpath, fileset, fileset->tablespaces[i]);
+ PathNameDeleteTemporaryDir(dirpath);
+ }
+}
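+
+/*
+ * End-to-end usage sketch (file name and access pattern are hypothetical):
+ * create a named file in the set, reopen it later (possibly from another
+ * backend holding a copy of the FileSet), then clean up explicitly.
+ *
+ *	FileSetInit(&fileset);
+ *	file = FileSetCreate(&fileset, "spill.0");
+ *	... FileWrite() / FileClose() ...
+ *	file = FileSetOpen(&fileset, "spill.0", O_RDONLY | PG_BINARY);
+ *	... FileRead() / FileClose() ...
+ *	FileSetDeleteAll(&fileset);
+ */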
+
+/*
+ * Build the path for the directory holding the files backing a FileSet in a
+ * given tablespace.
+ */
+static void
+FileSetPath(char *path, FileSet *fileset, Oid tablespace)
+{
+ char tempdirpath[MAXPGPATH];
+
+ TempTablespacePath(tempdirpath, tablespace);
+ snprintf(path, MAXPGPATH, "%s/%s%lu.%u.fileset",
+ tempdirpath, PG_TEMP_FILE_PREFIX,
+ (unsigned long) fileset->creator_pid, fileset->number);
+}
+
+/*
+ * Determine which tablespace a given temporary file belongs in.
+ */
+static Oid
+ChooseTablespace(const FileSet *fileset, const char *name)
+{
+ uint32 hash = hash_any((const unsigned char *) name, strlen(name));
+
+ return fileset->tablespaces[hash % fileset->ntablespaces];
+}
+
+/*
+ * Compute the full path of a file in a FileSet.
+ */
+static void
+FilePath(char *path, FileSet *fileset, const char *name)
+{
+ char dirpath[MAXPGPATH];
+
+ FileSetPath(dirpath, fileset, ChooseTablespace(fileset, name));
+ snprintf(path, MAXPGPATH, "%s/%s", dirpath, name);
+}
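+
+/*
+ * For example (illustrative values), a FileSet created by PID 1234 with
+ * number 5 in the default tablespace keeps its files under
+ * "base/pgsql_tmp/pgsql_tmp1234.5.fileset", so a file named "spill.0" in it
+ * ends up at "base/pgsql_tmp/pgsql_tmp1234.5.fileset/spill.0".
+ */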
diff --git a/src/backend/storage/file/meson.build b/src/backend/storage/file/meson.build
new file mode 100644
index 0000000..e7fe850
--- /dev/null
+++ b/src/backend/storage/file/meson.build
@@ -0,0 +1,10 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'buffile.c',
+ 'copydir.c',
+ 'fd.c',
+ 'fileset.c',
+ 'reinit.c',
+ 'sharedfileset.c',
+)
diff --git a/src/backend/storage/file/reinit.c b/src/backend/storage/file/reinit.c
new file mode 100644
index 0000000..fb55371
--- /dev/null
+++ b/src/backend/storage/file/reinit.c
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * reinit.c
+ * Reinitialization of unlogged relations
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/reinit.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <unistd.h>
+
+#include "common/relpath.h"
+#include "postmaster/startup.h"
+#include "storage/copydir.h"
+#include "storage/fd.h"
+#include "storage/reinit.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+static void ResetUnloggedRelationsInTablespaceDir(const char *tsdirname,
+ int op);
+static void ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname,
+ int op);
+
+typedef struct
+{
+ Oid reloid; /* hash key */
+} unlogged_relation_entry;
+
+/*
+ * Reset unlogged relations from before the last restart.
+ *
+ * If op includes UNLOGGED_RELATION_CLEANUP, we remove all forks of any
+ * relation with an "init" fork, except for the "init" fork itself.
+ *
+ * If op includes UNLOGGED_RELATION_INIT, we copy the "init" fork to the main
+ * fork.
+ */
+void
+ResetUnloggedRelations(int op)
+{
+ char temp_path[MAXPGPATH + 10 + sizeof(TABLESPACE_VERSION_DIRECTORY)];
+ DIR *spc_dir;
+ struct dirent *spc_de;
+ MemoryContext tmpctx,
+ oldctx;
+
+ /* Log it. */
+ elog(DEBUG1, "resetting unlogged relations: cleanup %d init %d",
+ (op & UNLOGGED_RELATION_CLEANUP) != 0,
+ (op & UNLOGGED_RELATION_INIT) != 0);
+
+ /*
+ * Just to be sure we don't leak any memory, let's create a temporary
+ * memory context for this operation.
+ */
+ tmpctx = AllocSetContextCreate(CurrentMemoryContext,
+ "ResetUnloggedRelations",
+ ALLOCSET_DEFAULT_SIZES);
+ oldctx = MemoryContextSwitchTo(tmpctx);
+
+ /* Prepare to report progress resetting unlogged relations. */
+ begin_startup_progress_phase();
+
+ /*
+ * First process unlogged files in pg_default ($PGDATA/base)
+ */
+ ResetUnloggedRelationsInTablespaceDir("base", op);
+
+ /*
+ * Cycle through directories for all non-default tablespaces.
+ */
+ spc_dir = AllocateDir("pg_tblspc");
+
+ while ((spc_de = ReadDir(spc_dir, "pg_tblspc")) != NULL)
+ {
+ if (strcmp(spc_de->d_name, ".") == 0 ||
+ strcmp(spc_de->d_name, "..") == 0)
+ continue;
+
+ snprintf(temp_path, sizeof(temp_path), "pg_tblspc/%s/%s",
+ spc_de->d_name, TABLESPACE_VERSION_DIRECTORY);
+ ResetUnloggedRelationsInTablespaceDir(temp_path, op);
+ }
+
+ FreeDir(spc_dir);
+
+ /*
+ * Restore memory context.
+ */
+ MemoryContextSwitchTo(oldctx);
+ MemoryContextDelete(tmpctx);
+}
+
+/*
+ * Process one tablespace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInTablespaceDir(const char *tsdirname, int op)
+{
+ DIR *ts_dir;
+ struct dirent *de;
+ char dbspace_path[MAXPGPATH * 2];
+
+ ts_dir = AllocateDir(tsdirname);
+
+ /*
+ * If we get ENOENT on a tablespace directory, log it and return. This
+ * can happen if a previous DROP TABLESPACE crashed between removing the
+ * tablespace directory and removing the symlink in pg_tblspc. We don't
+ * really want to prevent database startup in that scenario, so let it
+ * pass instead. Any other type of error will be reported by ReadDir
+ * (causing a startup failure).
+ */
+ if (ts_dir == NULL && errno == ENOENT)
+ {
+ ereport(LOG,
+ (errcode_for_file_access(),
+ errmsg("could not open directory \"%s\": %m",
+ tsdirname)));
+ return;
+ }
+
+ while ((de = ReadDir(ts_dir, tsdirname)) != NULL)
+ {
+ /*
+ * We're only interested in the per-database directories, which have
+ * numeric names. Note that this code will also (properly) ignore "."
+ * and "..".
+ */
+ if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
+ continue;
+
+ snprintf(dbspace_path, sizeof(dbspace_path), "%s/%s",
+ tsdirname, de->d_name);
+
+ if (op & UNLOGGED_RELATION_INIT)
+ ereport_startup_progress("resetting unlogged relations (init), elapsed time: %ld.%02d s, current path: %s",
+ dbspace_path);
+ else if (op & UNLOGGED_RELATION_CLEANUP)
+ ereport_startup_progress("resetting unlogged relations (cleanup), elapsed time: %ld.%02d s, current path: %s",
+ dbspace_path);
+
+ ResetUnloggedRelationsInDbspaceDir(dbspace_path, op);
+ }
+
+ FreeDir(ts_dir);
+}
+
+/*
+ * Process one per-dbspace directory for ResetUnloggedRelations
+ */
+static void
+ResetUnloggedRelationsInDbspaceDir(const char *dbspacedirname, int op)
+{
+ DIR *dbspace_dir;
+ struct dirent *de;
+ char rm_path[MAXPGPATH * 2];
+
+ /* Caller must specify at least one operation. */
+ Assert((op & (UNLOGGED_RELATION_CLEANUP | UNLOGGED_RELATION_INIT)) != 0);
+
+ /*
+ * Cleanup is a two-pass operation. First, we go through and identify all
+ * the files with init forks. Then, we go through again and nuke
+ * everything with the same OID except the init fork.
+ */
+ if ((op & UNLOGGED_RELATION_CLEANUP) != 0)
+ {
+ HTAB *hash;
+ HASHCTL ctl;
+
+ /*
+ * It's possible that someone could create a ton of unlogged relations
+ * in the same database & tablespace, so we'd better use a hash table
+ * rather than an array or linked list to keep track of which files
+ * need to be reset. Otherwise, this cleanup operation would be
+ * O(n^2).
+ */
+ ctl.keysize = sizeof(Oid);
+ ctl.entrysize = sizeof(unlogged_relation_entry);
+ ctl.hcxt = CurrentMemoryContext;
+ hash = hash_create("unlogged relation OIDs", 32, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* Scan the directory. */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int relnumchars;
+ unlogged_relation_entry ent;
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /*
+ * Put the OID portion of the name into the hash table, if it
+ * isn't already.
+ */
+ ent.reloid = atooid(de->d_name);
+ (void) hash_search(hash, &ent, HASH_ENTER, NULL);
+ }
+
+ /* Done with the first pass. */
+ FreeDir(dbspace_dir);
+
+ /*
+ * If we didn't find any init forks, there's no point in continuing;
+ * we can bail out now.
+ */
+ if (hash_get_num_entries(hash) == 0)
+ {
+ hash_destroy(hash);
+ return;
+ }
+
+ /*
+ * Now, make a second pass and remove anything that matches.
+ */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int relnumchars;
+ unlogged_relation_entry ent;
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+ &forkNum))
+ continue;
+
+ /* We never remove the init fork. */
+ if (forkNum == INIT_FORKNUM)
+ continue;
+
+ /*
+ * See whether the OID portion of the name shows up in the hash
+ * table. If so, nuke it!
+ */
+ ent.reloid = atooid(de->d_name);
+ if (hash_search(hash, &ent, HASH_FIND, NULL))
+ {
+ snprintf(rm_path, sizeof(rm_path), "%s/%s",
+ dbspacedirname, de->d_name);
+ if (unlink(rm_path) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m",
+ rm_path)));
+ else
+ elog(DEBUG2, "unlinked file \"%s\"", rm_path);
+ }
+ }
+
+ /* Cleanup is complete. */
+ FreeDir(dbspace_dir);
+ hash_destroy(hash);
+ }
+
+ /*
+ * Initialization happens after cleanup is complete: we copy each init
+ * fork file to the corresponding main fork file. Note that if we are
+ * asked to do both cleanup and init, we may never get here: if the
+ * cleanup code determines that there are no init forks in this dbspace,
+ * it will return before we get to this point.
+ */
+ if ((op & UNLOGGED_RELATION_INIT) != 0)
+ {
+ /* Scan the directory. */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int relnumchars;
+ char relnumbuf[OIDCHARS + 1];
+ char srcpath[MAXPGPATH * 2];
+ char dstpath[MAXPGPATH];
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /* Construct source pathname. */
+ snprintf(srcpath, sizeof(srcpath), "%s/%s",
+ dbspacedirname, de->d_name);
+
+ /* Construct destination pathname. */
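+			/* For example, "12345_init.2" becomes "12345.2". */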
+ memcpy(relnumbuf, de->d_name, relnumchars);
+ relnumbuf[relnumchars] = '\0';
+ snprintf(dstpath, sizeof(dstpath), "%s/%s%s",
+ dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
+ strlen(forkNames[INIT_FORKNUM]));
+
+ /* OK, we're ready to perform the actual copy. */
+ elog(DEBUG2, "copying %s to %s", srcpath, dstpath);
+ copy_file(srcpath, dstpath);
+ }
+
+ FreeDir(dbspace_dir);
+
+ /*
+ * copy_file() above has already called pg_flush_data() on the files
+ * it created. Now we need to fsync those files, because a checkpoint
+ * won't do it for us while we're in recovery. We do this in a
+ * separate pass to allow the kernel to perform all the flushes
+ * (especially the metadata ones) at once.
+ */
+ dbspace_dir = AllocateDir(dbspacedirname);
+ while ((de = ReadDir(dbspace_dir, dbspacedirname)) != NULL)
+ {
+ ForkNumber forkNum;
+ int relnumchars;
+ char relnumbuf[OIDCHARS + 1];
+ char mainpath[MAXPGPATH];
+
+ /* Skip anything that doesn't look like a relation data file. */
+ if (!parse_filename_for_nontemp_relation(de->d_name, &relnumchars,
+ &forkNum))
+ continue;
+
+ /* Also skip it unless this is the init fork. */
+ if (forkNum != INIT_FORKNUM)
+ continue;
+
+ /* Construct main fork pathname. */
+ memcpy(relnumbuf, de->d_name, relnumchars);
+ relnumbuf[relnumchars] = '\0';
+ snprintf(mainpath, sizeof(mainpath), "%s/%s%s",
+ dbspacedirname, relnumbuf, de->d_name + relnumchars + 1 +
+ strlen(forkNames[INIT_FORKNUM]));
+
+ fsync_fname(mainpath, false);
+ }
+
+ FreeDir(dbspace_dir);
+
+ /*
+ * Lastly, fsync the database directory itself, ensuring the
+ * filesystem remembers the file creations and deletions we've done.
+ * We don't bother with this during a call that does only
+ * UNLOGGED_RELATION_CLEANUP, because if recovery crashes before we
+ * get to doing UNLOGGED_RELATION_INIT, we'll redo the cleanup step
+ * too at the next startup attempt.
+ */
+ fsync_fname(dbspacedirname, true);
+ }
+}
+
+/*
+ * Basic parsing of putative relation filenames.
+ *
+ * This function returns true if the file appears to be in the correct format
+ * for a non-temporary relation and false otherwise.
+ *
+ * NB: If this function returns true, the caller is entitled to assume that
+ * *relnumchars has been set to a value no more than OIDCHARS, and thus
+ * that a buffer of OIDCHARS+1 characters is sufficient to hold the
+ * RelFileNumber portion of the filename. This is critical to protect against
+ * a possible buffer overrun.
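+ *
+ * For example, "12345", "12345.1", "12345_fsm" and "12345_init.2" are all
+ * accepted, while "12345_" (empty fork name) and "t3_12345" (a temporary
+ * relation's file) are rejected.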
+ */
+bool
+parse_filename_for_nontemp_relation(const char *name, int *relnumchars,
+ ForkNumber *fork)
+{
+ int pos;
+
+ /* Look for a non-empty string of digits (that isn't too long). */
+ for (pos = 0; isdigit((unsigned char) name[pos]); ++pos)
+ ;
+ if (pos == 0 || pos > OIDCHARS)
+ return false;
+ *relnumchars = pos;
+
+ /* Check for a fork name. */
+ if (name[pos] != '_')
+ *fork = MAIN_FORKNUM;
+ else
+ {
+ int forkchar;
+
+ forkchar = forkname_chars(&name[pos + 1], fork);
+ if (forkchar <= 0)
+ return false;
+ pos += forkchar + 1;
+ }
+
+ /* Check for a segment number. */
+ if (name[pos] == '.')
+ {
+ int segchar;
+
+ for (segchar = 1; isdigit((unsigned char) name[pos + segchar]); ++segchar)
+ ;
+ if (segchar <= 1)
+ return false;
+ pos += segchar;
+ }
+
+ /* Now we should be at the end. */
+ if (name[pos] != '\0')
+ return false;
+ return true;
+}
diff --git a/src/backend/storage/file/sharedfileset.c b/src/backend/storage/file/sharedfileset.c
new file mode 100644
index 0000000..a13c8ed
--- /dev/null
+++ b/src/backend/storage/file/sharedfileset.c
@@ -0,0 +1,120 @@
+/*-------------------------------------------------------------------------
+ *
+ * sharedfileset.c
+ * Shared temporary file management.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/file/sharedfileset.c
+ *
+ * SharedFileSets provide a temporary namespace (think directory) so that
+ * files can be discovered by name, and shared ownership semantics so that
+ * shared files survive until the last user detaches.
+ *
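+ * A schematic usage pattern (illustrative only), assuming the SharedFileSet
+ * struct is placed in a DSM segment 'seg' shared by a leader and its
+ * workers:
+ *
+ *		leader:		SharedFileSetInit(fileset, seg);
+ *					... create files in fileset->fs (see buffile.c) ...
+ *		workers:	SharedFileSetAttach(fileset, seg);
+ *					... open those files by name ...
+ *
+ * The files are removed automatically when the last attached backend
+ * detaches from 'seg'.
+ *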
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "catalog/pg_tablespace.h"
+#include "commands/tablespace.h"
+#include "common/hashfn.h"
+#include "miscadmin.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/sharedfileset.h"
+#include "utils/builtins.h"
+
+static void SharedFileSetOnDetach(dsm_segment *segment, Datum datum);
+
+/*
+ * Initialize a space for temporary files that can be opened by other backends.
+ * Other backends must attach to it before accessing it. Associate this
+ * SharedFileSet with 'seg'. Any contained files will be deleted when the
+ * last backend detaches.
+ *
+ * Under the covers the set is one or more directories which will eventually
+ * be deleted.
+ */
+void
+SharedFileSetInit(SharedFileSet *fileset, dsm_segment *seg)
+{
+ /* Initialize the shared fileset specific members. */
+ SpinLockInit(&fileset->mutex);
+ fileset->refcnt = 1;
+
+ /* Initialize the fileset. */
+ FileSetInit(&fileset->fs);
+
+ /* Register our cleanup callback. */
+ if (seg)
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Attach to a set of directories that was created with SharedFileSetInit.
+ */
+void
+SharedFileSetAttach(SharedFileSet *fileset, dsm_segment *seg)
+{
+ bool success;
+
+ SpinLockAcquire(&fileset->mutex);
+ if (fileset->refcnt == 0)
+ success = false;
+ else
+ {
+ ++fileset->refcnt;
+ success = true;
+ }
+ SpinLockRelease(&fileset->mutex);
+
+ if (!success)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not attach to a SharedFileSet that is already destroyed")));
+
+ /* Register our cleanup callback. */
+ on_dsm_detach(seg, SharedFileSetOnDetach, PointerGetDatum(fileset));
+}
+
+/*
+ * Delete all files in the set.
+ */
+void
+SharedFileSetDeleteAll(SharedFileSet *fileset)
+{
+ FileSetDeleteAll(&fileset->fs);
+}
+
+/*
+ * Callback function that will be invoked when this backend detaches from a
+ * DSM segment holding a SharedFileSet that it has created or attached to. If
+ * we are the last to detach, then try to remove the directories and
+ * everything in them. We can't raise an error on failures, because this runs
+ * in error cleanup paths.
+ */
+static void
+SharedFileSetOnDetach(dsm_segment *segment, Datum datum)
+{
+ bool unlink_all = false;
+ SharedFileSet *fileset = (SharedFileSet *) DatumGetPointer(datum);
+
+ SpinLockAcquire(&fileset->mutex);
+ Assert(fileset->refcnt > 0);
+ if (--fileset->refcnt == 0)
+ unlink_all = true;
+ SpinLockRelease(&fileset->mutex);
+
+ /*
+ * If we are the last to detach, we delete the directory in all
+ * tablespaces. Note that we are still actually attached for the rest of
+ * this function so we can safely access its data.
+ */
+ if (unlink_all)
+ FileSetDeleteAll(&fileset->fs);
+}
diff --git a/src/backend/storage/freespace/Makefile b/src/backend/storage/freespace/Makefile
new file mode 100644
index 0000000..ac0fa8b
--- /dev/null
+++ b/src/backend/storage/freespace/Makefile
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/freespace
+#
+# IDENTIFICATION
+# src/backend/storage/freespace/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/freespace
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ freespace.o \
+ fsmpage.o \
+ indexfsm.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
new file mode 100644
index 0000000..e7ff23b
--- /dev/null
+++ b/src/backend/storage/freespace/README
@@ -0,0 +1,196 @@
+src/backend/storage/freespace/README
+
+Free Space Map
+--------------
+
+The purpose of the free space map is to quickly locate a page with enough
+free space to hold a tuple to be stored; or to determine that no such page
+exists and the relation must be extended by one page. As of PostgreSQL 8.4
+each relation has its own, extensible free space map stored in a separate
+"fork" of its relation. This eliminates the disadvantages of the former
+fixed-size FSM.
+
+It is important to keep the map small so that it can be searched rapidly.
+Therefore, we don't attempt to record the exact free space on a page.
+We allocate one map byte to each page, allowing us to record free space
+at a granularity of 1/256th of a page. Another way to say it is that
+the stored value is the free space divided by BLCKSZ/256 (rounding down).
+We assume that the free space must always be less than BLCKSZ, since
+all pages have some overhead; so the maximum map value is 255.
+
+To assist in fast searching, the map isn't simply an array of per-page
+entries, but has a tree structure above those entries. There is a tree
+structure of pages, and a tree structure within each page, as described
+below.
+
+FSM page structure
+------------------
+
+Within each FSM page, we use a binary tree structure where leaf nodes store
+the amount of free space on heap pages (or lower level FSM pages, see
+"Higher-level structure" below), with one leaf node per heap page. A non-leaf
+node stores the max amount of free space on any of its children.
+
+For example:
+
+ 4
+ 4 2
+3 4 0 2 <- This level represents heap pages
+
+We need two basic operations: search and update.
+
+To search for a page with X amount of free space, traverse down the tree
+along a path where n >= X, until you hit the bottom. If both children of a
+node satisfy the condition, you can pick either one arbitrarily.
+
+To update the amount of free space on a page to X, first update the leaf node
+corresponding to the heap page, then "bubble up" the change to upper nodes,
+by walking up to each parent and recomputing its value as the max of its
+two children. Repeat until reaching the root or a parent whose value
+doesn't change.
+
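+For example, applying the update rule to the example tree above: setting the
+third leaf (currently 0) to 5 updates that leaf, then its parent becomes
+max(5, 2) = 5, and finally the root becomes max(4, 5) = 5:
+
+    5
+ 4     5
+3 4 5 2
+
+A search for a page with at least 5 units of free space would now succeed,
+descending from the root along the path of 5s to that leaf.
+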
+This data structure has a couple of nice properties:
+- to discover that there is no page with X bytes of free space, you only
+ need to look at the root node
+- by varying which child to traverse to in the search algorithm, when you have
+ a choice, we can implement various strategies, like preferring pages closer
+ to a given page, or spreading the load across the table.
+
+Higher-level routines that use FSM pages access them through the fsm_set_avail()
+and fsm_search_avail() functions. The interface to those functions hides the
+page's internal tree structure, treating the FSM page as a black box that has
+a certain number of "slots" for storing free space information. (However,
+the higher routines have to be aware of the tree structure of the whole map.)
+
+The binary tree is stored on each FSM page as an array. Because the page
+header takes some space on a page, the binary tree isn't perfect. That is,
+a few right-most leaf nodes are missing, and there are some useless non-leaf
+nodes at the right. So the tree looks something like this:
+
+ 0
+ 1 2
+ 3 4 5 6
+7 8 9 A B
+
+where the numbers denote each node's position in the array. Note that the
+tree is guaranteed complete above the leaf level; only some leaf nodes are
+missing. This is reflected in the number of usable "slots" per page not
+being an exact power of 2.
+
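+In this array representation the children of the node at position i are found
+at positions 2*i + 1 and 2*i + 2, and its parent at (i - 1) / 2 (integer
+division); these are the leftchild/rightchild/parentof macros in fsmpage.c.
+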
+A FSM page also has a next slot pointer, fp_next_slot, that determines where
+to start the next search for free space within that page. The reason for that
+is to spread out the pages that are returned by FSM searches. When several
+backends are concurrently inserting into a relation, contention can be avoided
+by having them insert into different pages. But it is also desirable to fill
+up pages in sequential order, to get the benefit of OS prefetching and batched
+writes. The FSM is responsible for making that happen, and the next slot
+pointer helps provide the desired behavior.
+
+Higher-level structure
+----------------------
+
+To scale up the data structure described above beyond a single page, we
+maintain a similar tree-structure across pages. Leaf nodes in higher level
+pages correspond to lower level FSM pages. The root node within each page
+has the same value as the corresponding leaf node on its parent page.
+
+The root page is always stored at physical block 0.
+
+For example, assuming each FSM page can hold information about 4 pages (in
+reality, it holds (BLCKSZ - headers) / 2, or ~4000 with default BLCKSZ),
+we get a disk layout like this:
+
+ 0 <-- page 0 at level 2 (root page)
+ 0 <-- page 0 at level 1
+ 0 <-- page 0 at level 0
+ 1 <-- page 1 at level 0
+ 2 <-- ...
+ 3
+ 1 <-- page 1 at level 1
+ 4
+ 5
+ 6
+ 7
+ 2
+ 8
+ 9
+ 10
+ 11
+ 3
+ 12
+ 13
+ 14
+ 15
+
+where the numbers are page numbers *at that level*, starting from 0.
+
+To find the physical block # corresponding to leaf page n, we need to
+count the number of leaf and upper-level pages preceding page n.
+This turns out to be
+
+y = n + (n / F + 1) + (n / F^2 + 1) + ... + 1
+
+where F is the fanout (4 in the above example). The first term n is the number
+of preceding leaf pages, the second term is the number of pages at level 1,
+and so forth.
+
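+For example, with F = 4 as above, leaf page n = 5 is preceded by 5 leaf
+pages, 5/4 + 1 = 2 level-1 pages and 1 root page, so it is stored at
+physical block y = 5 + 2 + 1 = 8, matching the layout above.
+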
+To keep things simple, the tree is always constant height. To cover the
+maximum relation size of 2^32-1 blocks, three levels is enough with the default
+BLCKSZ (4000^3 > 2^32).
+
+Addressing
+----------
+
+The higher-level routines operate on "logical" addresses, consisting of
+- level,
+- logical page number, and
+- slot (if applicable)
+
+Bottom level FSM pages have level of 0, the level above that 1, and root 2.
+As in the diagram above, logical page number is the page number at that level,
+starting from 0.
+
+Locking
+-------
+
+When traversing down to search for free space, only one page is locked at a
+time: the parent page is released before locking the child. If the child page
+is concurrently modified, and there no longer is free space on the child page
+when you land on it, you need to start from scratch (after correcting the
+parent page, so that you don't get into an infinite loop).
+
+We use shared buffer locks when searching, but an exclusive buffer lock when
+updating a page. However, the next slot search pointer is updated during
+searches even though we have only a shared lock. fp_next_slot is just a hint
+and we can easily reset it if it gets corrupted; so it seems better to accept
+some risk of that type than to pay the overhead of exclusive locking.
+
+Recovery
+--------
+
+The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
+self-correcting measures to repair possible corruption. As a result when
+we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
+rather than MarkBufferDirty().
+
+First of all, whenever a value is set on an FSM page, the root node of the
+page is compared against the new value after bubbling up the change is
+finished. It should be greater than or equal to the value just set, or we
+have a corrupted page, with a parent somewhere with too small a value.
+Secondly, corrupted pages can be detected as we search, traversing down the
+tree. That check will notice if a parent node is set to too high a value.
+In both cases, the upper nodes on the page are immediately rebuilt, fixing
+the corruption so far as that page is concerned.
+
+VACUUM updates all the bottom-level FSM pages with the correct amount of free
+space on corresponding heap pages, as it proceeds through the heap. This
+goes through fsm_set_avail(), so that the upper nodes on those pages are
+immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
+to propagate the new free-space info into the upper pages of the FSM tree.
+
+TODO
+----
+
+- fastroot to avoid traversing upper nodes with just 1 child
+- use a different system for tables that fit into one FSM page, with a
+ mechanism to switch to the real thing as it grows.
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
new file mode 100644
index 0000000..fb9440f
--- /dev/null
+++ b/src/backend/storage/freespace/freespace.c
@@ -0,0 +1,865 @@
+/*-------------------------------------------------------------------------
+ *
+ * freespace.c
+ * POSTGRES free space map for quickly finding free space in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/freespace.c
+ *
+ *
+ * NOTES:
+ *
+ * Free Space Map keeps track of the amount of free space on pages, and
+ * allows quickly searching for a page with enough free space. The FSM is
+ * stored in a dedicated relation fork of all heap relations, and those
+ * index access methods that need it (see also indexfsm.c). See README for
+ * more information.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/xloginsert.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "storage/freespace.h"
+#include "storage/fsm_internals.h"
+#include "storage/lmgr.h"
+#include "storage/smgr.h"
+
+
+/*
+ * We use just one byte to store the amount of free space on a page, so we
+ * divide the amount of free space a page can have into 256 different
+ * categories. The highest category, 255, represents a page with at least
+ * MaxFSMRequestSize bytes of free space, and the second highest category
+ * represents the range from 254 * FSM_CAT_STEP, inclusive, to
+ * MaxFSMRequestSize, exclusive.
+ *
+ * MaxFSMRequestSize depends on the architecture and BLCKSZ, but assuming
+ * default 8k BLCKSZ, and that MaxFSMRequestSize is 8164 bytes, the
+ * categories look like this:
+ *
+ *
+ * Range Category
+ * 0 - 31 0
+ * 32 - 63 1
+ * ... ... ...
+ * 8096 - 8127 253
+ * 8128 - 8163 254
+ * 8164 - 8192 255
+ *
+ * The reason that MaxFSMRequestSize is special is that if MaxFSMRequestSize
+ * isn't equal to a range boundary, a page with exactly MaxFSMRequestSize
+ * bytes of free space wouldn't satisfy a request for MaxFSMRequestSize
+ * bytes. If there isn't more than MaxFSMRequestSize bytes of free space on a
+ * completely empty page, that would mean that we could never satisfy a
+ * request of exactly MaxFSMRequestSize bytes.
+ */
+#define FSM_CATEGORIES 256
+#define FSM_CAT_STEP (BLCKSZ / FSM_CATEGORIES)
+#define MaxFSMRequestSize MaxHeapTupleSize
+
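+/*
+ * For example, with the default 8 kB BLCKSZ, FSM_CAT_STEP is 8192 / 256 =
+ * 32, so a page with 100 bytes of free space is recorded as category
+ * 100 / 32 = 3, while a request for 100 bytes is rounded up to category 4
+ * (see fsm_space_avail_to_cat and fsm_space_needed_to_cat below).
+ */
+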
+/*
+ * Depth of the on-disk tree. We need to be able to address 2^32-1 blocks,
+ * and 1626 is the smallest number that satisfies X^3 >= 2^32-1. Likewise,
+ * 256 is the smallest number that satisfies X^4 >= 2^32-1. In practice,
+ * this means that 4096 bytes is the smallest BLCKSZ for which we can get
+ * away with a 3-level tree, and 512 is the smallest BLCKSZ we support.
+ */
+#define FSM_TREE_DEPTH ((SlotsPerFSMPage >= 1626) ? 3 : 4)
+
+#define FSM_ROOT_LEVEL (FSM_TREE_DEPTH - 1)
+#define FSM_BOTTOM_LEVEL 0
+
+/*
+ * The internal FSM routines work on a logical addressing scheme. Each
+ * level of the tree can be thought of as a separately addressable file.
+ */
+typedef struct
+{
+ int level; /* level */
+ int logpageno; /* page number within the level */
+} FSMAddress;
+
+/* Address of the root page. */
+static const FSMAddress FSM_ROOT_ADDRESS = {FSM_ROOT_LEVEL, 0};
+
+/* functions to navigate the tree */
+static FSMAddress fsm_get_child(FSMAddress parent, uint16 slot);
+static FSMAddress fsm_get_parent(FSMAddress child, uint16 *slot);
+static FSMAddress fsm_get_location(BlockNumber heapblk, uint16 *slot);
+static BlockNumber fsm_get_heap_blk(FSMAddress addr, uint16 slot);
+static BlockNumber fsm_logical_to_physical(FSMAddress addr);
+
+static Buffer fsm_readbuf(Relation rel, FSMAddress addr, bool extend);
+static Buffer fsm_extend(Relation rel, BlockNumber fsm_nblocks);
+
+/* functions to convert amount of free space to a FSM category */
+static uint8 fsm_space_avail_to_cat(Size avail);
+static uint8 fsm_space_needed_to_cat(Size needed);
+static Size fsm_space_cat_to_avail(uint8 cat);
+
+/* workhorse functions for various operations */
+static int fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+ uint8 newValue, uint8 minValue);
+static BlockNumber fsm_search(Relation rel, uint8 min_cat);
+static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
+ BlockNumber start, BlockNumber end,
+ bool *eof_p);
+
+
+/******** Public API ********/
+
+/*
+ * GetPageWithFreeSpace - try to find a page in the given relation with
+ * at least the specified amount of free space.
+ *
+ * If successful, return the block number; if not, return InvalidBlockNumber.
+ *
+ * The caller must be prepared for the possibility that the returned page
+ * will turn out to have too little space available by the time the caller
+ * gets a lock on it. In that case, the caller should report the actual
+ * amount of free space available on that page and then try again (see
+ * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
+ * extend the relation.
+ */
+BlockNumber
+GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
+{
+ uint8 min_cat = fsm_space_needed_to_cat(spaceNeeded);
+
+ return fsm_search(rel, min_cat);
+}
+
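+/*
+ * Illustrative sketch of the caller protocol described above (the real,
+ * considerably more careful version lives in the heap insertion code, hio.c):
+ *
+ *		blkno = GetPageWithFreeSpace(rel, len);
+ *		while (blkno != InvalidBlockNumber)
+ *		{
+ *			... lock the page and compute its actual free space ...
+ *			if (actual_free >= len)
+ *				break;
+ *			blkno = RecordAndGetPageWithFreeSpace(rel, blkno,
+ *												  actual_free, len);
+ *		}
+ *		if (blkno == InvalidBlockNumber)
+ *			... no page has enough space; extend the relation ...
+ */
+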
+/*
+ * RecordAndGetPageWithFreeSpace - update info about a page and try again.
+ *
+ * We provide this combo form to save some locking overhead, compared to
+ * separate RecordPageWithFreeSpace + GetPageWithFreeSpace calls. There's
+ * also some effort to return a page close to the old page; if there's a
+ * page with enough free space on the same FSM page where the old one page
+ * is located, it is preferred.
+ */
+BlockNumber
+RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
+ Size oldSpaceAvail, Size spaceNeeded)
+{
+ int old_cat = fsm_space_avail_to_cat(oldSpaceAvail);
+ int search_cat = fsm_space_needed_to_cat(spaceNeeded);
+ FSMAddress addr;
+ uint16 slot;
+ int search_slot;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(oldPage, &slot);
+
+ search_slot = fsm_set_and_search(rel, addr, slot, old_cat, search_cat);
+
+ /*
+ * If fsm_set_and_search found a suitable new block, return that.
+ * Otherwise, search as usual.
+ */
+ if (search_slot != -1)
+ return fsm_get_heap_blk(addr, search_slot);
+ else
+ return fsm_search(rel, search_cat);
+}
+
+/*
+ * RecordPageWithFreeSpace - update info about a page.
+ *
+ * Note that if the new spaceAvail value is higher than the old value stored
+ * in the FSM, the space might not become visible to searchers until the next
+ * FreeSpaceMapVacuum call, which updates the upper level pages.
+ */
+void
+RecordPageWithFreeSpace(Relation rel, BlockNumber heapBlk, Size spaceAvail)
+{
+ int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ FSMAddress addr;
+ uint16 slot;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+
+ fsm_set_and_search(rel, addr, slot, new_cat, 0);
+}
+
+/*
+ * XLogRecordPageWithFreeSpace - like RecordPageWithFreeSpace, for use in
+ * WAL replay
+ */
+void
+XLogRecordPageWithFreeSpace(RelFileLocator rlocator, BlockNumber heapBlk,
+ Size spaceAvail)
+{
+ int new_cat = fsm_space_avail_to_cat(spaceAvail);
+ FSMAddress addr;
+ uint16 slot;
+ BlockNumber blkno;
+ Buffer buf;
+ Page page;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+ blkno = fsm_logical_to_physical(addr);
+
+ /* If the page doesn't exist already, extend */
+ buf = XLogReadBufferExtended(rlocator, FSM_FORKNUM, blkno,
+ RBM_ZERO_ON_ERROR, InvalidBuffer);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ page = BufferGetPage(buf);
+ if (PageIsNew(page))
+ PageInit(page, BLCKSZ, 0);
+
+ if (fsm_set_avail(page, slot, new_cat))
+ MarkBufferDirtyHint(buf, false);
+ UnlockReleaseBuffer(buf);
+}
+
+/*
+ * GetRecordedFreeSpace - return the amount of free space on a particular page,
+ * according to the FSM.
+ */
+Size
+GetRecordedFreeSpace(Relation rel, BlockNumber heapBlk)
+{
+ FSMAddress addr;
+ uint16 slot;
+ Buffer buf;
+ uint8 cat;
+
+ /* Get the location of the FSM byte representing the heap block */
+ addr = fsm_get_location(heapBlk, &slot);
+
+ buf = fsm_readbuf(rel, addr, false);
+ if (!BufferIsValid(buf))
+ return 0;
+ cat = fsm_get_avail(BufferGetPage(buf), slot);
+ ReleaseBuffer(buf);
+
+ return fsm_space_cat_to_avail(cat);
+}
+
+/*
+ * FreeSpaceMapPrepareTruncateRel - prepare for truncation of a relation.
+ *
+ * nblocks is the new size of the heap.
+ *
+ * Return the number of blocks of new FSM.
+ * If it's InvalidBlockNumber, there is nothing to truncate;
+ * otherwise the caller is responsible for calling smgrtruncate()
+ * to truncate the FSM pages, and FreeSpaceMapVacuumRange()
+ * to update upper-level pages in the FSM.
+ */
+BlockNumber
+FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
+{
+ BlockNumber new_nfsmblocks;
+ FSMAddress first_removed_address;
+ uint16 first_removed_slot;
+ Buffer buf;
+
+ /*
+ * If no FSM has been created yet for this relation, there's nothing to
+ * truncate.
+ */
+ if (!smgrexists(RelationGetSmgr(rel), FSM_FORKNUM))
+ return InvalidBlockNumber;
+
+ /* Get the location in the FSM of the first removed heap block */
+ first_removed_address = fsm_get_location(nblocks, &first_removed_slot);
+
+ /*
+ * Zero out the tail of the last remaining FSM page. If the slot
+ * representing the first removed heap block is at a page boundary, as the
+ * first slot on the FSM page that first_removed_address points to, we can
+ * just truncate that page altogether.
+ */
+ if (first_removed_slot > 0)
+ {
+ buf = fsm_readbuf(rel, first_removed_address, false);
+ if (!BufferIsValid(buf))
+ return InvalidBlockNumber; /* nothing to do; the FSM was already
+ * smaller */
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ /* NO EREPORT(ERROR) from here till changes are logged */
+ START_CRIT_SECTION();
+
+ fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
+
+ /*
+		 * Truncation of a relation is WAL-logged at a higher level, and we
+		 * will be called during WAL replay. But if checksums are enabled, we need
+ * to still write a WAL record to protect against a torn page, if the
+ * page is flushed to disk before the truncation WAL record. We cannot
+ * use MarkBufferDirtyHint here, because that will not dirty the page
+ * during recovery.
+ */
+ MarkBufferDirty(buf);
+ if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
+ log_newpage_buffer(buf, false);
+
+ END_CRIT_SECTION();
+
+ UnlockReleaseBuffer(buf);
+
+ new_nfsmblocks = fsm_logical_to_physical(first_removed_address) + 1;
+ }
+ else
+ {
+ new_nfsmblocks = fsm_logical_to_physical(first_removed_address);
+ if (smgrnblocks(RelationGetSmgr(rel), FSM_FORKNUM) <= new_nfsmblocks)
+ return InvalidBlockNumber; /* nothing to do; the FSM was already
+ * smaller */
+ }
+
+ return new_nfsmblocks;
+}
+
+/*
+ * FreeSpaceMapVacuum - update upper-level pages in the rel's FSM
+ *
+ * We assume that the bottom-level pages have already been updated with
+ * new free-space information.
+ */
+void
+FreeSpaceMapVacuum(Relation rel)
+{
+ bool dummy;
+
+ /* Recursively scan the tree, starting at the root */
+ (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS,
+ (BlockNumber) 0, InvalidBlockNumber,
+ &dummy);
+}
+
+/*
+ * FreeSpaceMapVacuumRange - update upper-level pages in the rel's FSM
+ *
+ * As above, but assume that only heap pages between start and end-1 inclusive
+ * have new free-space information, so update only the upper-level slots
+ * covering that block range. end == InvalidBlockNumber is equivalent to
+ * "all the rest of the relation".
+ */
+void
+FreeSpaceMapVacuumRange(Relation rel, BlockNumber start, BlockNumber end)
+{
+ bool dummy;
+
+ /* Recursively scan the tree, starting at the root */
+ if (end > start)
+ (void) fsm_vacuum_page(rel, FSM_ROOT_ADDRESS, start, end, &dummy);
+}
+
+/******** Internal routines ********/
+
+/*
+ * Return the category corresponding to x bytes of free space
+ */
+static uint8
+fsm_space_avail_to_cat(Size avail)
+{
+ int cat;
+
+ Assert(avail < BLCKSZ);
+
+ if (avail >= MaxFSMRequestSize)
+ return 255;
+
+ cat = avail / FSM_CAT_STEP;
+
+ /*
+ * The highest category, 255, is reserved for MaxFSMRequestSize bytes or
+ * more.
+ */
+ if (cat > 254)
+ cat = 254;
+
+ return (uint8) cat;
+}
+
+/*
+ * Return the lower bound of the range of free space represented by given
+ * category.
+ */
+static Size
+fsm_space_cat_to_avail(uint8 cat)
+{
+ /* The highest category represents exactly MaxFSMRequestSize bytes. */
+ if (cat == 255)
+ return MaxFSMRequestSize;
+ else
+ return cat * FSM_CAT_STEP;
+}
+
+/*
+ * Which category does a page need to have, to accommodate x bytes of data?
+ * While fsm_space_avail_to_cat() rounds down, this needs to round up.
+ */
+static uint8
+fsm_space_needed_to_cat(Size needed)
+{
+ int cat;
+
+ /* Can't ask for more space than the highest category represents */
+ if (needed > MaxFSMRequestSize)
+ elog(ERROR, "invalid FSM request size %zu", needed);
+
+ if (needed == 0)
+ return 1;
+
+ cat = (needed + FSM_CAT_STEP - 1) / FSM_CAT_STEP;
+
+ if (cat > 255)
+ cat = 255;
+
+ return (uint8) cat;
+}
+
+/*
+ * Returns the physical block number of a FSM page
+ */
+static BlockNumber
+fsm_logical_to_physical(FSMAddress addr)
+{
+ BlockNumber pages;
+ int leafno;
+ int l;
+
+ /*
+ * Calculate the logical page number of the first leaf page below the
+ * given page.
+ */
+ leafno = addr.logpageno;
+ for (l = 0; l < addr.level; l++)
+ leafno *= SlotsPerFSMPage;
+
+ /* Count upper level nodes required to address the leaf page */
+ pages = 0;
+ for (l = 0; l < FSM_TREE_DEPTH; l++)
+ {
+ pages += leafno + 1;
+ leafno /= SlotsPerFSMPage;
+ }
+
+ /*
+ * If the page we were asked for wasn't at the bottom level, subtract the
+ * additional lower level pages we counted above.
+ */
+ pages -= addr.level;
+
+ /* Turn the page count into 0-based block number */
+ return pages - 1;
+}
+
+/*
+ * Return the FSM location corresponding to given heap block.
+ */
+static FSMAddress
+fsm_get_location(BlockNumber heapblk, uint16 *slot)
+{
+ FSMAddress addr;
+
+ addr.level = FSM_BOTTOM_LEVEL;
+ addr.logpageno = heapblk / SlotsPerFSMPage;
+ *slot = heapblk % SlotsPerFSMPage;
+
+ return addr;
+}
+
+/*
+ * Return the heap block number corresponding to given location in the FSM.
+ */
+static BlockNumber
+fsm_get_heap_blk(FSMAddress addr, uint16 slot)
+{
+ Assert(addr.level == FSM_BOTTOM_LEVEL);
+ return ((unsigned int) addr.logpageno) * SlotsPerFSMPage + slot;
+}
+
+/*
+ * Given a logical address of a child page, get the logical page number of
+ * the parent, and the slot within the parent corresponding to the child.
+ */
+static FSMAddress
+fsm_get_parent(FSMAddress child, uint16 *slot)
+{
+ FSMAddress parent;
+
+ Assert(child.level < FSM_ROOT_LEVEL);
+
+ parent.level = child.level + 1;
+ parent.logpageno = child.logpageno / SlotsPerFSMPage;
+ *slot = child.logpageno % SlotsPerFSMPage;
+
+ return parent;
+}
+
+/*
+ * Given a logical address of a parent page and a slot number, get the
+ * logical address of the corresponding child page.
+ */
+static FSMAddress
+fsm_get_child(FSMAddress parent, uint16 slot)
+{
+ FSMAddress child;
+
+ Assert(parent.level > FSM_BOTTOM_LEVEL);
+
+ child.level = parent.level - 1;
+ child.logpageno = parent.logpageno * SlotsPerFSMPage + slot;
+
+ return child;
+}
+
+/*
+ * Read a FSM page.
+ *
+ * If the page doesn't exist, InvalidBuffer is returned, unless 'extend' is
+ * true, in which case the FSM file is extended.
+ */
+static Buffer
+fsm_readbuf(Relation rel, FSMAddress addr, bool extend)
+{
+ BlockNumber blkno = fsm_logical_to_physical(addr);
+ Buffer buf;
+ SMgrRelation reln;
+
+ /*
+ * Caution: re-using this smgr pointer could fail if the relcache entry
+ * gets closed. It's safe as long as we only do smgr-level operations
+ * between here and the last use of the pointer.
+ */
+ reln = RelationGetSmgr(rel);
+
+ /*
+ * If we haven't cached the size of the FSM yet, check it first. Also
+ * recheck if the requested block seems to be past end, since our cached
+ * value might be stale. (We send smgr inval messages on truncation, but
+ * not on extension.)
+ */
+ if (reln->smgr_cached_nblocks[FSM_FORKNUM] == InvalidBlockNumber ||
+ blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
+ {
+ /* Invalidate the cache so smgrnblocks asks the kernel. */
+ reln->smgr_cached_nblocks[FSM_FORKNUM] = InvalidBlockNumber;
+ if (smgrexists(reln, FSM_FORKNUM))
+ smgrnblocks(reln, FSM_FORKNUM);
+ else
+ reln->smgr_cached_nblocks[FSM_FORKNUM] = 0;
+ }
+
+ /*
+ * For reading we use ZERO_ON_ERROR mode, and initialize the page if
+ * necessary. The FSM information is not accurate anyway, so it's better
+ * to clear corrupt pages than error out. Since the FSM changes are not
+ * WAL-logged, the so-called torn page problem on crash can lead to pages
+ * with corrupt headers, for example.
+ *
+ * We use the same path below to initialize pages when extending the
+	 * relation, as a concurrent extension can end up with fsm_extend()
+ * returning an already-initialized page.
+ */
+ if (blkno >= reln->smgr_cached_nblocks[FSM_FORKNUM])
+ {
+ if (extend)
+ buf = fsm_extend(rel, blkno + 1);
+ else
+ return InvalidBuffer;
+ }
+ else
+ buf = ReadBufferExtended(rel, FSM_FORKNUM, blkno, RBM_ZERO_ON_ERROR, NULL);
+
+ /*
+ * Initializing the page when needed is trickier than it looks, because of
+ * the possibility of multiple backends doing this concurrently, and our
+ * desire to not uselessly take the buffer lock in the normal path where
+ * the page is OK. We must take the lock to initialize the page, so
+ * recheck page newness after we have the lock, in case someone else
+ * already did it. Also, because we initially check PageIsNew with no
+ * lock, it's possible to fall through and return the buffer while someone
+ * else is still initializing the page (i.e., we might see pd_upper as set
+ * but other page header fields are still zeroes). This is harmless for
+ * callers that will take a buffer lock themselves, but some callers
+ * inspect the page without any lock at all. The latter is OK only so
+ * long as it doesn't depend on the page header having correct contents.
+ * Current usage is safe because PageGetContents() does not require that.
+ */
+ if (PageIsNew(BufferGetPage(buf)))
+ {
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ if (PageIsNew(BufferGetPage(buf)))
+ PageInit(BufferGetPage(buf), BLCKSZ, 0);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ return buf;
+}
+
+/*
+ * Ensure that the FSM fork is at least fsm_nblocks long, extending
+ * it if necessary with empty pages. And by empty, I mean pages filled
+ * with zeros, meaning there's no free space.
+ */
+static Buffer
+fsm_extend(Relation rel, BlockNumber fsm_nblocks)
+{
+ return ExtendBufferedRelTo(BMR_REL(rel), FSM_FORKNUM, NULL,
+ EB_CREATE_FORK_IF_NEEDED |
+ EB_CLEAR_SIZE_CACHE,
+ fsm_nblocks,
+ RBM_ZERO_ON_ERROR);
+}
+
+/*
+ * Set value in given FSM page and slot.
+ *
+ * If minValue > 0, the updated page is also searched for a page with at
+ * least minValue of free space. If one is found, its slot number is
+ * returned, -1 otherwise.
+ */
+static int
+fsm_set_and_search(Relation rel, FSMAddress addr, uint16 slot,
+ uint8 newValue, uint8 minValue)
+{
+ Buffer buf;
+ Page page;
+ int newslot = -1;
+
+ buf = fsm_readbuf(rel, addr, true);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+
+ page = BufferGetPage(buf);
+
+ if (fsm_set_avail(page, slot, newValue))
+ MarkBufferDirtyHint(buf, false);
+
+ if (minValue != 0)
+ {
+ /* Search while we still hold the lock */
+ newslot = fsm_search_avail(buf, minValue,
+ addr.level == FSM_BOTTOM_LEVEL,
+ true);
+ }
+
+ UnlockReleaseBuffer(buf);
+
+ return newslot;
+}
+
+/*
+ * Search the tree for a heap page with at least min_cat of free space
+ */
+static BlockNumber
+fsm_search(Relation rel, uint8 min_cat)
+{
+ int restarts = 0;
+ FSMAddress addr = FSM_ROOT_ADDRESS;
+
+ for (;;)
+ {
+ int slot;
+ Buffer buf;
+ uint8 max_avail = 0;
+
+ /* Read the FSM page. */
+ buf = fsm_readbuf(rel, addr, false);
+
+ /* Search within the page */
+ if (BufferIsValid(buf))
+ {
+ LockBuffer(buf, BUFFER_LOCK_SHARE);
+ slot = fsm_search_avail(buf, min_cat,
+ (addr.level == FSM_BOTTOM_LEVEL),
+ false);
+ if (slot == -1)
+ max_avail = fsm_get_max_avail(BufferGetPage(buf));
+ UnlockReleaseBuffer(buf);
+ }
+ else
+ slot = -1;
+
+ if (slot != -1)
+ {
+ /*
+ * Descend the tree, or return the found block if we're at the
+ * bottom.
+ */
+ if (addr.level == FSM_BOTTOM_LEVEL)
+ return fsm_get_heap_blk(addr, slot);
+
+ addr = fsm_get_child(addr, slot);
+ }
+ else if (addr.level == FSM_ROOT_LEVEL)
+ {
+ /*
+ * At the root, failure means there's no page with enough free
+ * space in the FSM. Give up.
+ */
+ return InvalidBlockNumber;
+ }
+ else
+ {
+ uint16 parentslot;
+ FSMAddress parent;
+
+ /*
+ * At lower level, failure can happen if the value in the upper-
+ * level node didn't reflect the value on the lower page. Update
+ * the upper node, to avoid falling into the same trap again, and
+ * start over.
+ *
+ * There's a race condition here, if another backend updates this
+ * page right after we release it, and gets the lock on the parent
+ * page before us. We'll then update the parent page with the now
+ * stale information we had. It's OK, because it should happen
+ * rarely, and will be fixed by the next vacuum.
+ */
+ parent = fsm_get_parent(addr, &parentslot);
+ fsm_set_and_search(rel, parent, parentslot, max_avail, 0);
+
+ /*
+ * If the upper pages are badly out of date, we might need to loop
+ * quite a few times, updating them as we go. Any inconsistencies
+ * should eventually be corrected and the loop should end. Looping
+ * indefinitely is nevertheless scary, so provide an emergency
+ * valve.
+ */
+ if (restarts++ > 10000)
+ return InvalidBlockNumber;
+
+ /* Start search all over from the root */
+ addr = FSM_ROOT_ADDRESS;
+ }
+ }
+}
+
+
+/*
+ * Recursive guts of FreeSpaceMapVacuum
+ *
+ * Examine the FSM page indicated by addr, as well as its children, updating
+ * upper-level nodes that cover the heap block range from start to end-1.
+ * (It's okay if end is beyond the actual end of the map.)
+ * Return the maximum freespace value on this page.
+ *
+ * If addr is past the end of the FSM, set *eof_p to true and return 0.
+ *
+ * This traverses the tree in depth-first order. The tree is stored
+ * physically in depth-first order, so this should be pretty I/O efficient.
+ */
+static uint8
+fsm_vacuum_page(Relation rel, FSMAddress addr,
+ BlockNumber start, BlockNumber end,
+ bool *eof_p)
+{
+ Buffer buf;
+ Page page;
+ uint8 max_avail;
+
+ /* Read the page if it exists, or return EOF */
+ buf = fsm_readbuf(rel, addr, false);
+ if (!BufferIsValid(buf))
+ {
+ *eof_p = true;
+ return 0;
+ }
+ else
+ *eof_p = false;
+
+ page = BufferGetPage(buf);
+
+ /*
+ * If we're above the bottom level, recurse into children, and fix the
+ * information stored about them at this level.
+ */
+ if (addr.level > FSM_BOTTOM_LEVEL)
+ {
+ FSMAddress fsm_start,
+ fsm_end;
+ uint16 fsm_start_slot,
+ fsm_end_slot;
+ int slot,
+ start_slot,
+ end_slot;
+ bool eof = false;
+
+ /*
+ * Compute the range of slots we need to update on this page, given
+ * the requested range of heap blocks to consider. The first slot to
+ * update is the one covering the "start" block, and the last slot is
+ * the one covering "end - 1". (Some of this work will be duplicated
+ * in each recursive call, but it's cheap enough to not worry about.)
+ */
+ fsm_start = fsm_get_location(start, &fsm_start_slot);
+ fsm_end = fsm_get_location(end - 1, &fsm_end_slot);
+
+ while (fsm_start.level < addr.level)
+ {
+ fsm_start = fsm_get_parent(fsm_start, &fsm_start_slot);
+ fsm_end = fsm_get_parent(fsm_end, &fsm_end_slot);
+ }
+ Assert(fsm_start.level == addr.level);
+
+ if (fsm_start.logpageno == addr.logpageno)
+ start_slot = fsm_start_slot;
+ else if (fsm_start.logpageno > addr.logpageno)
+ start_slot = SlotsPerFSMPage; /* shouldn't get here... */
+ else
+ start_slot = 0;
+
+ if (fsm_end.logpageno == addr.logpageno)
+ end_slot = fsm_end_slot;
+ else if (fsm_end.logpageno > addr.logpageno)
+ end_slot = SlotsPerFSMPage - 1;
+ else
+ end_slot = -1; /* shouldn't get here... */
+
+ for (slot = start_slot; slot <= end_slot; slot++)
+ {
+ int child_avail;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* After we hit end-of-file, just clear the rest of the slots */
+ if (!eof)
+ child_avail = fsm_vacuum_page(rel, fsm_get_child(addr, slot),
+ start, end,
+ &eof);
+ else
+ child_avail = 0;
+
+ /* Update information about the child */
+ if (fsm_get_avail(page, slot) != child_avail)
+ {
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ fsm_set_avail(page, slot, child_avail);
+ MarkBufferDirtyHint(buf, false);
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
+ }
+ }
+
+ /* Now get the maximum value on the page, to return to caller */
+ max_avail = fsm_get_max_avail(page);
+
+ /*
+ * Reset the next slot pointer. This encourages the use of low-numbered
+ * pages, increasing the chances that a later vacuum can truncate the
+ * relation. We don't bother with a lock here, nor with marking the page
+ * dirty if it wasn't already, since this is just a hint.
+ */
+ ((FSMPage) PageGetContents(page))->fp_next_slot = 0;
+
+ ReleaseBuffer(buf);
+
+ return max_avail;
+}
diff --git a/src/backend/storage/freespace/fsmpage.c b/src/backend/storage/freespace/fsmpage.c
new file mode 100644
index 0000000..0cfb2ae
--- /dev/null
+++ b/src/backend/storage/freespace/fsmpage.c
@@ -0,0 +1,374 @@
+/*-------------------------------------------------------------------------
+ *
+ * fsmpage.c
+ * routines to search and manipulate one FSM page.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/fsmpage.c
+ *
+ * NOTES:
+ *
+ * The public functions in this file form an API that hides the internal
+ * structure of a FSM page. This allows freespace.c to treat each FSM page
+ * as a black box with SlotsPerFSMPage "slots". fsm_set_avail() and
+ * fsm_get_avail() let you get/set the value of a slot, and
+ * fsm_search_avail() lets you search for a slot with value >= X.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/bufmgr.h"
+#include "storage/fsm_internals.h"
+
+/* Macros to navigate the tree within a page. Root has index zero. */
+#define leftchild(x) (2 * (x) + 1)
+#define rightchild(x) (2 * (x) + 2)
+#define parentof(x) (((x) - 1) / 2)
+
+/*
+ * Find right neighbor of x, wrapping around within the level
+ */
+static int
+rightneighbor(int x)
+{
+ /*
+ * Move right. This might wrap around, stepping to the leftmost node at
+ * the next level.
+ */
+ x++;
+
+ /*
+ * Check if we stepped to the leftmost node at next level, and correct if
+ * so. The leftmost nodes at each level are numbered x = 2^level - 1, so
+ * check if (x + 1) is a power of two, using a standard
+ * twos-complement-arithmetic trick.
+ */
+ if (((x + 1) & x) == 0)
+ x = parentof(x);
+
+ return x;
+}
+
+/*
+ * Sets the value of a slot on page. Returns true if the page was modified.
+ *
+ * The caller must hold an exclusive lock on the page.
+ */
+bool
+fsm_set_avail(Page page, int slot, uint8 value)
+{
+ int nodeno = NonLeafNodesPerPage + slot;
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ uint8 oldvalue;
+
+ Assert(slot < LeafNodesPerPage);
+
+ oldvalue = fsmpage->fp_nodes[nodeno];
+
+ /* If the value hasn't changed, we don't need to do anything */
+ if (oldvalue == value && value <= fsmpage->fp_nodes[0])
+ return false;
+
+ fsmpage->fp_nodes[nodeno] = value;
+
+ /*
+ * Propagate up, until we hit the root or a node that doesn't need to be
+ * updated.
+ */
+ do
+ {
+ uint8 newvalue = 0;
+ int lchild;
+ int rchild;
+
+ nodeno = parentof(nodeno);
+ lchild = leftchild(nodeno);
+ rchild = lchild + 1;
+
+ newvalue = fsmpage->fp_nodes[lchild];
+ if (rchild < NodesPerPage)
+ newvalue = Max(newvalue,
+ fsmpage->fp_nodes[rchild]);
+
+ oldvalue = fsmpage->fp_nodes[nodeno];
+ if (oldvalue == newvalue)
+ break;
+
+ fsmpage->fp_nodes[nodeno] = newvalue;
+ } while (nodeno > 0);
+
+ /*
+ * sanity check: if the new value is (still) higher than the value at the
+ * top, the tree is corrupt. If so, rebuild.
+ */
+ if (value > fsmpage->fp_nodes[0])
+ fsm_rebuild_page(page);
+
+ return true;
+}
+
+/*
+ * Returns the value of given slot on page.
+ *
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_avail(Page page, int slot)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+
+ Assert(slot < LeafNodesPerPage);
+
+ return fsmpage->fp_nodes[NonLeafNodesPerPage + slot];
+}
+
+/*
+ * Returns the value at the root of a page.
+ *
+ * Since this is just a read-only access of a single byte, the page doesn't
+ * need to be locked.
+ */
+uint8
+fsm_get_max_avail(Page page)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+
+ return fsmpage->fp_nodes[0];
+}
+
+/*
+ * Searches for a slot with category at least minvalue.
+ * Returns slot number, or -1 if none found.
+ *
+ * The caller must hold at least a shared lock on the page, and this
+ * function can unlock and lock the page again in exclusive mode if it
+ * needs to be updated. exclusive_lock_held should be set to true if the
+ * caller is already holding an exclusive lock, to avoid extra work.
+ *
+ * If advancenext is false, fp_next_slot is set to point to the returned
+ * slot, and if it's true, to the slot after the returned slot.
+ */
+int
+fsm_search_avail(Buffer buf, uint8 minvalue, bool advancenext,
+ bool exclusive_lock_held)
+{
+ Page page = BufferGetPage(buf);
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ int nodeno;
+ int target;
+ uint16 slot;
+
+restart:
+
+ /*
+ * Check the root first, and exit quickly if there's no leaf with enough
+ * free space
+ */
+ if (fsmpage->fp_nodes[0] < minvalue)
+ return -1;
+
+ /*
+ * Start search using fp_next_slot. It's just a hint, so check that it's
+ * sane. (This also handles wrapping around when the prior call returned
+ * the last slot on the page.)
+ */
+ target = fsmpage->fp_next_slot;
+ if (target < 0 || target >= LeafNodesPerPage)
+ target = 0;
+ target += NonLeafNodesPerPage;
+
+ /*----------
+ * Start the search from the target slot. At every step, move one
+ * node to the right, then climb up to the parent. Stop when we reach
+ * a node with enough free space (as we must, since the root has enough
+ * space).
+ *
+ * The idea is to gradually expand our "search triangle", that is, all
+ * nodes covered by the current node, and to be sure we search to the
+ * right from the start point. At the first step, only the target slot
+ * is examined. When we move up from a left child to its parent, we are
+ * adding the right-hand subtree of that parent to the search triangle.
+ * When we move right then up from a right child, we are dropping the
+ * current search triangle (which we know doesn't contain any suitable
+ * page) and instead looking at the next-larger-size triangle to its
+ * right. So we never look left from our original start point, and at
+ * each step the size of the search triangle doubles, ensuring it takes
+ * only log2(N) work to search N pages.
+ *
+ * The "move right" operation will wrap around if it hits the right edge
+ * of the tree, so the behavior is still good if we start near the right.
+ * Note also that the move-and-climb behavior ensures that we can't end
+ * up on one of the missing nodes at the right of the leaf level.
+ *
+ * For example, consider this tree:
+ *
+ * 7
+ * 7 6
+ * 5 7 6 5
+ * 4 5 5 7 2 6 5 2
+ * T
+ *
+ * Assume that the target node is the node indicated by the letter T,
+ * and we're searching for a node with value of 6 or higher. The search
+ * begins at T. At the first iteration, we move to the right, then to the
+ * parent, arriving at the rightmost 5. At the second iteration, we move
+ * to the right, wrapping around, then climb up, arriving at the 7 on the
+ * third level. 7 satisfies our search, so we descend down to the bottom,
+ * following the path of sevens. This is in fact the first suitable page
+ * to the right of (allowing for wraparound) our start point.
+ *----------
+ */
+ nodeno = target;
+ while (nodeno > 0)
+ {
+ if (fsmpage->fp_nodes[nodeno] >= minvalue)
+ break;
+
+ /*
+ * Move to the right, wrapping around on same level if necessary, then
+ * climb up.
+ */
+ nodeno = parentof(rightneighbor(nodeno));
+ }
+
+ /*
+ * We're now at a node with enough free space, somewhere in the middle of
+ * the tree. Descend to the bottom, following a path with enough free
+ * space, preferring to move left if there's a choice.
+ */
+ while (nodeno < NonLeafNodesPerPage)
+ {
+ int childnodeno = leftchild(nodeno);
+
+ if (childnodeno < NodesPerPage &&
+ fsmpage->fp_nodes[childnodeno] >= minvalue)
+ {
+ nodeno = childnodeno;
+ continue;
+ }
+ childnodeno++; /* point to right child */
+ if (childnodeno < NodesPerPage &&
+ fsmpage->fp_nodes[childnodeno] >= minvalue)
+ {
+ nodeno = childnodeno;
+ }
+ else
+ {
+ /*
+ * Oops. The parent node promised that either left or right child
+ * has enough space, but neither actually did. This can happen in
+ * case of a "torn page", IOW if we crashed earlier while writing
+ * the page to disk, and only part of the page made it to disk.
+ *
+ * Fix the corruption and restart.
+ */
+ RelFileLocator rlocator;
+ ForkNumber forknum;
+ BlockNumber blknum;
+
+ BufferGetTag(buf, &rlocator, &forknum, &blknum);
+ elog(DEBUG1, "fixing corrupt FSM block %u, relation %u/%u/%u",
+ blknum, rlocator.spcOid, rlocator.dbOid, rlocator.relNumber);
+
+ /* make sure we hold an exclusive lock */
+ if (!exclusive_lock_held)
+ {
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ exclusive_lock_held = true;
+ }
+ fsm_rebuild_page(page);
+ MarkBufferDirtyHint(buf, false);
+ goto restart;
+ }
+ }
+
+ /* We're now at the bottom level, at a node with enough space. */
+ slot = nodeno - NonLeafNodesPerPage;
+
+ /*
+ * Update the next-target pointer. Note that we do this even if we're only
+ * holding a shared lock, on the grounds that it's better to use a shared
+ * lock and get a garbled next pointer every now and then, than take the
+ * concurrency hit of an exclusive lock.
+ *
+ * Wrap-around is handled at the beginning of this function.
+ */
+ fsmpage->fp_next_slot = slot + (advancenext ? 1 : 0);
+
+ return slot;
+}
+
+/*
+ * Sets the available space to zero for all slots numbered >= nslots.
+ * Returns true if the page was modified.
+ */
+bool
+fsm_truncate_avail(Page page, int nslots)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ uint8 *ptr;
+ bool changed = false;
+
+ Assert(nslots >= 0 && nslots < LeafNodesPerPage);
+
+ /* Clear all truncated leaf nodes */
+ ptr = &fsmpage->fp_nodes[NonLeafNodesPerPage + nslots];
+ for (; ptr < &fsmpage->fp_nodes[NodesPerPage]; ptr++)
+ {
+ if (*ptr != 0)
+ changed = true;
+ *ptr = 0;
+ }
+
+ /* Fix upper nodes. */
+ if (changed)
+ fsm_rebuild_page(page);
+
+ return changed;
+}
+
+/*
+ * Reconstructs the upper levels of a page. Returns true if the page
+ * was modified.
+ */
+bool
+fsm_rebuild_page(Page page)
+{
+ FSMPage fsmpage = (FSMPage) PageGetContents(page);
+ bool changed = false;
+ int nodeno;
+
+ /*
+ * Start from the lowest non-leaf level, at last node, working our way
+ * backwards, through all non-leaf nodes at all levels, up to the root.
+ */
+ for (nodeno = NonLeafNodesPerPage - 1; nodeno >= 0; nodeno--)
+ {
+ int lchild = leftchild(nodeno);
+ int rchild = lchild + 1;
+ uint8 newvalue = 0;
+
+ /* The first few nodes we examine might have zero or one child. */
+ if (lchild < NodesPerPage)
+ newvalue = fsmpage->fp_nodes[lchild];
+
+ if (rchild < NodesPerPage)
+ newvalue = Max(newvalue,
+ fsmpage->fp_nodes[rchild]);
+
+ if (fsmpage->fp_nodes[nodeno] != newvalue)
+ {
+ fsmpage->fp_nodes[nodeno] = newvalue;
+ changed = true;
+ }
+ }
+
+ return changed;
+}
diff --git a/src/backend/storage/freespace/indexfsm.c b/src/backend/storage/freespace/indexfsm.c
new file mode 100644
index 0000000..fff8f4f
--- /dev/null
+++ b/src/backend/storage/freespace/indexfsm.c
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * indexfsm.c
+ * POSTGRES free space map for quickly finding free pages in relations
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/freespace/indexfsm.c
+ *
+ *
+ * NOTES:
+ *
+ * This is similar to the FSM used for heap, in freespace.c, but instead
+ * of tracking the amount of free space on pages, we only track whether
+ * pages are completely free or in-use. We use the same FSM implementation
+ * as for heaps, using BLCKSZ - 1 to denote free pages, and 0 for used ones.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/freespace.h"
+#include "storage/indexfsm.h"
+
+/*
+ * Exported routines
+ */
+
+/*
+ * GetFreeIndexPage - return a free page from the FSM
+ *
+ * As a side effect, the page is marked as used in the FSM.
+ */
+BlockNumber
+GetFreeIndexPage(Relation rel)
+{
+ BlockNumber blkno = GetPageWithFreeSpace(rel, BLCKSZ / 2);
+
+ if (blkno != InvalidBlockNumber)
+ RecordUsedIndexPage(rel, blkno);
+
+ return blkno;
+}
+
+/*
+ * RecordFreeIndexPage - mark a page as free in the FSM
+ */
+void
+RecordFreeIndexPage(Relation rel, BlockNumber freeBlock)
+{
+ RecordPageWithFreeSpace(rel, freeBlock, BLCKSZ - 1);
+}
+
+
+/*
+ * RecordUsedIndexPage - mark a page as used in the FSM
+ */
+void
+RecordUsedIndexPage(Relation rel, BlockNumber usedBlock)
+{
+ RecordPageWithFreeSpace(rel, usedBlock, 0);
+}
+
+/*
+ * IndexFreeSpaceMapVacuum - scan and fix any inconsistencies in the FSM
+ */
+void
+IndexFreeSpaceMapVacuum(Relation rel)
+{
+ FreeSpaceMapVacuum(rel);
+}
diff --git a/src/backend/storage/freespace/meson.build b/src/backend/storage/freespace/meson.build
new file mode 100644
index 0000000..4dd8602
--- /dev/null
+++ b/src/backend/storage/freespace/meson.build
@@ -0,0 +1,7 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'freespace.c',
+ 'fsmpage.c',
+ 'indexfsm.c',
+)
diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile
new file mode 100644
index 0000000..6d5b921
--- /dev/null
+++ b/src/backend/storage/ipc/Makefile
@@ -0,0 +1,29 @@
+#
+# Makefile for storage/ipc
+#
+# src/backend/storage/ipc/Makefile
+#
+
+subdir = src/backend/storage/ipc
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ barrier.o \
+ dsm.o \
+ dsm_impl.o \
+ ipc.o \
+ ipci.o \
+ latch.o \
+ pmsignal.o \
+ procarray.o \
+ procsignal.o \
+ shm_mq.o \
+ shm_toc.o \
+ shmem.o \
+ signalfuncs.o \
+ sinval.o \
+ sinvaladt.o \
+ standby.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c
new file mode 100644
index 0000000..4734dc6
--- /dev/null
+++ b/src/backend/storage/ipc/barrier.c
@@ -0,0 +1,333 @@
+/*-------------------------------------------------------------------------
+ *
+ * barrier.c
+ * Barriers for synchronizing cooperating processes.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * From Wikipedia[1]: "In parallel computing, a barrier is a type of
+ * synchronization method. A barrier for a group of threads or processes in
+ * the source code means any thread/process must stop at this point and cannot
+ * proceed until all other threads/processes reach this barrier."
+ *
+ * This implementation of barriers allows for static sets of participants
+ * known up front, or dynamic sets of participants which processes can join or
+ * leave at any time. In the dynamic case, a phase number can be used to
+ * track progress through a parallel algorithm, and may be necessary to
+ * synchronize with the current phase of a multi-phase algorithm when a new
+ * participant joins. In the static case, the phase number is used
+ * internally, but it isn't strictly necessary for client code to access it
+ * because the phase can only advance when the declared number of participants
+ * reaches the barrier, so client code should be in no doubt about the current
+ * phase of computation at all times.
+ *
+ * Consider a parallel algorithm that involves separate phases of computation
+ * A, B and C where the output of each phase is needed before the next phase
+ * can begin.
+ *
+ * In the case of a static barrier initialized with 4 participants, each
+ * participant works on phase A, then calls BarrierArriveAndWait to wait until
+ * all 4 participants have reached that point. When BarrierArriveAndWait
+ * returns control, each participant can work on B, and so on. Because the
+ * barrier knows how many participants to expect, the phases of computation
+ * don't need labels or numbers, since each process's program counter implies
+ * the current phase. Even if some of the processes are slow to start up and
+ * begin running phase A, the other participants are expecting them and will
+ * patiently wait at the barrier. The code could be written as follows:
+ *
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ *
+ * If the number of participants is not known up front, then a dynamic barrier
+ * is needed and the number should be set to zero at initialization. New
+ * complications arise because the number necessarily changes over time as
+ * participants attach and detach, and therefore phases B, C or even the end
+ * of processing may be reached before any given participant has started
+ * running and attached. Therefore the client code must perform an initial
+ * test of the phase number after attaching, because it needs to find out
+ * which phase of the algorithm has been reached by any participants that are
+ * already attached in order to synchronize with that work. Once the program
+ * counter or some other representation of current progress is synchronized
+ * with the barrier's phase, normal control flow can be used just as in the
+ * static case. Our example could be written using a switch statement with
+ * cases that fall through, as follows:
+ *
+ * phase = BarrierAttach(&barrier);
+ * switch (phase)
+ * {
+ * case PHASE_A:
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_B:
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_C:
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ * }
+ * BarrierDetach(&barrier);
+ *
+ * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic
+ * barriers behave similarly to Java's java.util.concurrent.Phaser.
+ *
+ * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science)
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/barrier.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "storage/barrier.h"
+
+static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive);
+
+/*
+ * Initialize this barrier. To use a static party size, provide the number of
+ * participants to wait for at each phase, indicating that that number of
+ * backends is implicitly attached. To use a dynamic party size, specify zero
+ * here and then use BarrierAttach() and
+ * BarrierDetach()/BarrierArriveAndDetach() to register and deregister
+ * participants explicitly.
+ */
+void
+BarrierInit(Barrier *barrier, int participants)
+{
+ SpinLockInit(&barrier->mutex);
+ barrier->participants = participants;
+ barrier->arrived = 0;
+ barrier->phase = 0;
+ barrier->elected = 0;
+ barrier->static_party = participants > 0;
+ ConditionVariableInit(&barrier->condition_variable);
+}
+
+/*
+ * Arrive at this barrier, wait for all other attached participants to arrive
+ * too and then return. Increments the current phase. The caller must be
+ * attached.
+ *
+ * While waiting, pg_stat_activity shows a wait_event_type and wait_event
+ * controlled by the wait_event_info passed in, which should be a value from
+ * one of the WaitEventXXX enums defined in pgstat.h.
+ *
+ * Return true in one arbitrarily chosen participant. Return false in all
+ * others. The return code can be used to elect one participant to execute a
+ * phase of work that must be done serially while other participants wait.
+ */
+bool
+BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info)
+{
+ bool release = false;
+ bool elected;
+ int start_phase;
+ int next_phase;
+
+ SpinLockAcquire(&barrier->mutex);
+ start_phase = barrier->phase;
+ next_phase = start_phase + 1;
+ ++barrier->arrived;
+ if (barrier->arrived == barrier->participants)
+ {
+ release = true;
+ barrier->arrived = 0;
+ barrier->phase = next_phase;
+ barrier->elected = next_phase;
+ }
+ SpinLockRelease(&barrier->mutex);
+
+ /*
+ * If we were the last expected participant to arrive, we can release our
+ * peers and return true to indicate that this backend has been elected to
+ * perform any serial work.
+ */
+ if (release)
+ {
+ ConditionVariableBroadcast(&barrier->condition_variable);
+
+ return true;
+ }
+
+ /*
+ * Otherwise we have to wait for the last participant to arrive and
+ * advance the phase.
+ */
+ elected = false;
+ ConditionVariablePrepareToSleep(&barrier->condition_variable);
+ for (;;)
+ {
+ /*
+ * We know that phase must either be start_phase, indicating that we
+ * need to keep waiting, or next_phase, indicating that the last
+ * participant that we were waiting for has either arrived or detached
+ * so that the next phase has begun. The phase cannot advance any
+ * further than that without this backend's participation, because
+ * this backend is attached.
+ */
+ SpinLockAcquire(&barrier->mutex);
+ Assert(barrier->phase == start_phase || barrier->phase == next_phase);
+ release = barrier->phase == next_phase;
+ if (release && barrier->elected != next_phase)
+ {
+ /*
+ * Usually the backend that arrives last and releases the other
+ * backends is elected to return true (see above), so that it can
+ * begin processing serial work while it has a CPU timeslice.
+ * However, if the barrier advanced because someone detached, then
+ * one of the backends that is awoken will need to be elected.
+ */
+ barrier->elected = barrier->phase;
+ elected = true;
+ }
+ SpinLockRelease(&barrier->mutex);
+ if (release)
+ break;
+ ConditionVariableSleep(&barrier->condition_variable, wait_event_info);
+ }
+ ConditionVariableCancelSleep();
+
+ return elected;
+}
+
+/*
+ * Arrive at this barrier, but detach rather than waiting. Returns true if
+ * the caller was the last to detach.
+ */
+bool
+BarrierArriveAndDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, true);
+}
+
+/*
+ * Arrive at a barrier, and detach all but the last to arrive. Returns true if
+ * the caller was the last to arrive, and is therefore still attached.
+ */
+bool
+BarrierArriveAndDetachExceptLast(Barrier *barrier)
+{
+ SpinLockAcquire(&barrier->mutex);
+ if (barrier->participants > 1)
+ {
+ --barrier->participants;
+ SpinLockRelease(&barrier->mutex);
+
+ return false;
+ }
+ Assert(barrier->participants == 1);
+ ++barrier->phase;
+ SpinLockRelease(&barrier->mutex);
+
+ return true;
+}
+
+/*
+ * Attach to a barrier. All waiting participants will now wait for this
+ * participant to call BarrierArriveAndWait(), BarrierDetach() or
+ * BarrierArriveAndDetach(). Return the current phase.
+ */
+int
+BarrierAttach(Barrier *barrier)
+{
+ int phase;
+
+ Assert(!barrier->static_party);
+
+ SpinLockAcquire(&barrier->mutex);
+ ++barrier->participants;
+ phase = barrier->phase;
+ SpinLockRelease(&barrier->mutex);
+
+ return phase;
+}
+
+/*
+ * Detach from a barrier. This may release other waiters from
+ * BarrierArriveAndWait() and advance the phase if they were only waiting for
+ * this backend. Return true if this participant was the last to detach.
+ */
+bool
+BarrierDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, false);
+}
+
+/*
+ * Return the current phase of a barrier. The caller must be attached.
+ */
+int
+BarrierPhase(Barrier *barrier)
+{
+ /*
+ * It is OK to read barrier->phase without locking, because it can't
+ * change without us (we are attached to it), and we executed a memory
+ * barrier when we either attached or participated in changing it last
+ * time.
+ */
+ return barrier->phase;
+}
+
+/*
+ * Return an instantaneous snapshot of the number of participants currently
+ * attached to this barrier. For debugging purposes only.
+ */
+int
+BarrierParticipants(Barrier *barrier)
+{
+ int participants;
+
+ SpinLockAcquire(&barrier->mutex);
+ participants = barrier->participants;
+ SpinLockRelease(&barrier->mutex);
+
+ return participants;
+}
+
+/*
+ * Detach from a barrier. If 'arrive' is true then also increment the phase
+ * if there are no other participants. If there are other participants
+ * waiting, then the phase will be advanced and they'll be released if they
+ * were only waiting for the caller. Return true if this participant was the
+ * last to detach.
+ */
+static inline bool
+BarrierDetachImpl(Barrier *barrier, bool arrive)
+{
+ bool release;
+ bool last;
+
+ Assert(!barrier->static_party);
+
+ SpinLockAcquire(&barrier->mutex);
+ Assert(barrier->participants > 0);
+ --barrier->participants;
+
+ /*
+ * If any other participants are waiting and we were the last participant
+ * waited for, release them. If no other participants are waiting, but
+ * this is a BarrierArriveAndDetach() call, then advance the phase too.
+ */
+ if ((arrive || barrier->participants > 0) &&
+ barrier->arrived == barrier->participants)
+ {
+ release = true;
+ barrier->arrived = 0;
+ ++barrier->phase;
+ }
+ else
+ release = false;
+
+ last = barrier->participants == 0;
+ SpinLockRelease(&barrier->mutex);
+
+ if (release)
+ ConditionVariableBroadcast(&barrier->condition_variable);
+
+ return last;
+}
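
As a concrete companion to the dynamic-barrier example in the header comment, the sketch below shows a hypothetical parallel worker that attaches, synchronizes with whichever phase is already in progress, and detaches when its work is done. It assumes the Barrier lives in shared memory and was set up by a leader with BarrierInit(barrier, 0); the phase constants, worker_main, and the caller-supplied wait_event_info are illustrative assumptions, not part of the API above.

    #include "postgres.h"

    #include "storage/barrier.h"

    /* Hypothetical phase numbers for a two-phase parallel operation. */
    #define MY_PHASE_SCAN   0
    #define MY_PHASE_MERGE  1

    static void
    worker_main(Barrier *barrier, uint32 wait_event_info)
    {
        /* Attach, then jump to whatever phase is currently in progress. */
        switch (BarrierAttach(barrier))
        {
            case MY_PHASE_SCAN:
                /* ... scan this worker's share of the input ... */
                BarrierArriveAndWait(barrier, wait_event_info);
                /* FALLTHROUGH */
            case MY_PHASE_MERGE:
                /* ... consume the results produced by the scan phase ... */
                break;
        }

        /* Stop holding up the remaining participants. */
        BarrierDetach(barrier);
    }

The return value of BarrierArriveAndWait (ignored here) could be used to elect one worker to perform any serial step between the two phases.
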
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
new file mode 100644
index 0000000..7e4e278
--- /dev/null
+++ b/src/backend/storage/ipc/dsm.c
@@ -0,0 +1,1257 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides a set of services to make programming with dynamic
+ * shared memory segments more convenient. Unlike the low-level
+ * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
+ * created using this module will be cleaned up automatically. Mappings
+ * will be removed when the resource owner under which they were created
+ * is cleaned up, unless dsm_pin_mapping() is used, in which case they
+ * have session lifespan. Segments will be removed when there are no
+ * remaining mappings, or at postmaster shutdown in any case. After a
+ * hard postmaster crash, remaining segments will be removed, if they
+ * still exist, at the next postmaster startup.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+
+#include "common/pg_prng.h"
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "utils/freepage.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32
+
+#define PG_DYNSHMEM_FIXED_SLOTS 64
+#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5
+
+#define INVALID_CONTROL_SLOT ((uint32) -1)
+
+/* Backend-local tracking for on-detach callbacks. */
+typedef struct dsm_segment_detach_callback
+{
+ on_dsm_detach_callback function;
+ Datum arg;
+ slist_node node;
+} dsm_segment_detach_callback;
+
+/* Backend-local state for a dynamic shared memory segment. */
+struct dsm_segment
+{
+ dlist_node node; /* List link in dsm_segment_list. */
+ ResourceOwner resowner; /* Resource owner. */
+ dsm_handle handle; /* Segment name. */
+ uint32 control_slot; /* Slot in control segment. */
+ void *impl_private; /* Implementation-specific private data. */
+ void *mapped_address; /* Mapping address, or NULL if unmapped. */
+ Size mapped_size; /* Size of our mapping. */
+ slist_head on_detach; /* On-detach callbacks. */
+};
+
+/* Shared-memory state for a dynamic shared memory segment. */
+typedef struct dsm_control_item
+{
+ dsm_handle handle;
+ uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */
+ size_t first_page;
+ size_t npages;
+ void *impl_private_pm_handle; /* only needed on Windows */
+ bool pinned;
+} dsm_control_item;
+
+/* Layout of the dynamic shared memory control segment. */
+typedef struct dsm_control_header
+{
+ uint32 magic;
+ uint32 nitems;
+ uint32 maxitems;
+ dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
+} dsm_control_header;
+
+static void dsm_cleanup_for_mmap(void);
+static void dsm_postmaster_shutdown(int code, Datum arg);
+static dsm_segment *dsm_create_descriptor(void);
+static bool dsm_control_segment_sane(dsm_control_header *control,
+ Size mapped_size);
+static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
+
+/* Has this backend initialized the dynamic shared memory system yet? */
+static bool dsm_init_done = false;
+
+/* Preallocated DSM space in the main shared memory region. */
+static void *dsm_main_space_begin = NULL;
+
+/*
+ * List of dynamic shared memory segments used by this backend.
+ *
+ * At process exit time, we must decrement the reference count of each
+ * segment we have attached; this list makes it possible to find all such
+ * segments.
+ *
+ * This list should always be empty in the postmaster. We could probably
+ * allow the postmaster to map dynamic shared memory segments before it
+ * begins to start child processes, provided that each process adjusted
+ * the reference counts for those segments in the control segment at
+ * startup time, but there's no obvious need for such a facility, which
+ * would also be complex to handle in the EXEC_BACKEND case. Once the
+ * postmaster has begun spawning children, there's an additional problem:
+ * each new mapping would require an update to the control segment,
+ * which requires locking, in which the postmaster must not be involved.
+ */
+static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
+
+/*
+ * Control segment information.
+ *
+ * Unlike ordinary shared memory segments, the control segment is not
+ * reference counted; instead, it lasts for the postmaster's entire
+ * life cycle. For simplicity, it doesn't have a dsm_segment object either.
+ */
+static dsm_handle dsm_control_handle;
+static dsm_control_header *dsm_control;
+static Size dsm_control_mapped_size = 0;
+static void *dsm_control_impl_private = NULL;
+
+/*
+ * Start up the dynamic shared memory system.
+ *
+ * This is called just once during each cluster lifetime, at postmaster
+ * startup time.
+ */
+void
+dsm_postmaster_startup(PGShmemHeader *shim)
+{
+ void *dsm_control_address = NULL;
+ uint32 maxitems;
+ Size segsize;
+
+ Assert(!IsUnderPostmaster);
+
+ /*
+ * If we're using the mmap implementations, clean up any leftovers.
+ * Cleanup isn't needed on Windows, and happens earlier in startup for
+ * POSIX and System V shared memory, via a direct call to
+ * dsm_cleanup_using_control_segment.
+ */
+ if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
+ dsm_cleanup_for_mmap();
+
+ /* Determine size for new control segment. */
+ maxitems = PG_DYNSHMEM_FIXED_SLOTS
+ + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
+ elog(DEBUG2, "dynamic shared memory system will support %u segments",
+ maxitems);
+ segsize = dsm_control_bytes_needed(maxitems);
+
+ /*
+ * Loop until we find an unused identifier for the new control segment. We
+ * sometimes use DSM_HANDLE_INVALID as a sentinel value indicating "no
+ * control segment", so avoid generating that value for a real handle.
+ */
+ for (;;)
+ {
+ Assert(dsm_control_address == NULL);
+ Assert(dsm_control_mapped_size == 0);
+ /* Use even numbers only */
+ dsm_control_handle = pg_prng_uint32(&pg_global_prng_state) << 1;
+ if (dsm_control_handle == DSM_HANDLE_INVALID)
+ continue;
+ if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, ERROR))
+ break;
+ }
+ dsm_control = dsm_control_address;
+ on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim));
+ elog(DEBUG2,
+ "created dynamic shared memory control segment %u (%zu bytes)",
+ dsm_control_handle, segsize);
+ shim->dsm_control = dsm_control_handle;
+
+ /* Initialize control segment. */
+ dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
+ dsm_control->nitems = 0;
+ dsm_control->maxitems = maxitems;
+}
+
+/*
+ * Determine whether the control segment from the previous postmaster
+ * invocation still exists. If so, remove the dynamic shared memory
+ * segments to which it refers, and then the control segment itself.
+ */
+void
+dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
+{
+ void *mapped_address = NULL;
+ void *junk_mapped_address = NULL;
+ void *impl_private = NULL;
+ void *junk_impl_private = NULL;
+ Size mapped_size = 0;
+ Size junk_mapped_size = 0;
+ uint32 nitems;
+ uint32 i;
+ dsm_control_header *old_control;
+
+ /*
+ * Try to attach the segment. If this fails, it probably just means that
+ * the operating system has been rebooted and the segment no longer
+ * exists, or an unrelated process has used the same shm ID. So just fall
+ * out quietly.
+ */
+ if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, DEBUG1))
+ return;
+
+ /*
+ * We've managed to reattach it, but the contents might not be sane. If
+ * they aren't, we disregard the segment after all.
+ */
+ old_control = (dsm_control_header *) mapped_address;
+ if (!dsm_control_segment_sane(old_control, mapped_size))
+ {
+ dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, LOG);
+ return;
+ }
+
+ /*
+ * OK, the control segment looks basically valid, so we can use it to get
+ * a list of segments that need to be removed.
+ */
+ nitems = old_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ dsm_handle handle;
+ uint32 refcnt;
+
+ /* If the reference count is 0, the slot is actually unused. */
+ refcnt = old_control->item[i].refcnt;
+ if (refcnt == 0)
+ continue;
+
+ /* If it was using the main shmem area, there is nothing to do. */
+ handle = old_control->item[i].handle;
+ if (is_main_region_dsm_handle(handle))
+ continue;
+
+ /* Log debugging information. */
+ elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
+ handle, refcnt);
+
+ /* Destroy the referenced segment. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, LOG);
+ }
+
+ /* Destroy the old control segment, too. */
+ elog(DEBUG2,
+ "cleaning up dynamic shared memory control segment with ID %u",
+ old_control_handle);
+ dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private,
+ &mapped_address, &mapped_size, LOG);
+}
+
+/*
+ * When we're using the mmap shared memory implementation, "shared memory"
+ * segments might even manage to survive an operating system reboot.
+ * But there's no guarantee as to exactly what will survive: some segments
+ * may survive, and others may not, and the contents of some may be out
+ * of date. In particular, the control segment may be out of date, so we
+ * can't rely on it to figure out what to remove. However, since we know
+ * what directory contains the files we used as shared memory, we can simply
+ * scan the directory and blow everything away that shouldn't be there.
+ */
+static void
+dsm_cleanup_for_mmap(void)
+{
+ DIR *dir;
+ struct dirent *dent;
+
+ /* Scan the directory for something with a name of the correct format. */
+ dir = AllocateDir(PG_DYNSHMEM_DIR);
+
+ while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL)
+ {
+ if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
+ strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0)
+ {
+ char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
+
+ snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name);
+
+ elog(DEBUG2, "removing file \"%s\"", buf);
+
+ /* We found a matching file; so remove it. */
+ if (unlink(buf) != 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", buf)));
+ }
+ }
+
+ /* Cleanup complete. */
+ FreeDir(dir);
+}
+
+/*
+ * At shutdown time, we iterate over the control segment and remove all
+ * remaining dynamic shared memory segments. We avoid throwing errors here;
+ * the postmaster is shutting down either way, and this is just non-critical
+ * resource cleanup.
+ */
+static void
+dsm_postmaster_shutdown(int code, Datum arg)
+{
+ uint32 nitems;
+ uint32 i;
+ void *dsm_control_address;
+ void *junk_mapped_address = NULL;
+ void *junk_impl_private = NULL;
+ Size junk_mapped_size = 0;
+ PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg);
+
+ /*
+ * If some other backend exited uncleanly, it might have corrupted the
+ * control segment while it was dying. In that case, we warn and ignore
+ * the contents of the control segment. This may end up leaving behind
+ * stray shared memory segments, but there's not much we can do about that
+ * if the metadata is gone.
+ */
+ nitems = dsm_control->nitems;
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ ereport(LOG,
+ (errmsg("dynamic shared memory control segment is corrupt")));
+ return;
+ }
+
+ /* Remove any remaining segments. */
+ for (i = 0; i < nitems; ++i)
+ {
+ dsm_handle handle;
+
+ /* If the reference count is 0, the slot is actually unused. */
+ if (dsm_control->item[i].refcnt == 0)
+ continue;
+
+ handle = dsm_control->item[i].handle;
+ if (is_main_region_dsm_handle(handle))
+ continue;
+
+ /* Log debugging information. */
+ elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
+ handle);
+
+ /* Destroy the segment. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, LOG);
+ }
+
+ /* Remove the control segment itself. */
+ elog(DEBUG2,
+ "cleaning up dynamic shared memory control segment with ID %u",
+ dsm_control_handle);
+ dsm_control_address = dsm_control;
+ dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, LOG);
+ dsm_control = dsm_control_address;
+ shim->dsm_control = 0;
+}
+
+/*
+ * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND,
+ * we must reread the state file and map the control segment; in other cases,
+ * we'll have inherited the postmaster's mapping and global variables.
+ */
+static void
+dsm_backend_startup(void)
+{
+#ifdef EXEC_BACKEND
+ if (IsUnderPostmaster)
+ {
+ void *control_address = NULL;
+
+ /* Attach control segment. */
+ Assert(dsm_control_handle != 0);
+ dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, ERROR);
+ dsm_control = control_address;
+ /* If control segment doesn't look sane, something is badly wrong. */
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, WARNING);
+ ereport(FATAL,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("dynamic shared memory control segment is not valid")));
+ }
+ }
+#endif
+
+ dsm_init_done = true;
+}
+
+#ifdef EXEC_BACKEND
+/*
+ * When running under EXEC_BACKEND, we get a callback here when the main
+ * shared memory segment is re-attached, so that we can record the control
+ * handle retrieved from it.
+ */
+void
+dsm_set_control_handle(dsm_handle h)
+{
+ Assert(dsm_control_handle == 0 && h != 0);
+ dsm_control_handle = h;
+}
+#endif
+
+/*
+ * Reserve some space in the main shared memory segment for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+ return 1024 * 1024 * (size_t) min_dynamic_shared_memory;
+}
+
+/*
+ * Initialize space in the main shared memory segment for DSM segments.
+ */
+void
+dsm_shmem_init(void)
+{
+ size_t size = dsm_estimate_size();
+ bool found;
+
+ if (size == 0)
+ return;
+
+ dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found);
+ if (!found)
+ {
+ FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin;
+ size_t first_page = 0;
+ size_t pages;
+
+ /* Reserve space for the FreePageManager. */
+ while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager))
+ ++first_page;
+
+ /* Initialize it and give it all the rest of the space. */
+ FreePageManagerInitialize(fpm, dsm_main_space_begin);
+ pages = (size / FPM_PAGE_SIZE) - first_page;
+ FreePageManagerPut(fpm, first_page, pages);
+ }
+}
+
+/*
+ * Create a new dynamic shared memory segment.
+ *
+ * If there is a non-NULL CurrentResourceOwner, the new segment is associated
+ * with it and must be detached before the resource owner releases, or a
+ * warning will be logged. If CurrentResourceOwner is NULL, the segment
+ * remains attached until explicitly detached or the session ends.
+ * Creating with a NULL CurrentResourceOwner is equivalent to creating
+ * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
+ */
+dsm_segment *
+dsm_create(Size size, int flags)
+{
+ dsm_segment *seg;
+ uint32 i;
+ uint32 nitems;
+ size_t npages = 0;
+ size_t first_page = 0;
+ FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+ bool using_main_dsm_region = false;
+
+ /*
+ * Unsafe in postmaster. It might seem pointless to allow use of dsm in
+ * single user mode, but otherwise some subsystems will need dedicated
+ * single user mode code paths.
+ */
+ Assert(IsUnderPostmaster || !IsPostmasterEnvironment);
+
+ if (!dsm_init_done)
+ dsm_backend_startup();
+
+ /* Create a new segment descriptor. */
+ seg = dsm_create_descriptor();
+
+ /*
+ * Lock the control segment while we try to allocate from the main shared
+ * memory area, if configured.
+ */
+ if (dsm_main_space_fpm)
+ {
+ npages = size / FPM_PAGE_SIZE;
+ if (size % FPM_PAGE_SIZE > 0)
+ ++npages;
+
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+ {
+ /* We can carve out a piece of the main shared memory segment. */
+ seg->mapped_address = (char *) dsm_main_space_begin +
+ first_page * FPM_PAGE_SIZE;
+ seg->mapped_size = npages * FPM_PAGE_SIZE;
+ using_main_dsm_region = true;
+ /* We'll choose a handle below. */
+ }
+ }
+
+ if (!using_main_dsm_region)
+ {
+ /*
+ * We need to create a new memory segment. Loop until we find an
+ * unused segment identifier.
+ */
+ if (dsm_main_space_fpm)
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ for (;;)
+ {
+ Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+ /* Use even numbers only */
+ seg->handle = pg_prng_uint32(&pg_global_prng_state) << 1;
+ if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */
+ continue;
+ if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, ERROR))
+ break;
+ }
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ }
+
+ /* Search the control segment for an unused slot. */
+ nitems = dsm_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ if (dsm_control->item[i].refcnt == 0)
+ {
+ if (using_main_dsm_region)
+ {
+ seg->handle = make_main_region_dsm_handle(i);
+ dsm_control->item[i].first_page = first_page;
+ dsm_control->item[i].npages = npages;
+ }
+ else
+ Assert(!is_main_region_dsm_handle(seg->handle));
+ dsm_control->item[i].handle = seg->handle;
+ /* refcnt of 1 triggers destruction, so start at 2 */
+ dsm_control->item[i].refcnt = 2;
+ dsm_control->item[i].impl_private_pm_handle = NULL;
+ dsm_control->item[i].pinned = false;
+ seg->control_slot = i;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ return seg;
+ }
+ }
+
+ /* Verify that we can support an additional mapping. */
+ if (nitems >= dsm_control->maxitems)
+ {
+ if (using_main_dsm_region)
+ FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ if (!using_main_dsm_region)
+ dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING);
+ if (seg->resowner != NULL)
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ dlist_delete(&seg->node);
+ pfree(seg);
+
+ if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
+ return NULL;
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+ errmsg("too many dynamic shared memory segments")));
+ }
+
+ /* Enter the handle into a new array slot. */
+ if (using_main_dsm_region)
+ {
+ seg->handle = make_main_region_dsm_handle(nitems);
+ dsm_control->item[i].first_page = first_page;
+ dsm_control->item[i].npages = npages;
+ }
+ dsm_control->item[nitems].handle = seg->handle;
+ /* refcnt of 1 triggers destruction, so start at 2 */
+ dsm_control->item[nitems].refcnt = 2;
+ dsm_control->item[nitems].impl_private_pm_handle = NULL;
+ dsm_control->item[nitems].pinned = false;
+ seg->control_slot = nitems;
+ dsm_control->nitems++;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ return seg;
+}
+
+/*
+ * Attach a dynamic shared memory segment.
+ *
+ * See comments for dsm_segment_handle() for an explanation of how this
+ * is intended to be used.
+ *
+ * This function will return NULL if the segment isn't known to the system.
+ * This can happen if we're asked to attach the segment, but then everyone
+ * else detaches it (causing it to be destroyed) before we get around to
+ * attaching it.
+ *
+ * If there is a non-NULL CurrentResourceOwner, the attached segment is
+ * associated with it and must be detached before the resource owner releases,
+ * or a warning will be logged. Otherwise the segment remains attached until
+ * explicitly detached or the session ends. See the note atop dsm_create().
+ */
+dsm_segment *
+dsm_attach(dsm_handle h)
+{
+ dsm_segment *seg;
+ dlist_iter iter;
+ uint32 i;
+ uint32 nitems;
+
+ /* Unsafe in postmaster (and pointless in a stand-alone backend). */
+ Assert(IsUnderPostmaster);
+
+ if (!dsm_init_done)
+ dsm_backend_startup();
+
+ /*
+ * Since this is just a debugging cross-check, we could leave it out
+ * altogether, or include it only in assert-enabled builds. But since the
+ * list of attached segments should normally be very short, let's include
+ * it always for right now.
+ *
+ * If you're hitting this error, you probably want to attempt to find an
+ * existing mapping via dsm_find_mapping() before calling dsm_attach() to
+ * create a new one.
+ */
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ seg = dlist_container(dsm_segment, node, iter.cur);
+ if (seg->handle == h)
+ elog(ERROR, "can't attach the same segment more than once");
+ }
+
+ /* Create a new segment descriptor. */
+ seg = dsm_create_descriptor();
+ seg->handle = h;
+
+ /* Bump reference count for this segment in shared memory. */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ nitems = dsm_control->nitems;
+ for (i = 0; i < nitems; ++i)
+ {
+ /*
+ * If the reference count is 0, the slot is actually unused. If the
+ * reference count is 1, the slot is still in use, but the segment is
+ * in the process of going away; even if the handle matches, another
+ * slot may already have started using the same handle value by
+ * coincidence so we have to keep searching.
+ */
+ if (dsm_control->item[i].refcnt <= 1)
+ continue;
+
+ /* If the handle doesn't match, it's not the slot we want. */
+ if (dsm_control->item[i].handle != seg->handle)
+ continue;
+
+ /* Otherwise we've found a match. */
+ dsm_control->item[i].refcnt++;
+ seg->control_slot = i;
+ if (is_main_region_dsm_handle(seg->handle))
+ {
+ seg->mapped_address = (char *) dsm_main_space_begin +
+ dsm_control->item[i].first_page * FPM_PAGE_SIZE;
+ seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE;
+ }
+ break;
+ }
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /*
+ * If we didn't find the handle we're looking for in the control segment,
+ * it probably means that everyone else who had it mapped, including the
+ * original creator, died before we got to this point. It's up to the
+ * caller to decide what to do about that.
+ */
+ if (seg->control_slot == INVALID_CONTROL_SLOT)
+ {
+ dsm_detach(seg);
+ return NULL;
+ }
+
+ /* Here's where we actually try to map the segment. */
+ if (!is_main_region_dsm_handle(seg->handle))
+ dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, ERROR);
+
+ return seg;
+}
+
+/*
+ * At backend shutdown time, detach any segments that are still attached.
+ * (This is similar to dsm_detach_all, except that there's no reason to
+ * unmap the control segment before exiting, so we don't bother.)
+ */
+void
+dsm_backend_shutdown(void)
+{
+ while (!dlist_is_empty(&dsm_segment_list))
+ {
+ dsm_segment *seg;
+
+ seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
+ dsm_detach(seg);
+ }
+}
+
+/*
+ * Detach all shared memory segments, including the control segments. This
+ * should be called, along with PGSharedMemoryDetach, in processes that
+ * might inherit mappings but are not intended to be connected to dynamic
+ * shared memory.
+ */
+void
+dsm_detach_all(void)
+{
+ void *control_address = dsm_control;
+
+ while (!dlist_is_empty(&dsm_segment_list))
+ {
+ dsm_segment *seg;
+
+ seg = dlist_head_element(dsm_segment, node, &dsm_segment_list);
+ dsm_detach(seg);
+ }
+
+ if (control_address != NULL)
+ dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, ERROR);
+}
+
+/*
+ * Detach from a shared memory segment, destroying the segment if we
+ * remove the last reference.
+ *
+ * This function should never fail. It will often be invoked when aborting
+ * a transaction, and a further error won't serve any purpose. It's not a
+ * complete disaster if we fail to unmap or destroy the segment; it means a
+ * resource leak, but that doesn't necessarily preclude further operations.
+ */
+void
+dsm_detach(dsm_segment *seg)
+{
+ /*
+ * Invoke registered callbacks. Just in case one of those callbacks
+ * throws a further error that brings us back here, pop the callback
+ * before invoking it, to avoid infinite error recursion. Don't allow
+ * interrupts while running the individual callbacks in non-error code
+ * paths, to avoid leaving cleanup work unfinished if we're interrupted by
+ * a statement timeout or similar.
+ */
+ HOLD_INTERRUPTS();
+ while (!slist_is_empty(&seg->on_detach))
+ {
+ slist_node *node;
+ dsm_segment_detach_callback *cb;
+ on_dsm_detach_callback function;
+ Datum arg;
+
+ node = slist_pop_head_node(&seg->on_detach);
+ cb = slist_container(dsm_segment_detach_callback, node, node);
+ function = cb->function;
+ arg = cb->arg;
+ pfree(cb);
+
+ function(seg, arg);
+ }
+ RESUME_INTERRUPTS();
+
+ /*
+ * Try to remove the mapping, if one exists. Normally, there will be, but
+ * maybe not, if we failed partway through a create or attach operation.
+ * We remove the mapping before decrementing the reference count so that
+ * the process that sees a zero reference count can be certain that no
+ * remaining mappings exist. Even if this fails, we pretend that it
+ * works, because retrying is likely to fail in the same way.
+ */
+ if (seg->mapped_address != NULL)
+ {
+ if (!is_main_region_dsm_handle(seg->handle))
+ dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING);
+ seg->impl_private = NULL;
+ seg->mapped_address = NULL;
+ seg->mapped_size = 0;
+ }
+
+ /* Reduce reference count, if we previously increased it. */
+ if (seg->control_slot != INVALID_CONTROL_SLOT)
+ {
+ uint32 refcnt;
+ uint32 control_slot = seg->control_slot;
+
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ Assert(dsm_control->item[control_slot].handle == seg->handle);
+ Assert(dsm_control->item[control_slot].refcnt > 1);
+ refcnt = --dsm_control->item[control_slot].refcnt;
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /* If new reference count is 1, try to destroy the segment. */
+ if (refcnt == 1)
+ {
+ /* A pinned segment should never reach 1. */
+ Assert(!dsm_control->item[control_slot].pinned);
+
+ /*
+ * If we fail to destroy the segment here, or are killed before we
+ * finish doing so, the reference count will remain at 1, which
+ * will mean that nobody else can attach to the segment. At
+ * postmaster shutdown time, or when a new postmaster is started
+ * after a hard kill, another attempt will be made to remove the
+ * segment.
+ *
+ * The main case we're worried about here is being killed by a
+ * signal before we can finish removing the segment. In that
+ * case, it's important to be sure that the segment still gets
+ * removed. If we actually fail to remove the segment for some
+ * other reason, the postmaster may not have any better luck than
+ * we did. There's not much we can do about that, though.
+ */
+ if (is_main_region_dsm_handle(seg->handle) ||
+ dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+ &seg->mapped_address, &seg->mapped_size, WARNING))
+ {
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (is_main_region_dsm_handle(seg->handle))
+ FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+ dsm_control->item[control_slot].first_page,
+ dsm_control->item[control_slot].npages);
+ Assert(dsm_control->item[control_slot].handle == seg->handle);
+ Assert(dsm_control->item[control_slot].refcnt == 1);
+ dsm_control->item[control_slot].refcnt = 0;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ }
+ }
+ }
+
+ /* Clean up our remaining backend-private data structures. */
+ if (seg->resowner != NULL)
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ dlist_delete(&seg->node);
+ pfree(seg);
+}
+
+/*
+ * Keep a dynamic shared memory mapping until end of session.
+ *
+ * By default, mappings are owned by the current resource owner, which
+ * typically means they stick around for the duration of the current query
+ * only.
+ */
+void
+dsm_pin_mapping(dsm_segment *seg)
+{
+ if (seg->resowner != NULL)
+ {
+ ResourceOwnerForgetDSM(seg->resowner, seg);
+ seg->resowner = NULL;
+ }
+}
+
+/*
+ * Arrange to remove a dynamic shared memory mapping at cleanup time.
+ *
+ * dsm_pin_mapping() can be used to preserve a mapping for the entire
+ * lifetime of a process; this function reverses that decision, making
+ * the segment owned by the current resource owner. This may be useful
+ * just before performing some operation that will invalidate the segment
+ * for future use by this backend.
+ */
+void
+dsm_unpin_mapping(dsm_segment *seg)
+{
+ Assert(seg->resowner == NULL);
+ ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+ seg->resowner = CurrentResourceOwner;
+ ResourceOwnerRememberDSM(seg->resowner, seg);
+}
+
+/*
+ * Keep a dynamic shared memory segment until postmaster shutdown, or until
+ * dsm_unpin_segment is called.
+ *
+ * This function should not be called more than once per segment, unless the
+ * segment is explicitly unpinned with dsm_unpin_segment in between calls.
+ *
+ * Note that this function does not arrange for the current process to
+ * keep the segment mapped indefinitely; if that behavior is desired,
+ * dsm_pin_mapping() should be used from each process that needs to
+ * retain the mapping.
+ */
+void
+dsm_pin_segment(dsm_segment *seg)
+{
+ void *handle = NULL;
+
+ /*
+ * Bump reference count for this segment in shared memory. This will
+ * ensure that even if there is no session which is attached to this
+ * segment, it will remain until postmaster shutdown or an explicit call
+ * to unpin.
+ */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (dsm_control->item[seg->control_slot].pinned)
+ elog(ERROR, "cannot pin a segment that is already pinned");
+ if (!is_main_region_dsm_handle(seg->handle))
+ dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
+ dsm_control->item[seg->control_slot].pinned = true;
+ dsm_control->item[seg->control_slot].refcnt++;
+ dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+}
+
+/*
+ * Unpin a dynamic shared memory segment that was previously pinned with
+ * dsm_pin_segment. This function should not be called unless dsm_pin_segment
+ * was previously called for this segment.
+ *
+ * The argument is a dsm_handle rather than a dsm_segment in case you want
+ * to unpin a segment to which you haven't attached. This turns out to be
+ * useful if, for example, a reference to one shared memory segment is stored
+ * within another shared memory segment. You might want to unpin the
+ * referenced segment before destroying the referencing segment.
+ */
+void
+dsm_unpin_segment(dsm_handle handle)
+{
+ uint32 control_slot = INVALID_CONTROL_SLOT;
+ bool destroy = false;
+ uint32 i;
+
+ /* Find the control slot for the given handle. */
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ for (i = 0; i < dsm_control->nitems; ++i)
+ {
+ /* Skip unused slots and segments that are concurrently going away. */
+ if (dsm_control->item[i].refcnt <= 1)
+ continue;
+
+ /* If we've found our handle, we can stop searching. */
+ if (dsm_control->item[i].handle == handle)
+ {
+ control_slot = i;
+ break;
+ }
+ }
+
+ /*
+ * We should definitely have found the slot, and it should not already be
+ * in the process of going away, because this function should only be
+ * called on a segment which is pinned.
+ */
+ if (control_slot == INVALID_CONTROL_SLOT)
+ elog(ERROR, "cannot unpin unknown segment handle");
+ if (!dsm_control->item[control_slot].pinned)
+ elog(ERROR, "cannot unpin a segment that is not pinned");
+ Assert(dsm_control->item[control_slot].refcnt > 1);
+
+ /*
+ * Allow implementation-specific code to run. We have to do this before
+ * releasing the lock, because impl_private_pm_handle may get modified by
+ * dsm_impl_unpin_segment.
+ */
+ if (!is_main_region_dsm_handle(handle))
+ dsm_impl_unpin_segment(handle,
+ &dsm_control->item[control_slot].impl_private_pm_handle);
+
+ /* Note that 1 means no references (0 means unused slot). */
+ if (--dsm_control->item[control_slot].refcnt == 1)
+ destroy = true;
+ dsm_control->item[control_slot].pinned = false;
+
+ /* Now we can release the lock. */
+ LWLockRelease(DynamicSharedMemoryControlLock);
+
+ /* Clean up resources if that was the last reference. */
+ if (destroy)
+ {
+ void *junk_impl_private = NULL;
+ void *junk_mapped_address = NULL;
+ Size junk_mapped_size = 0;
+
+ /*
+ * For an explanation of how error handling works in this case, see
+ * comments in dsm_detach. Note that if we reach this point, the
+ * current process certainly does not have the segment mapped, because
+ * if it did, the reference count would have still been greater than 1
+ * even after releasing the reference count held by the pin. The fact
+ * that there can't be a dsm_segment for this handle makes it OK to
+ * pass the mapped size, mapped address, and private data as NULL
+ * here.
+ */
+ if (is_main_region_dsm_handle(handle) ||
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, WARNING))
+ {
+ LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+ if (is_main_region_dsm_handle(handle))
+ FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+ dsm_control->item[control_slot].first_page,
+ dsm_control->item[control_slot].npages);
+ Assert(dsm_control->item[control_slot].handle == handle);
+ Assert(dsm_control->item[control_slot].refcnt == 1);
+ dsm_control->item[control_slot].refcnt = 0;
+ LWLockRelease(DynamicSharedMemoryControlLock);
+ }
+ }
+}
+
+/*
+ * Find an existing mapping for a shared memory segment, if there is one.
+ */
+dsm_segment *
+dsm_find_mapping(dsm_handle handle)
+{
+ dlist_iter iter;
+ dsm_segment *seg;
+
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ seg = dlist_container(dsm_segment, node, iter.cur);
+ if (seg->handle == handle)
+ return seg;
+ }
+
+ return NULL;
+}
+
+/*
+ * Get the address at which a dynamic shared memory segment is mapped.
+ */
+void *
+dsm_segment_address(dsm_segment *seg)
+{
+ Assert(seg->mapped_address != NULL);
+ return seg->mapped_address;
+}
+
+/*
+ * Get the size of a mapping.
+ */
+Size
+dsm_segment_map_length(dsm_segment *seg)
+{
+ Assert(seg->mapped_address != NULL);
+ return seg->mapped_size;
+}
+
+/*
+ * Get a handle for a mapping.
+ *
+ * To establish communication via dynamic shared memory between two backends,
+ * one of them should first call dsm_create() to establish a new shared
+ * memory mapping. That process should then call dsm_segment_handle() to
+ * obtain a handle for the mapping, and pass that handle to the
+ * coordinating backend via some means (e.g. bgw_main_arg, or via the
+ * main shared memory segment). The recipient, once in possession of the
+ * handle, should call dsm_attach().
+ */
+dsm_handle
+dsm_segment_handle(dsm_segment *seg)
+{
+ return seg->handle;
+}
+
+/*
+ * Register an on-detach callback for a dynamic shared memory segment.
+ */
+void
+on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
+{
+ dsm_segment_detach_callback *cb;
+
+ cb = MemoryContextAlloc(TopMemoryContext,
+ sizeof(dsm_segment_detach_callback));
+ cb->function = function;
+ cb->arg = arg;
+ slist_push_head(&seg->on_detach, &cb->node);
+}
+
+/*
+ * Unregister an on-detach callback for a dynamic shared memory segment.
+ */
+void
+cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
+ Datum arg)
+{
+ slist_mutable_iter iter;
+
+ slist_foreach_modify(iter, &seg->on_detach)
+ {
+ dsm_segment_detach_callback *cb;
+
+ cb = slist_container(dsm_segment_detach_callback, node, iter.cur);
+ if (cb->function == function && cb->arg == arg)
+ {
+ slist_delete_current(&iter);
+ pfree(cb);
+ break;
+ }
+ }
+}
+
+/*
+ * Discard all registered on-detach callbacks without executing them.
+ */
+void
+reset_on_dsm_detach(void)
+{
+ dlist_iter iter;
+
+ dlist_foreach(iter, &dsm_segment_list)
+ {
+ dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
+
+ /* Throw away explicit on-detach actions one by one. */
+ while (!slist_is_empty(&seg->on_detach))
+ {
+ slist_node *node;
+ dsm_segment_detach_callback *cb;
+
+ node = slist_pop_head_node(&seg->on_detach);
+ cb = slist_container(dsm_segment_detach_callback, node, node);
+ pfree(cb);
+ }
+
+ /*
+ * Decrementing the reference count is a sort of implicit on-detach
+ * action; make sure we don't do that, either.
+ */
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ }
+}
+
+/*
+ * Create a segment descriptor.
+ */
+static dsm_segment *
+dsm_create_descriptor(void)
+{
+ dsm_segment *seg;
+
+ if (CurrentResourceOwner)
+ ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+
+ seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment));
+ dlist_push_head(&dsm_segment_list, &seg->node);
+
+ /* seg->handle must be initialized by the caller */
+ seg->control_slot = INVALID_CONTROL_SLOT;
+ seg->impl_private = NULL;
+ seg->mapped_address = NULL;
+ seg->mapped_size = 0;
+
+ seg->resowner = CurrentResourceOwner;
+ if (CurrentResourceOwner)
+ ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
+
+ slist_init(&seg->on_detach);
+
+ return seg;
+}
+
+/*
+ * Sanity check a control segment.
+ *
+ * The goal here isn't to detect everything that could possibly be wrong with
+ * the control segment; there's not enough information for that. Rather, the
+ * goal is to make sure that someone can iterate over the items in the segment
+ * without overrunning the end of the mapping and crashing. We also check
+ * the magic number since, if that's messed up, this may not even be one of
+ * our segments at all.
+ */
+static bool
+dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
+{
+ if (mapped_size < offsetof(dsm_control_header, item))
+ return false; /* Mapped size too short to read header. */
+ if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC)
+ return false; /* Magic number doesn't match. */
+ if (dsm_control_bytes_needed(control->maxitems) > mapped_size)
+ return false; /* Max item count won't fit in map. */
+ if (control->nitems > control->maxitems)
+ return false; /* Overfull. */
+ return true;
+}
+
+/*
+ * Compute the number of control-segment bytes needed to store a given
+ * number of items.
+ */
+static uint64
+dsm_control_bytes_needed(uint32 nitems)
+{
+ return offsetof(dsm_control_header, item)
+ + sizeof(dsm_control_item) * (uint64) nitems;
+}
+
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+ dsm_handle handle;
+
+ /*
+ * We need to create a handle that doesn't collide with any existing extra
+ * segment created by dsm_impl_op(), so we'll make it odd. It also
+ * mustn't collide with any other main area pseudo-segment, so we'll
+ * include the slot number in some of the bits. We also want to make an
+ * effort to keep newly created and recently destroyed handles from being
+ * confused, so we'll make the rest of the bits random.
+ */
+ handle = 1;
+ handle |= slot << 1;
+ handle |= pg_prng_uint32(&pg_global_prng_state) << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
+ return handle;
+}
+
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+ return handle & 1;
+}
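
The following sketch ties together the creation/attach flow described in the comments above (dsm_create, dsm_segment_handle, dsm_attach, dsm_segment_address). The 8 kB size, the raw memset initialization, and the function names leader_setup and worker_attach are illustrative assumptions; real code would typically lay out the segment contents with the shm_toc facility built in this same directory.

    #include "postgres.h"

    #include "storage/dsm.h"

    /*
     * Leader: create a segment, initialize it, and return a handle that can
     * be passed to a worker (e.g. via bgw_main_arg).
     */
    static dsm_handle
    leader_setup(void)
    {
        dsm_segment *seg = dsm_create(8192, 0);
        char       *space = dsm_segment_address(seg);

        memset(space, 0, 8192);     /* illustrative initialization */
        dsm_pin_mapping(seg);       /* keep the mapping for the whole session */

        return dsm_segment_handle(seg);
    }

    /* Worker: attach using the handle received from the leader. */
    static void *
    worker_attach(dsm_handle handle)
    {
        dsm_segment *seg = dsm_attach(handle);

        if (seg == NULL)
            ereport(ERROR,
                    (errmsg("could not attach to dynamic shared memory segment")));

        return dsm_segment_address(seg);
    }
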
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
new file mode 100644
index 0000000..6399fa2
--- /dev/null
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -0,0 +1,1053 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm_impl.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides low-level APIs for creating and destroying shared
+ * memory segments using several different possible techniques. We refer
+ * to these segments as dynamic because they can be created, altered, and
+ * destroyed at any point during the server life cycle. This is unlike
+ * the main shared memory segment, of which there is always exactly one
+ * and which is always mapped at a fixed address in every PostgreSQL
+ * background process.
+ *
+ * Because not all systems provide the same primitives in this area, nor
+ * do all primitives behave the same way on all systems, we provide
+ * several implementations of this facility. Many systems implement
+ * POSIX shared memory (shm_open etc.), which is well-suited to our needs
+ * in this area, with the exception that shared memory identifiers live
+ * in a flat system-wide namespace, raising the uncomfortable prospect of
+ * name collisions with other processes (including other copies of
+ * PostgreSQL) running on the same system. Some systems only support
+ * the older System V shared memory interface (shmget etc.) which is
+ * also usable; however, the default allocation limits are often quite
+ * small, and the namespace is even more restricted.
+ *
+ * We also provide an mmap-based shared memory implementation. This may
+ * be useful on systems that provide shared memory via a special-purpose
+ * filesystem; by opting for this implementation, the user can even
+ * control precisely where their shared memory segments are placed. It
+ * can also be used as a fallback for systems where shm_open and shmget
+ * are not available or can't be used for some reason. Of course,
+ * mapping a file residing on an actual spinning disk is a fairly poor
+ * approximation for shared memory because writeback may hurt performance
+ * substantially, but there should be few systems where we must make do
+ * with such poor tools.
+ *
+ * As ever, Windows requires its own implementation.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm_impl.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/stat.h>
+#endif
+
+#include "common/file_perm.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/mem.h"
+#include "postmaster/postmaster.h"
+#include "storage/dsm_impl.h"
+#include "storage/fd.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+#ifdef USE_DSM_POSIX
+static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+static int dsm_impl_posix_resize(int fd, off_t size);
+#endif
+#ifdef USE_DSM_SYSV
+static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_WINDOWS
+static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_MMAP
+static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+static int errcode_for_dynamic_shared_memory(void);
+
+const struct config_enum_entry dynamic_shared_memory_options[] = {
+#ifdef USE_DSM_POSIX
+ {"posix", DSM_IMPL_POSIX, false},
+#endif
+#ifdef USE_DSM_SYSV
+ {"sysv", DSM_IMPL_SYSV, false},
+#endif
+#ifdef USE_DSM_WINDOWS
+ {"windows", DSM_IMPL_WINDOWS, false},
+#endif
+#ifdef USE_DSM_MMAP
+ {"mmap", DSM_IMPL_MMAP, false},
+#endif
+ {NULL, 0, false}
+};
+
+/* Implementation selector. */
+int dynamic_shared_memory_type = DEFAULT_DYNAMIC_SHARED_MEMORY_TYPE;
+
+/* Amount of space reserved for DSM segments in the main area. */
+int min_dynamic_shared_memory;
+
+/* Size of buffer to be used for zero-filling. */
+#define ZBUFFER_SIZE 8192
+
+#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"
+
+/*------
+ * Perform a low-level shared memory operation in a platform-specific way,
+ * as dictated by the selected implementation. Each implementation is
+ * required to implement the following primitives.
+ *
+ * DSM_OP_CREATE. Create a segment whose size is the request_size and
+ * map it.
+ *
+ * DSM_OP_ATTACH. Map the segment, whose size must be the request_size.
+ *
+ * DSM_OP_DETACH. Unmap the segment.
+ *
+ * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
+ * segment.
+ *
+ * Arguments:
+ * op: The operation to be performed.
+ * handle: The handle of an existing object, or for DSM_OP_CREATE, the
+ * new handle the caller wants created.
+ * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0.
+ * impl_private: Private, implementation-specific data. Will be a pointer
+ * to NULL for the first operation on a shared memory segment within this
+ * backend; thereafter, it will point to the value to which it was set
+ * on the previous call.
+ * mapped_address: Pointer to start of current mapping; pointer to NULL
+ * if none. Updated with new mapping address.
+ * mapped_size: Pointer to size of current mapping; pointer to 0 if none.
+ * Updated with new mapped size.
+ * elevel: Level at which to log errors.
+ *
+ * Return value: true on success, false on failure. When false is returned,
+ * a message should first be logged at the specified elevel, except when
+ * DSM_OP_CREATE experiences a name collision, in which case false should
+ * be returned without logging anything.
+ *-----
+ */
+bool
+dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ Assert(op == DSM_OP_CREATE || request_size == 0);
+ Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
+ (*mapped_address == NULL && *mapped_size == 0));
+
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_POSIX
+ case DSM_IMPL_POSIX:
+ return dsm_impl_posix(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_SYSV
+ case DSM_IMPL_SYSV:
+ return dsm_impl_sysv(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ return dsm_impl_windows(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+#ifdef USE_DSM_MMAP
+ case DSM_IMPL_MMAP:
+ return dsm_impl_mmap(op, handle, request_size, impl_private,
+ mapped_address, mapped_size, elevel);
+#endif
+ default:
+ elog(ERROR, "unexpected dynamic shared memory type: %d",
+ dynamic_shared_memory_type);
+ return false;
+ }
+}
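+
+/*
+ * Illustrative sketch only (never compiled; guarded by NOT_USED): how a
+ * caller might drive the primitive above directly.  Real callers go through
+ * the dsm.c layer, which supplies the handle and does the bookkeeping; the
+ * handle value and function name here are hypothetical.
+ */
+#ifdef NOT_USED
+static void
+dsm_impl_op_usage_sketch(void)
+{
+ dsm_handle handle = 42; /* hypothetical handle */
+ void *impl_private = NULL;
+ void *mapped_address = NULL;
+ Size mapped_size = 0;
+
+ /* Create and map a 1MB segment; failures are reported at ERROR. */
+ if (!dsm_impl_op(DSM_OP_CREATE, handle, 1024 * 1024,
+ &impl_private, &mapped_address, &mapped_size, ERROR))
+ return; /* silent name collision: caller would retry */
+
+ /* ... use mapped_address / mapped_size here ... */
+
+ /* Unmap and destroy the segment. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0,
+ &impl_private, &mapped_address, &mapped_size, ERROR);
+}
+#endif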
+
+#ifdef USE_DSM_POSIX
+/*
+ * Operating system primitives to support POSIX shared memory.
+ *
+ * POSIX shared memory segments are created and attached using shm_open()
+ * and shm_unlink(); other operations, such as sizing or mapping the
+ * segment, are performed as if the shared memory segments were files.
+ *
+ * Indeed, on some platforms, they may be implemented that way. While
+ * POSIX shared memory segments seem intended to exist in a flat namespace,
+ * some operating systems may implement them as files, even going so far
+ * as to treat a request for /xyz as a request to create a file by that name
+ * in the root directory. Users of such broken platforms should select
+ * a different shared memory implementation.
+ */
+static bool
+dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ char name[64];
+ int flags;
+ int fd;
+ char *address;
+
+ snprintf(name, 64, "/PostgreSQL.%u", handle);
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && munmap(*mapped_address, *mapped_size) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && shm_unlink(name) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /*
+ * Create new segment or open an existing one for attach.
+ *
+ * Even though we will close the FD before returning, it seems desirable
+ * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE
+ * failure. The fact that we won't hold the FD open long justifies using
+ * ReserveExternalFD rather than AcquireExternalFD, though.
+ */
+ ReserveExternalFD();
+
+ flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+ if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1)
+ {
+ ReleaseExternalFD();
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * If we're attaching the segment, determine the current size; if we are
+ * creating the segment, set the size to the requested value.
+ */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct stat st;
+
+ if (fstat(fd, &st) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = st.st_size;
+ }
+ else if (dsm_impl_posix_resize(fd, request_size) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ shm_unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+ name, request_size)));
+ return false;
+ }
+
+ /* Map it. */
+ address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+ if (address == MAP_FAILED)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ close(fd);
+ ReleaseExternalFD();
+ if (op == DSM_OP_CREATE)
+ shm_unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+ close(fd);
+ ReleaseExternalFD();
+
+ return true;
+}
+
+/*
+ * Set the size of a virtual memory region associated with a file descriptor.
+ * If necessary, also ensure that virtual memory is actually allocated by the
+ * operating system, to avoid nasty surprises later.
+ *
+ * Returns non-zero if either truncation or allocation fails, and sets errno.
+ */
+static int
+dsm_impl_posix_resize(int fd, off_t size)
+{
+ int rc;
+ int save_errno;
+ sigset_t save_sigmask;
+
+ /*
+ * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
+ * for quite a long time, and is an all-or-nothing operation. If we
+ * allowed SIGUSR1 to interrupt us repeatedly (for example, due to
+ * recovery conflicts), the retry loop might never succeed.
+ */
+ if (IsUnderPostmaster)
+ sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
+
+ pgstat_report_wait_start(WAIT_EVENT_DSM_ALLOCATE);
+#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
+
+ /*
+ * On Linux, a shm_open fd is backed by a tmpfs file. If we were to use
+ * ftruncate, the file would contain a hole. Accessing memory backed by a
+ * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
+ * is no more tmpfs space available. So we ask tmpfs to allocate pages
+ * here, so we can fail gracefully with ENOSPC now rather than risking
+ * SIGBUS later.
+ *
+ * We still use a traditional EINTR retry loop to handle SIGCONT.
+ * posix_fallocate() doesn't restart automatically, and we don't want this
+ * to fail if you attach a debugger.
+ */
+ do
+ {
+ rc = posix_fallocate(fd, 0, size);
+ } while (rc == EINTR);
+
+ /*
+ * The caller expects errno to be set, but posix_fallocate() doesn't set
+ * it. Instead it returns error numbers directly. So set errno, even
+ * though we'll also return rc to indicate success or failure.
+ */
+ errno = rc;
+#else
+ /* Extend the file to the requested size. */
+ do
+ {
+ rc = ftruncate(fd, size);
+ } while (rc < 0 && errno == EINTR);
+#endif
+ pgstat_report_wait_end();
+
+ if (IsUnderPostmaster)
+ {
+ save_errno = errno;
+ sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
+ errno = save_errno;
+ }
+
+ return rc;
+}
+
+#endif /* USE_DSM_POSIX */
+
+#ifdef USE_DSM_SYSV
+/*
+ * Operating system primitives to support System V shared memory.
+ *
+ * System V shared memory segments are manipulated using shmget(), shmat(),
+ * shmdt(), and shmctl(). As the default allocation limits for System V
+ * shared memory are usually quite low, the POSIX facilities may be
+ * preferable; but those are not supported everywhere.
+ */
+static bool
+dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ key_t key;
+ int ident;
+ char *address;
+ char name[64];
+ int *ident_cache;
+
+ /*
+ * POSIX shared memory and mmap-based shared memory identify segments with
+ * names. To avoid needless error message variation, we use the handle as
+ * the name.
+ */
+ snprintf(name, 64, "%u", handle);
+
+ /*
+ * The System V shared memory namespace is very restricted; names are of
+ * type key_t, which is expected to be some sort of integer data type, but
+ * not necessarily the same one as dsm_handle. Since we use dsm_handle to
+ * identify shared memory segments across processes, this might seem like
+ * a problem, but it's really not. If dsm_handle is bigger than key_t,
+ * the cast below might truncate away some bits from the user-provided
+ * handle, but it'll truncate exactly the same bits away in exactly
+ * the same fashion every time we use that handle, which is all that
+ * really matters. Conversely, if dsm_handle is smaller than key_t, we
+ * won't use the full range of available key space, but that's no big deal
+ * either.
+ *
+ * We do make sure that the key isn't negative, because that might not be
+ * portable.
+ */
+ key = (key_t) handle;
+ if (key < 1) /* avoid compiler warning if type is unsigned */
+ key = -key;
+
+ /*
+ * There's one special key, IPC_PRIVATE, which can't be used. If we end
+ * up with that value by chance during a create operation, just pretend it
+ * already exists, so that caller will retry. If we run into it anywhere
+ * else, the caller has passed a handle that doesn't correspond to
+ * anything we ever created, which should not happen.
+ */
+ if (key == IPC_PRIVATE)
+ {
+ if (op != DSM_OP_CREATE)
+ elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
+ errno = EEXIST;
+ return false;
+ }
+
+ /*
+ * Before we can do anything with a shared memory segment, we have to map
+ * the shared memory key to a shared memory identifier using shmget(). To
+ * avoid repeated lookups, we store the key using impl_private.
+ */
+ if (*impl_private != NULL)
+ {
+ ident_cache = *impl_private;
+ ident = *ident_cache;
+ }
+ else
+ {
+ int flags = IPCProtection;
+ size_t segsize;
+
+ /*
+ * Allocate the memory BEFORE acquiring the resource, so that we don't
+ * leak the resource if memory allocation fails.
+ */
+ ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
+
+ /*
+ * When using shmget to find an existing segment, we must pass the
+ * size as 0. Passing a non-zero size which is greater than the
+ * actual size will result in EINVAL.
+ */
+ segsize = 0;
+
+ if (op == DSM_OP_CREATE)
+ {
+ flags |= IPC_CREAT | IPC_EXCL;
+ segsize = request_size;
+ }
+
+ if ((ident = shmget(key, segsize, flags)) == -1)
+ {
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ {
+ int save_errno = errno;
+
+ pfree(ident_cache);
+ errno = save_errno;
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not get shared memory segment: %m")));
+ }
+ return false;
+ }
+
+ *ident_cache = ident;
+ *impl_private = ident_cache;
+ }
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ pfree(ident_cache);
+ *impl_private = NULL;
+ if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /* If we're attaching it, we must use IPC_STAT to determine the size. */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct shmid_ds shm;
+
+ if (shmctl(ident, IPC_STAT, &shm) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = shm.shm_segsz;
+ }
+
+ /* Map it. */
+ address = shmat(ident, NULL, PG_SHMAT_FLAGS);
+ if (address == (void *) -1)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ if (op == DSM_OP_CREATE)
+ shmctl(ident, IPC_RMID, NULL);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+
+ return true;
+}
+#endif
+
+#ifdef USE_DSM_WINDOWS
+/*
+ * Operating system primitives to support Windows shared memory.
+ *
+ * The Windows shared memory implementation uses a file mapping, which can
+ * be backed by either a physical file or the system paging file.  The
+ * current implementation uses the system paging file, because the
+ * performance implications of backing it with a physical file are unclear
+ * and because the main shared memory segment on Windows is handled the
+ * same way.
+ *
+ * A memory mapping object is a kernel object; it is deleted when the last
+ * reference to it goes away, either explicitly via CloseHandle() or when
+ * the process holding the reference exits.
+ */
+static bool
+dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel)
+{
+ char *address;
+ HANDLE hmap;
+ char name[64];
+ MEMORY_BASIC_INFORMATION info;
+
+ /*
+ * Storing the shared memory segment in the Global\ namespace allows any
+ * process running in any session to access that file mapping object,
+ * provided that the caller has the required access rights. But to avoid
+ * the issues faced with the main shared memory segment, we use a naming
+ * convention similar to that of main shared memory. This can be changed
+ * once the issue mentioned in GetSharedMemName is resolved.
+ */
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+
+ /*
+ * Handle teardown cases. Since Windows automatically destroys the object
+ * when no references remain, we can treat it the same as detach.
+ */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && UnmapViewOfFile(*mapped_address) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ if (*impl_private != NULL
+ && CloseHandle(*impl_private) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *impl_private = NULL;
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ return true;
+ }
+
+ /* Create new segment or open an existing one for attach. */
+ if (op == DSM_OP_CREATE)
+ {
+ DWORD size_high;
+ DWORD size_low;
+ DWORD errcode;
+
+ /* Shifts >= the width of the type are undefined. */
+#ifdef _WIN64
+ size_high = request_size >> 32;
+#else
+ size_high = 0;
+#endif
+ size_low = (DWORD) request_size;
+
+ /* CreateFileMapping might not clear the error code on success */
+ SetLastError(0);
+
+ hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
+ NULL, /* Default security attrs */
+ PAGE_READWRITE, /* Memory is read/write */
+ size_high, /* Upper 32 bits of size */
+ size_low, /* Lower 32 bits of size */
+ name);
+
+ errcode = GetLastError();
+ if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
+ {
+ /*
+ * On Windows, when the segment already exists, a handle for the
+ * existing segment is returned, and we must close it before
+ * returning. However, if the existing segment was created by a
+ * service, the call returns ERROR_ACCESS_DENIED instead. We don't
+ * call _dosmaperr here, so errno won't be modified.
+ */
+ if (hmap)
+ CloseHandle(hmap);
+ return false;
+ }
+
+ if (!hmap)
+ {
+ _dosmaperr(errcode);
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not create shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+ else
+ {
+ hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
+ FALSE, /* do not inherit the name */
+ name); /* name of mapping object */
+ if (!hmap)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+
+ /* Map it. */
+ address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
+ 0, 0, 0);
+ if (!address)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * VirtualQuery reports the size in page-size units, which is 4K on
+ * Windows. We only need the size when attaching, but it's better to get
+ * it when creating a new segment as well, so that the size is obtained
+ * consistently for both DSM_OP_CREATE and DSM_OP_ATTACH.
+ */
+ if (VirtualQuery(address, &info, sizeof(info)) == 0)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ UnmapViewOfFile(address);
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *mapped_address = address;
+ *mapped_size = info.RegionSize;
+ *impl_private = hmap;
+
+ return true;
+}
+#endif
+
+#ifdef USE_DSM_MMAP
+/*
+ * Operating system primitives to support mmap-based shared memory.
+ *
+ * Calling this "shared memory" is somewhat of a misnomer, because what
+ * we're really doing is creating a bunch of files and mapping them into
+ * our address space. The operating system may feel obliged to
+ * synchronize the contents to disk even if nothing is being paged out,
+ * which will not serve us well. The user can relocate the pg_dynshmem
+ * directory to a ramdisk, if one is available, to avoid this problem.
+ */
+static bool
+dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address, Size *mapped_size,
+ int elevel)
+{
+ char name[64];
+ int flags;
+ int fd;
+ char *address;
+
+ snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
+ handle);
+
+ /* Handle teardown cases. */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && munmap(*mapped_address, *mapped_size) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ if (op == DSM_OP_DESTROY && unlink(name) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ return true;
+ }
+
+ /* Create new segment or open an existing one for attach. */
+ flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+ if ((fd = OpenTransientFile(name, flags)) == -1)
+ {
+ if (op == DSM_OP_ATTACH || errno != EEXIST)
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * If we're attaching the segment, determine the current size; if we are
+ * creating the segment, set the size to the requested value.
+ */
+ if (op == DSM_OP_ATTACH)
+ {
+ struct stat st;
+
+ if (fstat(fd, &st) != 0)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ request_size = st.st_size;
+ }
+ else
+ {
+ /*
+ * Allocate a buffer full of zeros.
+ *
+ * Note: palloc zbuffer, instead of just using a local char array, to
+ * ensure it is reasonably well-aligned; this may save a few cycles
+ * transferring data to the kernel.
+ */
+ char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
+ uint32 remaining = request_size;
+ bool success = true;
+
+ /*
+ * Zero-fill the file. We have to do this the hard way to ensure that
+ * all the file space has really been allocated, so that we don't
+ * later seg fault when accessing the memory mapping. This is pretty
+ * pessimal.
+ */
+ while (success && remaining > 0)
+ {
+ Size goal = remaining;
+
+ if (goal > ZBUFFER_SIZE)
+ goal = ZBUFFER_SIZE;
+ pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
+ if (write(fd, zbuffer, goal) == goal)
+ remaining -= goal;
+ else
+ success = false;
+ pgstat_report_wait_end();
+ }
+
+ if (!success)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ unlink(name);
+ errno = save_errno ? save_errno : ENOSPC;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+ name, request_size)));
+ return false;
+ }
+ }
+
+ /* Map it. */
+ address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+ if (address == MAP_FAILED)
+ {
+ int save_errno;
+
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseTransientFile(fd);
+ if (op == DSM_OP_CREATE)
+ unlink(name);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ *mapped_address = address;
+ *mapped_size = request_size;
+
+ if (CloseTransientFile(fd) != 0)
+ {
+ ereport(elevel,
+ (errcode_for_file_access(),
+ errmsg("could not close shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ return true;
+}
+#endif
+
+/*
+ * Implementation-specific actions that must be performed when a segment is to
+ * be preserved even when no backend has it attached.
+ *
+ * Except on Windows, we don't need to do anything at all. But since Windows
+ * cleans up segments automatically when no references remain, we duplicate
+ * the segment handle into the postmaster process. The postmaster needn't
+ * do anything to receive the handle; Windows transfers it automatically.
+ */
+void
+dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
+ void **impl_private_pm_handle)
+{
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ if (IsUnderPostmaster)
+ {
+ HANDLE hmap;
+
+ if (!DuplicateHandle(GetCurrentProcess(), impl_private,
+ PostmasterHandle, &hmap, 0, FALSE,
+ DUPLICATE_SAME_ACCESS))
+ {
+ char name[64];
+
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+ _dosmaperr(GetLastError());
+ ereport(ERROR,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not duplicate handle for \"%s\": %m",
+ name)));
+ }
+
+ /*
+ * Here, we remember the handle that we created in the
+ * postmaster process. This handle isn't actually usable in
+ * any process other than the postmaster, but that doesn't
+ * matter. We're just holding onto it so that, if the segment
+ * is unpinned, dsm_impl_unpin_segment can close it.
+ */
+ *impl_private_pm_handle = hmap;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+/*
+ * Implementation-specific actions that must be performed when a segment is no
+ * longer to be preserved, so that it will be cleaned up when all backends
+ * have detached from it.
+ *
+ * Except on Windows, we don't need to do anything at all. For Windows, we
+ * close the extra handle that dsm_impl_pin_segment created in the
+ * postmaster's process space.
+ */
+void
+dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
+{
+ switch (dynamic_shared_memory_type)
+ {
+#ifdef USE_DSM_WINDOWS
+ case DSM_IMPL_WINDOWS:
+ if (IsUnderPostmaster)
+ {
+ if (*impl_private &&
+ !DuplicateHandle(PostmasterHandle, *impl_private,
+ NULL, NULL, 0, FALSE,
+ DUPLICATE_CLOSE_SOURCE))
+ {
+ char name[64];
+
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+ _dosmaperr(GetLastError());
+ ereport(ERROR,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not duplicate handle for \"%s\": %m",
+ name)));
+ }
+
+ *impl_private = NULL;
+ }
+ break;
+#endif
+ default:
+ break;
+ }
+}
+
+static int
+errcode_for_dynamic_shared_memory(void)
+{
+ if (errno == EFBIG || errno == ENOMEM)
+ return errcode(ERRCODE_OUT_OF_MEMORY);
+ else
+ return errcode_for_file_access();
+}
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
new file mode 100644
index 0000000..6591b5d
--- /dev/null
+++ b/src/backend/storage/ipc/ipc.c
@@ -0,0 +1,439 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.c
+ * POSTGRES inter-process communication definitions.
+ *
+ * This file is misnamed, as it no longer has much of anything directly
+ * to do with IPC. The functionality here is concerned with managing
+ * exit-time cleanup for either a postmaster or a backend.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#ifdef PROFILE_PID_DIR
+#include "postmaster/autovacuum.h"
+#endif
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "tcop/tcopprot.h"
+
+
+/*
+ * This flag is set during proc_exit() to change ereport()'s behavior,
+ * so that an ereport() from an on_proc_exit routine cannot get us out
+ * of the exit procedure. We do NOT want to go back to the idle loop...
+ */
+bool proc_exit_inprogress = false;
+
+/*
+ * Set when shmem_exit() is in progress.
+ */
+bool shmem_exit_inprogress = false;
+
+/*
+ * This flag tracks whether we've called atexit() in the current process
+ * (or in the parent postmaster).
+ */
+static bool atexit_callback_setup = false;
+
+/* local functions */
+static void proc_exit_prepare(int code);
+
+
+/* ----------------------------------------------------------------
+ * exit() handling stuff
+ *
+ * These functions are generally in the same spirit as atexit(),
+ * but provide some additional features we need --- in particular,
+ * we want to register callbacks to invoke when we are disconnecting
+ * from a broken shared-memory context but not exiting the postmaster.
+ *
+ * Callback functions can take zero, one, or two args: the first passed
+ * arg is the integer exitcode, the second is the Datum supplied when
+ * the callback was registered.
+ * ----------------------------------------------------------------
+ */
+
+#define MAX_ON_EXITS 20
+
+struct ONEXIT
+{
+ pg_on_exit_callback function;
+ Datum arg;
+};
+
+static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS];
+static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS];
+static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS];
+
+static int on_proc_exit_index,
+ on_shmem_exit_index,
+ before_shmem_exit_index;
+
+
+/* ----------------------------------------------------------------
+ * proc_exit
+ *
+ * this function calls all the callbacks registered
+ * for it (to free resources) and then calls exit.
+ *
+ * This should be the only function to call exit().
+ * -cim 2/6/90
+ *
+ * Unfortunately, we can't really guarantee that add-on code
+ * obeys the rule of not calling exit() directly. So, while
+ * this is the preferred way out of the system, we also register
+ * an atexit callback that will make sure cleanup happens.
+ * ----------------------------------------------------------------
+ */
+void
+proc_exit(int code)
+{
+ /* not safe if forked by system(), etc. */
+ if (MyProcPid != (int) getpid())
+ elog(PANIC, "proc_exit() called in child process");
+
+ /* Clean up everything that must be cleaned up */
+ proc_exit_prepare(code);
+
+#ifdef PROFILE_PID_DIR
+ {
+ /*
+ * If we are profiling ourself then gprof's mcleanup() is about to
+ * write out a profile to ./gmon.out. Since mcleanup() always uses a
+ * fixed file name, each backend will overwrite earlier profiles. To
+ * fix that, we create a separate subdirectory for each backend
+ * (./gprof/pid) and 'cd' to that subdirectory before we exit() - that
+ * forces mcleanup() to write each profile into its own directory. We
+ * end up with something like: $PGDATA/gprof/8829/gmon.out
+ * $PGDATA/gprof/8845/gmon.out ...
+ *
+ * To avoid undesirable disk space bloat, autovacuum workers are
+ * discriminated against: all their gmon.out files go into the same
+ * subdirectory. Without this, an installation that is "just sitting
+ * there" nonetheless eats megabytes of disk space every few seconds.
+ *
+ * Note that we do this here instead of in an on_proc_exit() callback
+ * because we want to ensure that this code executes last - we don't
+ * want to interfere with any other on_proc_exit() callback. For the
+ * same reason, we do not include it in proc_exit_prepare ... so if
+ * you are exiting in the "wrong way" you won't drop your profile in a
+ * nice place.
+ */
+ char gprofDirName[32];
+
+ if (IsAutoVacuumWorkerProcess())
+ snprintf(gprofDirName, 32, "gprof/avworker");
+ else
+ snprintf(gprofDirName, 32, "gprof/%d", (int) getpid());
+
+ /*
+ * Use mkdir() instead of MakePGDirectory() since we aren't making a
+ * PG directory here.
+ */
+ mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO);
+ mkdir(gprofDirName, S_IRWXU | S_IRWXG | S_IRWXO);
+ chdir(gprofDirName);
+ }
+#endif
+
+ elog(DEBUG3, "exit(%d)", code);
+
+ exit(code);
+}
+
+/*
+ * Code shared between proc_exit and the atexit handler. Note that in
+ * normal exit through proc_exit, this will actually be called twice ...
+ * but the second call will have nothing to do.
+ */
+static void
+proc_exit_prepare(int code)
+{
+ /*
+ * Once we set this flag, we are committed to exit. Any ereport() will
+ * NOT send control back to the main loop, but right back here.
+ */
+ proc_exit_inprogress = true;
+
+ /*
+ * Forget any pending cancel or die requests; we're doing our best to
+ * close up shop already. Note that the signal handlers will not set
+ * these flags again, now that proc_exit_inprogress is set.
+ */
+ InterruptPending = false;
+ ProcDiePending = false;
+ QueryCancelPending = false;
+ InterruptHoldoffCount = 1;
+ CritSectionCount = 0;
+
+ /*
+ * Also clear the error context stack, to prevent error callbacks from
+ * being invoked by any elog/ereport calls made during proc_exit. Whatever
+ * context they might want to offer is probably not relevant, and in any
+ * case they are likely to fail outright after we've done things like
+ * aborting any open transaction. (In normal exit scenarios the context
+ * stack should be empty anyway, but it might not be in the case of
+ * elog(FATAL) for example.)
+ */
+ error_context_stack = NULL;
+ /* For the same reason, reset debug_query_string before it's clobbered */
+ debug_query_string = NULL;
+
+ /* do our shared memory exits first */
+ shmem_exit(code);
+
+ elog(DEBUG3, "proc_exit(%d): %d callbacks to make",
+ code, on_proc_exit_index);
+
+ /*
+ * call all the registered callbacks.
+ *
+ * Note that since we decrement on_proc_exit_index each time, if a
+ * callback calls ereport(ERROR) or ereport(FATAL) then it won't be
+ * invoked again when control comes back here (nor will the
+ * previously-completed callbacks). So, an infinite loop should not be
+ * possible.
+ */
+ while (--on_proc_exit_index >= 0)
+ on_proc_exit_list[on_proc_exit_index].function(code,
+ on_proc_exit_list[on_proc_exit_index].arg);
+
+ on_proc_exit_index = 0;
+}
+
+/* ------------------
+ * Run all of the on_shmem_exit routines --- but don't actually exit.
+ * This is used by the postmaster to re-initialize shared memory and
+ * semaphores after a backend dies horribly. As with proc_exit(), we
+ * remove each callback from the list before calling it, to avoid
+ * infinite loop in case of error.
+ * ------------------
+ */
+void
+shmem_exit(int code)
+{
+ shmem_exit_inprogress = true;
+
+ /*
+ * Call before_shmem_exit callbacks.
+ *
+ * These should be things that need most of the system to still be up and
+ * working, such as cleanup of temp relations, which requires catalog
+ * access; or things that need to be completed because later cleanup steps
+ * depend on them, such as releasing lwlocks.
+ */
+ elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make",
+ code, before_shmem_exit_index);
+ while (--before_shmem_exit_index >= 0)
+ before_shmem_exit_list[before_shmem_exit_index].function(code,
+ before_shmem_exit_list[before_shmem_exit_index].arg);
+ before_shmem_exit_index = 0;
+
+ /*
+ * Call dynamic shared memory callbacks.
+ *
+ * These serve the same purpose as late callbacks, but for dynamic shared
+ * memory segments rather than the main shared memory segment.
+ * dsm_backend_shutdown() has the same kind of progressive logic we use
+ * for the main shared memory segment; namely, it unregisters each
+ * callback before invoking it, so that we don't get stuck in an infinite
+ * loop if one of those callbacks itself throws an ERROR or FATAL.
+ *
+ * Note that explicitly calling this function here is quite different from
+ * registering it as an on_shmem_exit callback for precisely this reason:
+ * if one dynamic shared memory callback errors out, the remaining
+ * callbacks will still be invoked. Thus, hard-coding this call puts it on
+ * an equal footing with callbacks for the main shared memory segment.
+ */
+ dsm_backend_shutdown();
+
+ /*
+ * Call on_shmem_exit callbacks.
+ *
+ * These are generally releasing low-level shared memory resources. In
+ * some cases, this is a backstop against the possibility that the early
+ * callbacks might themselves fail, leading to re-entry to this routine;
+ * in other cases, it's cleanup that only happens at process exit.
+ */
+ elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make",
+ code, on_shmem_exit_index);
+ while (--on_shmem_exit_index >= 0)
+ on_shmem_exit_list[on_shmem_exit_index].function(code,
+ on_shmem_exit_list[on_shmem_exit_index].arg);
+ on_shmem_exit_index = 0;
+
+ shmem_exit_inprogress = false;
+}
+
+/* ----------------------------------------------------------------
+ * atexit_callback
+ *
+ * Backstop to ensure that direct calls of exit() don't mess us up.
+ *
+ * Somebody who was being really uncooperative could call _exit(),
+ * but for that case we have a "dead man switch" that will make the
+ * postmaster treat it as a crash --- see pmsignal.c.
+ * ----------------------------------------------------------------
+ */
+static void
+atexit_callback(void)
+{
+ /* Clean up everything that must be cleaned up */
+ /* ... too bad we don't know the real exit code ... */
+ proc_exit_prepare(-1);
+}
+
+/* ----------------------------------------------------------------
+ * on_proc_exit
+ *
+ * this function adds a callback function to the list of
+ * functions invoked by proc_exit(). -cim 2/6/90
+ * ----------------------------------------------------------------
+ */
+void
+on_proc_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (on_proc_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of on_proc_exit slots")));
+
+ on_proc_exit_list[on_proc_exit_index].function = function;
+ on_proc_exit_list[on_proc_exit_index].arg = arg;
+
+ ++on_proc_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
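+
+/*
+ * Illustrative sketch only (never compiled; guarded by NOT_USED):
+ * registering an exit callback.  The callback receives the exit code and
+ * the Datum supplied at registration time.  The names below are
+ * hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_proc_exit_callback(int code, Datum arg)
+{
+ elog(DEBUG1, "exiting with code %d, arg %d", code, DatumGetInt32(arg));
+}
+
+static void
+example_register_callback(void)
+{
+ on_proc_exit(example_proc_exit_callback, Int32GetDatum(7));
+}
+#endif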
+
+/* ----------------------------------------------------------------
+ * before_shmem_exit
+ *
+ * Register early callback to perform user-level cleanup,
+ * e.g. transaction abort, before we begin shutting down
+ * low-level subsystems.
+ * ----------------------------------------------------------------
+ */
+void
+before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (before_shmem_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of before_shmem_exit slots")));
+
+ before_shmem_exit_list[before_shmem_exit_index].function = function;
+ before_shmem_exit_list[before_shmem_exit_index].arg = arg;
+
+ ++before_shmem_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * on_shmem_exit
+ *
+ * Register ordinary callback to perform low-level shutdown
+ * (e.g. releasing our PGPROC); run after before_shmem_exit
+ * callbacks and before on_proc_exit callbacks.
+ * ----------------------------------------------------------------
+ */
+void
+on_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (on_shmem_exit_index >= MAX_ON_EXITS)
+ ereport(FATAL,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg_internal("out of on_shmem_exit slots")));
+
+ on_shmem_exit_list[on_shmem_exit_index].function = function;
+ on_shmem_exit_list[on_shmem_exit_index].arg = arg;
+
+ ++on_shmem_exit_index;
+
+ if (!atexit_callback_setup)
+ {
+ atexit(atexit_callback);
+ atexit_callback_setup = true;
+ }
+}
+
+/* ----------------------------------------------------------------
+ * cancel_before_shmem_exit
+ *
+ * this function removes a previously-registered before_shmem_exit
+ * callback. We only look at the latest entry for removal, as we
+ * expect callers to add and remove temporary before_shmem_exit
+ * callbacks in strict LIFO order.
+ * ----------------------------------------------------------------
+ */
+void
+cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+ if (before_shmem_exit_index > 0 &&
+ before_shmem_exit_list[before_shmem_exit_index - 1].function
+ == function &&
+ before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg)
+ --before_shmem_exit_index;
+ else
+ elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry",
+ function, (long long) arg);
+}
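+
+/*
+ * Illustrative sketch only (never compiled; guarded by NOT_USED): the
+ * expected strict-LIFO pairing of before_shmem_exit() and
+ * cancel_before_shmem_exit() around an operation that needs cleanup on
+ * abnormal exit.  The names below are hypothetical.
+ */
+#ifdef NOT_USED
+static void
+example_cleanup_callback(int code, Datum arg)
+{
+ /* release whatever resource 'arg' identifies */
+}
+
+static void
+example_guarded_operation(void)
+{
+ before_shmem_exit(example_cleanup_callback, Int32GetDatum(0));
+
+ /* ... do work that needs cleanup if we exit abnormally ... */
+
+ /* Must cancel the most recently registered entry, with the same args. */
+ cancel_before_shmem_exit(example_cleanup_callback, Int32GetDatum(0));
+}
+#endif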
+
+/* ----------------------------------------------------------------
+ * on_exit_reset
+ *
+ * this function clears all on_proc_exit() and on_shmem_exit()
+ * registered functions. This is used just after forking a backend,
+ * so that the backend doesn't believe it should call the postmaster's
+ * on-exit routines when it exits...
+ * ----------------------------------------------------------------
+ */
+void
+on_exit_reset(void)
+{
+ before_shmem_exit_index = 0;
+ on_shmem_exit_index = 0;
+ on_proc_exit_index = 0;
+ reset_on_dsm_detach();
+}
+
+/* ----------------------------------------------------------------
+ * check_on_shmem_exit_lists_are_empty
+ *
+ * Debugging check that no shmem cleanup handlers have been registered
+ * prematurely in the current process.
+ * ----------------------------------------------------------------
+ */
+void
+check_on_shmem_exit_lists_are_empty(void)
+{
+ if (before_shmem_exit_index)
+ elog(FATAL, "before_shmem_exit has been called prematurely");
+ if (on_shmem_exit_index)
+ elog(FATAL, "on_shmem_exit has been called prematurely");
+ /* Checking DSM detach state seems unnecessary given the above */
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
new file mode 100644
index 0000000..8f1ded7
--- /dev/null
+++ b/src/backend/storage/ipc/ipci.c
@@ -0,0 +1,354 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipci.c
+ * POSTGRES inter-process communication initialization code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipci.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/nbtree.h"
+#include "access/subtrans.h"
+#include "access/syncscan.h"
+#include "access/twophase.h"
+#include "access/xlogprefetcher.h"
+#include "access/xlogrecovery.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
+#include "storage/bufmgr.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "utils/guc.h"
+#include "utils/snapmgr.h"
+
+/* GUCs */
+int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE;
+
+shmem_startup_hook_type shmem_startup_hook = NULL;
+
+static Size total_addin_request = 0;
+
+/*
+ * RequestAddinShmemSpace
+ * Request that extra shmem space be allocated for use by
+ * a loadable module.
+ *
+ * This may only be called via the shmem_request_hook of a library that is
+ * loaded into the postmaster via shared_preload_libraries. Calls from
+ * elsewhere will fail.
+ */
+void
+RequestAddinShmemSpace(Size size)
+{
+ if (!process_shmem_requests_in_progress)
+ elog(FATAL, "cannot request additional shared memory outside shmem_request_hook");
+ total_addin_request = add_size(total_addin_request, size);
+}
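+
+/*
+ * Illustrative sketch only (never compiled; guarded by NOT_USED): how a
+ * loadable module would typically call RequestAddinShmemSpace() from its
+ * shmem_request_hook.  The hook variable and its typedef are declared
+ * elsewhere (not in this file), and the function names and sizes below are
+ * hypothetical.
+ */
+#ifdef NOT_USED
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+
+static void
+example_shmem_request(void)
+{
+ if (prev_shmem_request_hook)
+ prev_shmem_request_hook();
+
+ /* Hypothetical request: room for 1024 ints plus 8KB of slop. */
+ RequestAddinShmemSpace(add_size(sizeof(int) * 1024, 8192));
+}
+
+void
+_PG_init(void)
+{
+ prev_shmem_request_hook = shmem_request_hook;
+ shmem_request_hook = example_shmem_request;
+}
+#endif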
+
+/*
+ * CalculateShmemSize
+ * Calculates the amount of shared memory and number of semaphores needed.
+ *
+ * If num_semaphores is not NULL, it will be set to the number of semaphores
+ * required.
+ */
+Size
+CalculateShmemSize(int *num_semaphores)
+{
+ Size size;
+ int numSemas;
+
+ /* Compute number of semaphores we'll need */
+ numSemas = ProcGlobalSemas();
+ numSemas += SpinlockSemas();
+
+ /* Return the number of semaphores if requested by the caller */
+ if (num_semaphores)
+ *num_semaphores = numSemas;
+
+ /*
+ * Size of the Postgres shared-memory block is estimated via moderately-
+ * accurate estimates for the big hogs, plus 100K for the stuff that's too
+ * small to bother with estimating.
+ *
+ * We take some care to ensure that the total size request doesn't
+ * overflow size_t. If this gets through, we don't need to be so careful
+ * during the actual allocation phase.
+ */
+ size = 100000;
+ size = add_size(size, PGSemaphoreShmemSize(numSemas));
+ size = add_size(size, SpinlockSemaSize());
+ size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
+ sizeof(ShmemIndexEnt)));
+ size = add_size(size, dsm_estimate_size());
+ size = add_size(size, BufferShmemSize());
+ size = add_size(size, LockShmemSize());
+ size = add_size(size, PredicateLockShmemSize());
+ size = add_size(size, ProcGlobalShmemSize());
+ size = add_size(size, XLogPrefetchShmemSize());
+ size = add_size(size, XLOGShmemSize());
+ size = add_size(size, XLogRecoveryShmemSize());
+ size = add_size(size, CLOGShmemSize());
+ size = add_size(size, CommitTsShmemSize());
+ size = add_size(size, SUBTRANSShmemSize());
+ size = add_size(size, TwoPhaseShmemSize());
+ size = add_size(size, BackgroundWorkerShmemSize());
+ size = add_size(size, MultiXactShmemSize());
+ size = add_size(size, LWLockShmemSize());
+ size = add_size(size, ProcArrayShmemSize());
+ size = add_size(size, BackendStatusShmemSize());
+ size = add_size(size, SInvalShmemSize());
+ size = add_size(size, PMSignalShmemSize());
+ size = add_size(size, ProcSignalShmemSize());
+ size = add_size(size, CheckpointerShmemSize());
+ size = add_size(size, AutoVacuumShmemSize());
+ size = add_size(size, ReplicationSlotsShmemSize());
+ size = add_size(size, ReplicationOriginShmemSize());
+ size = add_size(size, WalSndShmemSize());
+ size = add_size(size, WalRcvShmemSize());
+ size = add_size(size, PgArchShmemSize());
+ size = add_size(size, ApplyLauncherShmemSize());
+ size = add_size(size, SnapMgrShmemSize());
+ size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
+ size = add_size(size, AsyncShmemSize());
+ size = add_size(size, StatsShmemSize());
+#ifdef EXEC_BACKEND
+ size = add_size(size, ShmemBackendArraySize());
+#endif
+
+ /* include additional requested shmem from preload libraries */
+ size = add_size(size, total_addin_request);
+
+ /* might as well round it off to a multiple of a typical page size */
+ size = add_size(size, 8192 - (size % 8192));
+
+ return size;
+}
+
+/*
+ * CreateSharedMemoryAndSemaphores
+ * Creates and initializes shared memory and semaphores.
+ *
+ * This is called by the postmaster or by a standalone backend.
+ * It is also called by a backend forked from the postmaster in the
+ * EXEC_BACKEND case. In the latter case, the shared memory segment
+ * already exists and has been physically attached to, but we have to
+ * initialize pointers in local memory that reference the shared structures,
+ * because we didn't inherit the correct pointer values from the postmaster
+ * as we do in the fork() scenario. The easiest way to do that is to run
+ * through the same code as before. (Note that the called routines mostly
+ * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case.
+ * This is a bit code-wasteful and could be cleaned up.)
+ */
+void
+CreateSharedMemoryAndSemaphores(void)
+{
+ PGShmemHeader *shim = NULL;
+
+ if (!IsUnderPostmaster)
+ {
+ PGShmemHeader *seghdr;
+ Size size;
+ int numSemas;
+
+ /* Compute the size of the shared-memory block */
+ size = CalculateShmemSize(&numSemas);
+ elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
+
+ /*
+ * Create the shmem segment
+ */
+ seghdr = PGSharedMemoryCreate(size, &shim);
+
+ InitShmemAccess(seghdr);
+
+ /*
+ * Create semaphores
+ */
+ PGReserveSemaphores(numSemas);
+
+ /*
+ * If spinlocks are disabled, initialize emulation layer (which
+ * depends on semaphores, so the order is important here).
+ */
+#ifndef HAVE_SPINLOCKS
+ SpinlockSemaInit();
+#endif
+ }
+ else
+ {
+ /*
+ * We are reattaching to an existing shared memory segment. This
+ * should only be reached in the EXEC_BACKEND case.
+ */
+#ifndef EXEC_BACKEND
+ elog(PANIC, "should be attached to shared memory already");
+#endif
+ }
+
+ /*
+ * Set up shared memory allocation mechanism
+ */
+ if (!IsUnderPostmaster)
+ InitShmemAllocation();
+
+ /*
+ * Now initialize LWLocks, which do shared memory allocation and are
+ * needed for InitShmemIndex.
+ */
+ CreateLWLocks();
+
+ /*
+ * Set up shmem.c index hashtable
+ */
+ InitShmemIndex();
+
+ dsm_shmem_init();
+
+ /*
+ * Set up xlog, clog, and buffers
+ */
+ XLOGShmemInit();
+ XLogPrefetchShmemInit();
+ XLogRecoveryShmemInit();
+ CLOGShmemInit();
+ CommitTsShmemInit();
+ SUBTRANSShmemInit();
+ MultiXactShmemInit();
+ InitBufferPool();
+
+ /*
+ * Set up lock manager
+ */
+ InitLocks();
+
+ /*
+ * Set up predicate lock manager
+ */
+ InitPredicateLocks();
+
+ /*
+ * Set up process table
+ */
+ if (!IsUnderPostmaster)
+ InitProcGlobal();
+ CreateSharedProcArray();
+ CreateSharedBackendStatus();
+ TwoPhaseShmemInit();
+ BackgroundWorkerShmemInit();
+
+ /*
+ * Set up shared-inval messaging
+ */
+ CreateSharedInvalidationState();
+
+ /*
+ * Set up interprocess signaling mechanisms
+ */
+ PMSignalShmemInit();
+ ProcSignalShmemInit();
+ CheckpointerShmemInit();
+ AutoVacuumShmemInit();
+ ReplicationSlotsShmemInit();
+ ReplicationOriginShmemInit();
+ WalSndShmemInit();
+ WalRcvShmemInit();
+ PgArchShmemInit();
+ ApplyLauncherShmemInit();
+
+ /*
+ * Set up other modules that need some shared memory space
+ */
+ SnapMgrInit();
+ BTreeShmemInit();
+ SyncScanShmemInit();
+ AsyncShmemInit();
+ StatsShmemInit();
+
+#ifdef EXEC_BACKEND
+
+ /*
+ * Alloc the win32 shared backend array
+ */
+ if (!IsUnderPostmaster)
+ ShmemBackendArrayAllocation();
+#endif
+
+ /* Initialize dynamic shared memory facilities. */
+ if (!IsUnderPostmaster)
+ dsm_postmaster_startup(shim);
+
+ /*
+ * Now give loadable modules a chance to set up their shmem allocations
+ */
+ if (shmem_startup_hook)
+ shmem_startup_hook();
+}
+
+/*
+ * InitializeShmemGUCs
+ *
+ * This function initializes runtime-computed GUCs related to the amount of
+ * shared memory required for the current configuration.
+ */
+void
+InitializeShmemGUCs(void)
+{
+ char buf[64];
+ Size size_b;
+ Size size_mb;
+ Size hp_size;
+
+ /*
+ * Calculate the shared memory size and round up to the nearest megabyte.
+ */
+ size_b = CalculateShmemSize(NULL);
+ size_mb = add_size(size_b, (1024 * 1024) - 1) / (1024 * 1024);
+ sprintf(buf, "%zu", size_mb);
+ SetConfigOption("shared_memory_size", buf,
+ PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+
+ /*
+ * Calculate the number of huge pages required.
+ */
+ GetHugePageSize(&hp_size, NULL);
+ if (hp_size != 0)
+ {
+ Size hp_required;
+
+ hp_required = add_size(size_b / hp_size, 1);
+ sprintf(buf, "%zu", hp_required);
+ SetConfigOption("shared_memory_size_in_huge_pages", buf,
+ PGC_INTERNAL, PGC_S_DYNAMIC_DEFAULT);
+ }
+}
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
new file mode 100644
index 0000000..cdb95c1
--- /dev/null
+++ b/src/backend/storage/ipc/latch.c
@@ -0,0 +1,2268 @@
+/*-------------------------------------------------------------------------
+ *
+ * latch.c
+ * Routines for inter-process latches
+ *
+ * The poll() implementation uses the so-called self-pipe trick to overcome the
+ * race condition involved with poll() and setting a global flag in the signal
+ * handler. When a latch is set and the current process is waiting for it, the
+ * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
+ * A signal by itself doesn't interrupt poll() on all platforms, and even on
+ * platforms where it does, a signal that arrives just before the poll() call
+ * does not prevent poll() from entering sleep. An incoming byte on a pipe
+ * however reliably interrupts the sleep, and causes poll() to return
+ * immediately even if the signal arrives before poll() begins.
+ *
+ * The epoll() implementation overcomes the race with a different technique: it
+ * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
+ * don't need to register a signal handler or create our own self-pipe. We
+ * assume that any system that has Linux epoll() also has Linux signalfd().
+ *
+ * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
+ *
+ * The Windows implementation uses Windows events that are inherited by all
+ * postmaster child processes. There's no need for the self-pipe trick there.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/latch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
+#ifdef HAVE_SYS_EVENT_H
+#include <sys/event.h>
+#endif
+#ifdef HAVE_SYS_SIGNALFD_H
+#include <sys/signalfd.h>
+#endif
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "portability/instr_time.h"
+#include "postmaster/postmaster.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+
+/*
+ * Select the fd readiness primitive to use. Normally the "most modern"
+ * primitive supported by the OS will be used, but for testing it can be
+ * useful to specify the primitive manually. If desired, just add a
+ * define somewhere before this block.
+ */
+#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
+ defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
+/* don't overwrite manual choice */
+#elif defined(HAVE_SYS_EPOLL_H)
+#define WAIT_USE_EPOLL
+#elif defined(HAVE_KQUEUE)
+#define WAIT_USE_KQUEUE
+#elif defined(HAVE_POLL)
+#define WAIT_USE_POLL
+#elif WIN32
+#define WAIT_USE_WIN32
+#else
+#error "no wait set implementation available"
+#endif
+
+/*
+ * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
+ * available. We avoid signalfd on illumos for now based on problem reports.
+ * For testing the choice can also be manually specified.
+ */
+#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+/* don't overwrite manual choice */
+#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \
+ !defined(__illumos__)
+#define WAIT_USE_SIGNALFD
+#else
+#define WAIT_USE_SELF_PIPE
+#endif
+#endif
+
+/* typedef in latch.h */
+struct WaitEventSet
+{
+ int nevents; /* number of registered events */
+ int nevents_space; /* maximum number of events in this set */
+
+ /*
+ * Array, of nevents_space length, storing the definition of events this
+ * set is waiting for.
+ */
+ WaitEvent *events;
+
+ /*
+ * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
+ * said latch, and latch_pos the offset in the ->events array. This is
+ * useful because we check the state of the latch before performing doing
+ * syscalls related to waiting.
+ */
+ Latch *latch;
+ int latch_pos;
+
+ /*
+ * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
+ * is set so that we'll exit immediately if postmaster death is detected,
+ * instead of returning.
+ */
+ bool exit_on_postmaster_death;
+
+#if defined(WAIT_USE_EPOLL)
+ int epoll_fd;
+ /* epoll_wait returns events in a user-provided array; allocate it once */
+ struct epoll_event *epoll_ret_events;
+#elif defined(WAIT_USE_KQUEUE)
+ int kqueue_fd;
+ /* kevent returns events in a user-provided array; allocate it once */
+ struct kevent *kqueue_ret_events;
+ bool report_postmaster_not_running;
+#elif defined(WAIT_USE_POLL)
+ /* poll() expects the full set of events on every call; prepare it once */
+ struct pollfd *pollfds;
+#elif defined(WAIT_USE_WIN32)
+
+ /*
+ * Array of Windows events. The first element always contains
+ * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
+ * event->pos + 1).
+ */
+ HANDLE *handles;
+#endif
+};
+
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *LatchWaitSet;
+
+/* The position of the latch in LatchWaitSet. */
+#define LatchWaitSetLatchPos 0
+
+#ifndef WIN32
+/* Are we currently in WaitLatch? The signal handler would like to know. */
+static volatile sig_atomic_t waiting = false;
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
+static int signal_fd = -1;
+#endif
+
+#ifdef WAIT_USE_SELF_PIPE
+/* Read and write ends of the self-pipe */
+static int selfpipe_readfd = -1;
+static int selfpipe_writefd = -1;
+
+/* Process owning the self-pipe --- needed for checking purposes */
+static int selfpipe_owner_pid = 0;
+
+/* Private function prototypes */
+static void latch_sigurg_handler(SIGNAL_ARGS);
+static void sendSelfPipeByte(void);
+#endif
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+static void drain(void);
+#endif
+
+#if defined(WAIT_USE_EPOLL)
+static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
+#elif defined(WAIT_USE_KQUEUE)
+static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
+#elif defined(WAIT_USE_POLL)
+static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
+#elif defined(WAIT_USE_WIN32)
+static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
+#endif
+
+static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents);
+
+/*
+ * Initialize the process-local latch infrastructure.
+ *
+ * This must be called once during startup of any process that can wait on
+ * latches, before it issues any InitLatch() or OwnLatch() calls.
+ */
+void
+InitializeLatchSupport(void)
+{
+#if defined(WAIT_USE_SELF_PIPE)
+ int pipefd[2];
+
+ if (IsUnderPostmaster)
+ {
+ /*
+ * We might have inherited connections to a self-pipe created by the
+ * postmaster. It's critical that child processes create their own
+ * self-pipes, of course, and we really want them to close the
+ * inherited FDs for safety's sake.
+ */
+ if (selfpipe_owner_pid != 0)
+ {
+ /* Assert we go through here but once in a child process */
+ Assert(selfpipe_owner_pid != MyProcPid);
+ /* Release postmaster's pipe FDs; ignore any error */
+ (void) close(selfpipe_readfd);
+ (void) close(selfpipe_writefd);
+ /* Clean up, just for safety's sake; we'll set these below */
+ selfpipe_readfd = selfpipe_writefd = -1;
+ selfpipe_owner_pid = 0;
+ /* Keep fd.c's accounting straight */
+ ReleaseExternalFD();
+ ReleaseExternalFD();
+ }
+ else
+ {
+ /*
+ * Postmaster didn't create a self-pipe ... or else we're in an
+ * EXEC_BACKEND build, in which case it doesn't matter since the
+ * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+ * fd.c won't have state to clean up, either.
+ */
+ Assert(selfpipe_readfd == -1);
+ }
+ }
+ else
+ {
+ /* In postmaster or standalone backend, assert we do this but once */
+ Assert(selfpipe_readfd == -1);
+ Assert(selfpipe_owner_pid == 0);
+ }
+
+ /*
+ * Set up the self-pipe that allows a signal handler to wake up the
+ * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so
+ * that SetLatch won't block if the event has already been set many times
+ * filling the kernel buffer. Make the read-end non-blocking too, so that
+ * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
+ * Also, make both FDs close-on-exec, since we surely do not want any
+ * child processes messing with them.
+ */
+ if (pipe(pipefd) < 0)
+ elog(FATAL, "pipe() failed: %m");
+ if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
+ elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
+ if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
+ elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
+ if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
+ elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
+ if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
+ elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
+
+ selfpipe_readfd = pipefd[0];
+ selfpipe_writefd = pipefd[1];
+ selfpipe_owner_pid = MyProcPid;
+
+ /* Tell fd.c about these two long-lived FDs */
+ ReserveExternalFD();
+ ReserveExternalFD();
+
+ pqsignal(SIGURG, latch_sigurg_handler);
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+ sigset_t signalfd_mask;
+
+ if (IsUnderPostmaster)
+ {
+ /*
+ * It would probably be safe to re-use the inherited signalfd since
+ * signalfds only see the current process's pending signals, but it
+ * seems less surprising to close it and create our own.
+ */
+ if (signal_fd != -1)
+ {
+ /* Release postmaster's signal FD; ignore any error */
+ (void) close(signal_fd);
+ signal_fd = -1;
+ ReleaseExternalFD();
+ }
+ }
+
+ /* Block SIGURG, because we'll receive it through a signalfd. */
+ sigaddset(&UnBlockSig, SIGURG);
+
+ /* Set up the signalfd to receive SIGURG notifications. */
+ sigemptyset(&signalfd_mask);
+ sigaddset(&signalfd_mask, SIGURG);
+ signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
+ if (signal_fd < 0)
+ elog(FATAL, "signalfd() failed");
+ ReserveExternalFD();
+#endif
+
+#ifdef WAIT_USE_KQUEUE
+ /* Ignore SIGURG, because we'll receive it via kqueue. */
+ pqsignal(SIGURG, SIG_IGN);
+#endif
+}
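+
+/*
+ * Illustrative sketch, not part of this patch: a process that will wait on
+ * latches is expected to run roughly the following sequence at startup (the
+ * actual call sites live in process-initialization code outside this file):
+ *
+ *		InitializeLatchSupport();		(self-pipe/signalfd/kqueue setup)
+ *		InitLatch(MyLatch);				(make the process-local latch usable)
+ *		InitializeLatchWaitSet();		(build LatchWaitSet for WaitLatch)
+ *
+ * Calling InitLatch() or OwnLatch() before InitializeLatchSupport() trips
+ * the asserts in those functions on the self-pipe and signalfd builds.
+ */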
+
+void
+InitializeLatchWaitSet(void)
+{
+ int latch_pos PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(LatchWaitSet == NULL);
+
+ /* Set up the WaitEventSet used by WaitLatch(). */
+ LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+ latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+ MyLatch, NULL);
+ if (IsUnderPostmaster)
+ AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
+ PGINVALID_SOCKET, NULL, NULL);
+
+ Assert(latch_pos == LatchWaitSetLatchPos);
+}
+
+void
+ShutdownLatchSupport(void)
+{
+#if defined(WAIT_USE_POLL)
+ pqsignal(SIGURG, SIG_IGN);
+#endif
+
+ if (LatchWaitSet)
+ {
+ FreeWaitEventSet(LatchWaitSet);
+ LatchWaitSet = NULL;
+ }
+
+#if defined(WAIT_USE_SELF_PIPE)
+ close(selfpipe_readfd);
+ close(selfpipe_writefd);
+ selfpipe_readfd = -1;
+ selfpipe_writefd = -1;
+ selfpipe_owner_pid = InvalidPid;
+#endif
+
+#if defined(WAIT_USE_SIGNALFD)
+ close(signal_fd);
+ signal_fd = -1;
+#endif
+}
+
+/*
+ * Initialize a process-local latch.
+ */
+void
+InitLatch(Latch *latch)
+{
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = MyProcPid;
+ latch->is_shared = false;
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#elif defined(WAIT_USE_WIN32)
+ latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif /* WIN32 */
+}
+
+/*
+ * Initialize a shared latch that can be set from other processes. The latch
+ * is initially owned by no-one; use OwnLatch to associate it with the
+ * current process.
+ *
+ * InitSharedLatch needs to be called in postmaster before forking child
+ * processes, usually right after allocating the shared memory block
+ * containing the latch with ShmemInitStruct. (The Unix implementation
+ * doesn't actually require that, but the Windows one does.) Because of
+ * this restriction, we have no concurrency issues to worry about here.
+ *
+ * Note that other handles created in this module are never marked as
+ * inheritable. Thus we do not need to worry about cleaning up child
+ * process references to postmaster-private latches or WaitEventSets.
+ */
+void
+InitSharedLatch(Latch *latch)
+{
+#ifdef WIN32
+ SECURITY_ATTRIBUTES sa;
+
+ /*
+ * Set up security attributes to specify that the events are inherited.
+ */
+ ZeroMemory(&sa, sizeof(sa));
+ sa.nLength = sizeof(sa);
+ sa.bInheritHandle = TRUE;
+
+ latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif
+
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = 0;
+ latch->is_shared = true;
+}
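+
+/*
+ * Illustrative sketch, not part of this patch, of the shared-latch
+ * lifecycle described above; "FooShmemStruct"/"foo" are hypothetical:
+ *
+ *		In the postmaster, while setting up shared memory:
+ *			foo = ShmemInitStruct("Foo", sizeof(FooShmemStruct), &found);
+ *			InitSharedLatch(&foo->latch);
+ *
+ *		In the child process that will sleep on it:
+ *			OwnLatch(&foo->latch);
+ *			... ResetLatch()/WaitLatch() loop ...
+ *			DisownLatch(&foo->latch);
+ *
+ *		In any process that wants to wake the sleeper:
+ *			SetLatch(&foo->latch);
+ */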
+
+/*
+ * Associate a shared latch with the current process, allowing it to
+ * wait on the latch.
+ *
+ * Although there is a sanity check for latch-already-owned, we don't do
+ * any sort of locking here, meaning that we could fail to detect the error
+ * if two processes try to own the same latch at about the same time. If
+ * there is any risk of that, caller must provide an interlock to prevent it.
+ */
+void
+OwnLatch(Latch *latch)
+{
+ int owner_pid;
+
+ /* Sanity checks */
+ Assert(latch->is_shared);
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#endif
+
+ owner_pid = latch->owner_pid;
+ if (owner_pid != 0)
+ elog(PANIC, "latch already owned by PID %d", owner_pid);
+
+ latch->owner_pid = MyProcPid;
+}
+
+/*
+ * Disown a shared latch currently owned by the current process.
+ */
+void
+DisownLatch(Latch *latch)
+{
+ Assert(latch->is_shared);
+ Assert(latch->owner_pid == MyProcPid);
+
+ latch->owner_pid = 0;
+}
+
+/*
+ * Wait for a given latch to be set, or for postmaster death, or until timeout
+ * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
+ * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
+ * function returns immediately.
+ *
+ * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
+ * is given. Although it is declared as "long", we don't actually support
+ * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
+ * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
+ *
+ * The latch must be owned by the current process, ie. it must be a
+ * process-local latch initialized with InitLatch, or a shared latch
+ * associated with the current process by calling OwnLatch.
+ *
+ * Returns bit mask indicating which condition(s) caused the wake-up. Note
+ * that if multiple wake-up conditions are true, there is no guarantee that
+ * we return all of them in one call, but we will return at least one.
+ */
+int
+WaitLatch(Latch *latch, int wakeEvents, long timeout,
+ uint32 wait_event_info)
+{
+ WaitEvent event;
+
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
+
+ /*
+ * Some callers may have a latch other than MyLatch, or no latch at all,
+ * or want to handle postmaster death differently. It's cheap to assign
+ * those, so just do it every time.
+ */
+ if (!(wakeEvents & WL_LATCH_SET))
+ latch = NULL;
+ ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch);
+ LatchWaitSet->exit_on_postmaster_death =
+ ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0);
+
+ if (WaitEventSetWait(LatchWaitSet,
+ (wakeEvents & WL_TIMEOUT) ? timeout : -1,
+ &event, 1,
+ wait_event_info) == 0)
+ return WL_TIMEOUT;
+ else
+ return event.events;
+}
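+
+/*
+ * Illustrative sketch, not part of this patch: the canonical way to use
+ * WaitLatch() is to reset the latch before re-checking for work and to wait
+ * at the bottom of the loop, so that a SetLatch() arriving at any point is
+ * never lost.  WAIT_EVENT_EXAMPLE stands in for a real wait_event_info
+ * value:
+ *
+ *		for (;;)
+ *		{
+ *			ResetLatch(MyLatch);
+ *
+ *			if (there is work to do)
+ *				do it (and break out if finished);
+ *
+ *			(void) WaitLatch(MyLatch,
+ *							 WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, -1L,
+ *							 WAIT_EVENT_EXAMPLE);
+ *		}
+ */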
+
+/*
+ * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
+ * conditions.
+ *
+ * When waiting on a socket, EOF and error conditions always cause the socket
+ * to be reported as readable/writable/connected, so that the caller can deal
+ * with the condition.
+ *
+ * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
+ * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
+ * return value if the postmaster dies. The latter is useful for rare cases
+ * where some behavior other than immediate exit is needed.
+ *
+ * NB: These days this is just a wrapper around the WaitEventSet API. When
+ * using a latch very frequently, consider creating a longer living
+ * WaitEventSet instead; that's more efficient.
+ */
+int
+WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
+ long timeout, uint32 wait_event_info)
+{
+ int ret = 0;
+ int rc;
+ WaitEvent event;
+ WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
+
+ if (wakeEvents & WL_TIMEOUT)
+ Assert(timeout >= 0);
+ else
+ timeout = -1;
+
+ if (wakeEvents & WL_LATCH_SET)
+ AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+ latch, NULL);
+
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
+
+ if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
+
+ if (wakeEvents & WL_SOCKET_MASK)
+ {
+ int ev;
+
+ ev = wakeEvents & WL_SOCKET_MASK;
+ AddWaitEventToSet(set, ev, sock, NULL, NULL);
+ }
+
+ rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+
+ if (rc == 0)
+ ret |= WL_TIMEOUT;
+ else
+ {
+ ret |= event.events & (WL_LATCH_SET |
+ WL_POSTMASTER_DEATH |
+ WL_SOCKET_MASK);
+ }
+
+ FreeWaitEventSet(set);
+
+ return ret;
+}
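+
+/*
+ * Illustrative sketch, not part of this patch: waiting for either a latch
+ * wakeup or incoming data on a socket.  "sock" is a hypothetical pgsocket
+ * and WAIT_EVENT_EXAMPLE stands in for a real wait_event_info value:
+ *
+ *		int		rc;
+ *
+ *		rc = WaitLatchOrSocket(MyLatch,
+ *							   WL_LATCH_SET | WL_SOCKET_READABLE |
+ *							   WL_EXIT_ON_PM_DEATH,
+ *							   sock, -1L, WAIT_EVENT_EXAMPLE);
+ *		if (rc & WL_LATCH_SET)
+ *			ResetLatch(MyLatch);
+ *		if (rc & WL_SOCKET_READABLE)
+ *			... read from sock ...
+ *
+ * As noted above, callers doing this very frequently are better served by a
+ * long-lived WaitEventSet.
+ */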
+
+/*
+ * Sets a latch and wakes up anyone waiting on it.
+ *
+ * This is cheap if the latch is already set, otherwise not so much.
+ *
+ * NB: when calling this in a signal handler, be sure to save and restore
+ * errno around it. (That's standard practice in most signal handlers, of
+ * course, but we used to omit it in handlers that only set a flag.)
+ *
+ * NB: this function is called from critical sections and signal handlers so
+ * throwing an error is not a good idea.
+ */
+void
+SetLatch(Latch *latch)
+{
+#ifndef WIN32
+ pid_t owner_pid;
+#else
+ HANDLE handle;
+#endif
+
+ /*
+ * The memory barrier has to be placed here to ensure that any flag
+ * variables possibly changed by this process have been flushed to main
+ * memory, before we check/set is_set.
+ */
+ pg_memory_barrier();
+
+ /* Quick exit if already set */
+ if (latch->is_set)
+ return;
+
+ latch->is_set = true;
+
+ pg_memory_barrier();
+ if (!latch->maybe_sleeping)
+ return;
+
+#ifndef WIN32
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+ * we're in a signal handler. We use the self-pipe or SIGURG to ourselves
+ * to wake up WaitEventSetWaitBlock() without races in that case. If it's
+ * another process, send a signal.
+ *
+ * Fetch owner_pid only once, in case the latch is concurrently getting
+ * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
+ * guaranteed to be true! In practice, the effective range of pid_t fits
+ * in a 32 bit integer, and so should be atomic. In the worst case, we
+ * might end up signaling the wrong process. Even then, you're very
+ * unlucky if a process with that bogus pid exists and belongs to
+ * Postgres; and PG database processes should handle excess SIGUSR1
+ * interrupts without a problem anyhow.
+ *
+ * Another sort of race condition that's possible here is for a new
+ * process to own the latch immediately after we look, so we don't signal
+ * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
+ * the standard coding convention of waiting at the bottom of their loops,
+ * not the top, so that they'll correctly process latch-setting events
+ * that happen before they enter the loop.
+ */
+ owner_pid = latch->owner_pid;
+ if (owner_pid == 0)
+ return;
+ else if (owner_pid == MyProcPid)
+ {
+#if defined(WAIT_USE_SELF_PIPE)
+ if (waiting)
+ sendSelfPipeByte();
+#else
+ if (waiting)
+ kill(MyProcPid, SIGURG);
+#endif
+ }
+ else
+ kill(owner_pid, SIGURG);
+
+#else
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+ * we're in a signal handler.
+ *
+ * Use a local variable here just in case somebody changes the event field
+ * concurrently (which really should not happen).
+ */
+ handle = latch->event;
+ if (handle)
+ {
+ SetEvent(handle);
+
+ /*
+ * Note that we silently ignore any errors. We might be in a signal
+ * handler or other critical path where it's not safe to call elog().
+ */
+ }
+#endif
+}
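+
+/*
+ * Illustrative sketch, not part of this patch: calling SetLatch() from a
+ * signal handler, saving and restoring errno as required by the comment
+ * above.  handle_example_signal and example_flag_pending are hypothetical:
+ *
+ *		static void
+ *		handle_example_signal(SIGNAL_ARGS)
+ *		{
+ *			int			save_errno = errno;
+ *
+ *			example_flag_pending = true;
+ *			SetLatch(MyLatch);
+ *
+ *			errno = save_errno;
+ *		}
+ */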
+
+/*
+ * Clear the latch. Calling WaitLatch after this will sleep, unless
+ * the latch is set again before the WaitLatch call.
+ */
+void
+ResetLatch(Latch *latch)
+{
+ /* Only the owner should reset the latch */
+ Assert(latch->owner_pid == MyProcPid);
+ Assert(latch->maybe_sleeping == false);
+
+ latch->is_set = false;
+
+ /*
+ * Ensure that the write to is_set gets flushed to main memory before we
+ * examine any flag variables. Otherwise a concurrent SetLatch might
+ * falsely conclude that it needn't signal us, even though we have missed
+ * seeing some flag updates that SetLatch was supposed to inform us of.
+ */
+ pg_memory_barrier();
+}
+
+/*
+ * Create a WaitEventSet with space for nevents different events to wait for.
+ *
+ * These events can then be efficiently waited upon together, using
+ * WaitEventSetWait().
+ */
+WaitEventSet *
+CreateWaitEventSet(MemoryContext context, int nevents)
+{
+ WaitEventSet *set;
+ char *data;
+ Size sz = 0;
+
+ /*
+ * Use MAXALIGN size/alignment to guarantee that later uses of memory are
+ * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
+ * platforms, but earlier allocations like WaitEventSet and WaitEvent
+ * might not be sized to guarantee that when purely using sizeof().
+ */
+ sz += MAXALIGN(sizeof(WaitEventSet));
+ sz += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+ sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+ sz += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+ sz += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+ /* need space for the pgwin32_signal_event */
+ sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
+#endif
+
+ data = (char *) MemoryContextAllocZero(context, sz);
+
+ set = (WaitEventSet *) data;
+ data += MAXALIGN(sizeof(WaitEventSet));
+
+ set->events = (WaitEvent *) data;
+ data += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+ set->epoll_ret_events = (struct epoll_event *) data;
+ data += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+ set->kqueue_ret_events = (struct kevent *) data;
+ data += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+ set->pollfds = (struct pollfd *) data;
+ data += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+ set->handles = (HANDLE) data;
+ data += MAXALIGN(sizeof(HANDLE) * nevents);
+#endif
+
+ set->latch = NULL;
+ set->nevents_space = nevents;
+ set->exit_on_postmaster_death = false;
+
+#if defined(WAIT_USE_EPOLL)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though epoll_create1 itself returned EMFILE */
+ elog(ERROR, "epoll_create1 failed: %m");
+ }
+ set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+ if (set->epoll_fd < 0)
+ {
+ ReleaseExternalFD();
+ elog(ERROR, "epoll_create1 failed: %m");
+ }
+#elif defined(WAIT_USE_KQUEUE)
+ if (!AcquireExternalFD())
+ {
+ /* treat this as though kqueue itself returned EMFILE */
+ elog(ERROR, "kqueue failed: %m");
+ }
+ set->kqueue_fd = kqueue();
+ if (set->kqueue_fd < 0)
+ {
+ ReleaseExternalFD();
+ elog(ERROR, "kqueue failed: %m");
+ }
+ if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+ {
+ int save_errno = errno;
+
+ close(set->kqueue_fd);
+ ReleaseExternalFD();
+ errno = save_errno;
+ elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+ }
+ set->report_postmaster_not_running = false;
+#elif defined(WAIT_USE_WIN32)
+
+ /*
+ * To handle signals while waiting, we need to add a win32 specific event.
+ * We accounted for the additional event at the top of this routine. See
+ * port/win32/signal.c for more details.
+ *
+ * Note: pgwin32_signal_event should be first to ensure that it will be
+ * reported when multiple events are set. We want to guarantee that
+ * pending signals are serviced.
+ */
+ set->handles[0] = pgwin32_signal_event;
+ StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
+#endif
+
+ return set;
+}
+
+/*
+ * Free a previously created WaitEventSet.
+ *
+ * Note: preferably, this shouldn't have to free any resources that could be
+ * inherited across an exec(). If it did, we'd likely leak those resources in
+ * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC
+ * when the FD is created. For the Windows case, we assume that the handles
+ * involved are non-inheritable.
+ */
+void
+FreeWaitEventSet(WaitEventSet *set)
+{
+#if defined(WAIT_USE_EPOLL)
+ close(set->epoll_fd);
+ ReleaseExternalFD();
+#elif defined(WAIT_USE_KQUEUE)
+ close(set->kqueue_fd);
+ ReleaseExternalFD();
+#elif defined(WAIT_USE_WIN32)
+ WaitEvent *cur_event;
+
+ for (cur_event = set->events;
+ cur_event < (set->events + set->nevents);
+ cur_event++)
+ {
+ if (cur_event->events & WL_LATCH_SET)
+ {
+ /* uses the latch's HANDLE */
+ }
+ else if (cur_event->events & WL_POSTMASTER_DEATH)
+ {
+ /* uses PostmasterHandle */
+ }
+ else
+ {
+ /* Clean up the event object we created for the socket */
+ WSAEventSelect(cur_event->fd, NULL, 0);
+ WSACloseEvent(set->handles[cur_event->pos + 1]);
+ }
+ }
+#endif
+
+ pfree(set);
+}
+
+/*
+ * Free a previously created WaitEventSet in a child process after a fork().
+ */
+void
+FreeWaitEventSetAfterFork(WaitEventSet *set)
+{
+#if defined(WAIT_USE_EPOLL)
+ close(set->epoll_fd);
+ ReleaseExternalFD();
+#elif defined(WAIT_USE_KQUEUE)
+ /* kqueues are not normally inherited by child processes */
+ ReleaseExternalFD();
+#endif
+
+ pfree(set);
+}
+
+/* ---
+ * Add an event to the set. Possible events are:
+ * - WL_LATCH_SET: Wait for the latch to be set
+ * - WL_POSTMASTER_DEATH: Wait for postmaster to die
+ * - WL_SOCKET_READABLE: Wait for socket to become readable,
+ * can be combined in one event with other WL_SOCKET_* events
+ * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
+ * can be combined with other WL_SOCKET_* events
+ * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
+ * can be combined with other WL_SOCKET_* events (on non-Windows
+ * platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_SOCKET_ACCEPT: Wait for new connection to a server socket,
+ * can be combined with other WL_SOCKET_* events (on non-Windows
+ * platforms, this is the same as WL_SOCKET_READABLE)
+ * - WL_SOCKET_CLOSED: Wait for socket to be closed by remote peer.
+ * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
+ *
+ * Returns the offset in WaitEventSet->events (starting from 0), which can be
+ * used to modify previously added wait events using ModifyWaitEvent().
+ *
+ * In the WL_LATCH_SET case the latch must be owned by the current process,
+ * i.e. it must be a process-local latch initialized with InitLatch, or a
+ * shared latch associated with the current process by calling OwnLatch.
+ *
+ * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED/ACCEPT cases, EOF and error
+ * conditions cause the socket to be reported as readable/writable/connected,
+ * so that the caller can deal with the condition.
+ *
+ * The user_data pointer specified here will be set for the events returned
+ * by WaitEventSetWait(), making it easy to associate additional data with
+ * events.
+ */
+int
+AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
+ void *user_data)
+{
+ WaitEvent *event;
+
+ /* not enough space */
+ Assert(set->nevents < set->nevents_space);
+
+ if (events == WL_EXIT_ON_PM_DEATH)
+ {
+ events = WL_POSTMASTER_DEATH;
+ set->exit_on_postmaster_death = true;
+ }
+
+ if (latch)
+ {
+ if (latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ if (set->latch)
+ elog(ERROR, "cannot wait on more than one latch");
+ if ((events & WL_LATCH_SET) != WL_LATCH_SET)
+ elog(ERROR, "latch events only support being set");
+ }
+ else
+ {
+ if (events & WL_LATCH_SET)
+ elog(ERROR, "cannot wait on latch without a specified latch");
+ }
+
+ /* waiting for socket readiness without a socket indicates a bug */
+ if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
+ elog(ERROR, "cannot wait on socket event without a socket");
+
+ event = &set->events[set->nevents];
+ event->pos = set->nevents++;
+ event->fd = fd;
+ event->events = events;
+ event->user_data = user_data;
+#ifdef WIN32
+ event->reset = false;
+#endif
+
+ if (events == WL_LATCH_SET)
+ {
+ set->latch = latch;
+ set->latch_pos = event->pos;
+#if defined(WAIT_USE_SELF_PIPE)
+ event->fd = selfpipe_readfd;
+#elif defined(WAIT_USE_SIGNALFD)
+ event->fd = signal_fd;
+#else
+ event->fd = PGINVALID_SOCKET;
+#ifdef WAIT_USE_EPOLL
+ return event->pos;
+#endif
+#endif
+ }
+ else if (events == WL_POSTMASTER_DEATH)
+ {
+#ifndef WIN32
+ event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
+#endif
+ }
+
+ /* perform wait primitive specific initialization, if needed */
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, 0);
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+
+ return event->pos;
+}
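+
+/*
+ * Illustrative sketch, not part of this patch: building a long-lived
+ * WaitEventSet covering the process latch, postmaster death, and one
+ * socket, then waiting on it repeatedly.  "sock" and WAIT_EVENT_EXAMPLE are
+ * hypothetical:
+ *
+ *		WaitEventSet *wes = CreateWaitEventSet(TopMemoryContext, 3);
+ *		int			sock_pos;
+ *		WaitEvent	event;
+ *
+ *		AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+ *		AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ *						  NULL, NULL);
+ *		sock_pos = AddWaitEventToSet(wes, WL_SOCKET_READABLE, sock,
+ *									 NULL, NULL);
+ *
+ *		for (;;)
+ *		{
+ *			(void) WaitEventSetWait(wes, -1, &event, 1, WAIT_EVENT_EXAMPLE);
+ *			... inspect event.events; ResetLatch(MyLatch) on WL_LATCH_SET ...
+ *		}
+ */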
+
+/*
+ * Change the event mask and, in the WL_LATCH_SET case, the latch associated
+ * with the WaitEvent. The latch may be changed to NULL to disable the latch
+ * temporarily, and then set back to a latch later.
+ *
+ * 'pos' is the id returned by AddWaitEventToSet.
+ */
+void
+ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
+{
+ WaitEvent *event;
+#if defined(WAIT_USE_KQUEUE)
+ int old_events;
+#endif
+
+ Assert(pos < set->nevents);
+
+ event = &set->events[pos];
+#if defined(WAIT_USE_KQUEUE)
+ old_events = event->events;
+#endif
+
+ /*
+ * If neither the event mask nor the associated latch changes, return
+ * early. That's an important optimization for some sockets, where
+ * ModifyWaitEvent is frequently used to switch from waiting for reads to
+ * waiting on writes.
+ */
+ if (events == event->events &&
+ (!(event->events & WL_LATCH_SET) || set->latch == latch))
+ return;
+
+ if (event->events & WL_LATCH_SET &&
+ events != event->events)
+ {
+ elog(ERROR, "cannot modify latch event");
+ }
+
+ if (event->events & WL_POSTMASTER_DEATH)
+ {
+ elog(ERROR, "cannot modify postmaster death event");
+ }
+
+ /* FIXME: validate event mask */
+ event->events = events;
+
+ if (events == WL_LATCH_SET)
+ {
+ if (latch && latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ set->latch = latch;
+
+ /*
+ * On Unix, we don't need to modify the kernel object because the
+ * underlying pipe (if there is one) is the same for all latches so we
+ * can return immediately. On Windows, we need to update our array of
+ * handles, but we leave the old one in place and tolerate spurious
+ * wakeups if the latch is disabled.
+ */
+#if defined(WAIT_USE_WIN32)
+ if (!latch)
+ return;
+#else
+ return;
+#endif
+ }
+
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, old_events);
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+}
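+
+/*
+ * Illustrative sketch, not part of this patch: the cheap-modification path
+ * described above, toggling a previously added socket (at position
+ * "sock_pos" in a long-lived set "wes") between read and write interest:
+ *
+ *		if (want_to_send)
+ *			ModifyWaitEvent(wes, sock_pos, WL_SOCKET_WRITEABLE, NULL);
+ *		else
+ *			ModifyWaitEvent(wes, sock_pos, WL_SOCKET_READABLE, NULL);
+ *
+ * Passing the same mask as last time returns without touching the kernel
+ * object at all, which is what makes this pattern inexpensive.
+ */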
+
+#if defined(WAIT_USE_EPOLL)
+/*
+ * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
+ */
+static void
+WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
+{
+ struct epoll_event epoll_ev;
+ int rc;
+
+ /* pointer to our event, returned by epoll_wait */
+ epoll_ev.data.ptr = event;
+ /* always wait for errors */
+ epoll_ev.events = EPOLLERR | EPOLLHUP;
+
+ /* prepare pollfd entry once */
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ epoll_ev.events |= EPOLLIN;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ epoll_ev.events |= EPOLLIN;
+ }
+ else
+ {
+ Assert(event->fd != PGINVALID_SOCKET);
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
+
+ if (event->events & WL_SOCKET_READABLE)
+ epoll_ev.events |= EPOLLIN;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ epoll_ev.events |= EPOLLOUT;
+ if (event->events & WL_SOCKET_CLOSED)
+ epoll_ev.events |= EPOLLRDHUP;
+ }
+
+ /*
+ * Even though unused, we also pass epoll_ev as the data argument if
+ * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug
+ * requiring that, and actually it makes the code simpler...
+ */
+ rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev);
+
+ if (rc < 0)
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "epoll_ctl")));
+}
+#endif
+
+#if defined(WAIT_USE_POLL)
+static void
+WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
+{
+ struct pollfd *pollfd = &set->pollfds[event->pos];
+
+ pollfd->revents = 0;
+ pollfd->fd = event->fd;
+
+ /* prepare pollfd entry once */
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ pollfd->events = POLLIN;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ pollfd->events = POLLIN;
+ }
+ else
+ {
+ Assert(event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED));
+ pollfd->events = 0;
+ if (event->events & WL_SOCKET_READABLE)
+ pollfd->events |= POLLIN;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ pollfd->events |= POLLOUT;
+#ifdef POLLRDHUP
+ if (event->events & WL_SOCKET_CLOSED)
+ pollfd->events |= POLLRDHUP;
+#endif
+ }
+
+ Assert(event->fd != PGINVALID_SOCKET);
+}
+#endif
+
+#if defined(WAIT_USE_KQUEUE)
+
+/*
+ * On most BSD family systems, the udata member of struct kevent is of type
+ * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
+ * NetBSD has it as intptr_t, so here we wallpaper over that difference with
+ * an lvalue cast.
+ */
+#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
+
+static inline void
+WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
+ WaitEvent *event)
+{
+ k_ev->ident = event->fd;
+ k_ev->filter = filter;
+ k_ev->flags = action;
+ k_ev->fflags = 0;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+static inline void
+WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
+{
+ /* For now postmaster death can only be added, not removed. */
+ k_ev->ident = PostmasterPid;
+ k_ev->filter = EVFILT_PROC;
+ k_ev->flags = EV_ADD;
+ k_ev->fflags = NOTE_EXIT;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+static inline void
+WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
+{
+ /* For now latch can only be added, not removed. */
+ k_ev->ident = SIGURG;
+ k_ev->filter = EVFILT_SIGNAL;
+ k_ev->flags = EV_ADD;
+ k_ev->fflags = 0;
+ k_ev->data = 0;
+ AccessWaitEvent(k_ev) = event;
+}
+
+/*
+ * old_events is the previous event mask, used to compute what has changed.
+ */
+static void
+WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
+{
+ int rc;
+ struct kevent k_ev[2];
+ int count = 0;
+ bool new_filt_read = false;
+ bool old_filt_read = false;
+ bool new_filt_write = false;
+ bool old_filt_write = false;
+
+ if (old_events == event->events)
+ return;
+
+ Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+ Assert(event->events == WL_LATCH_SET ||
+ event->events == WL_POSTMASTER_DEATH ||
+ (event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED)));
+
+ if (event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Unlike all the other implementations, we detect postmaster death
+ * using process notification instead of waiting on the postmaster
+ * alive pipe.
+ */
+ WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
+ }
+ else if (event->events == WL_LATCH_SET)
+ {
+ /* We detect latch wakeup using a signal event. */
+ WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
+ }
+ else
+ {
+ /*
+ * We need to compute the adds and deletes required to get from the
+ * old event mask to the new event mask, since kevent treats readable
+ * and writable as separate events.
+ */
+ if (old_events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
+ old_filt_read = true;
+ if (event->events & (WL_SOCKET_READABLE | WL_SOCKET_CLOSED))
+ new_filt_read = true;
+ if (old_events & WL_SOCKET_WRITEABLE)
+ old_filt_write = true;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ new_filt_write = true;
+ if (old_filt_read && !new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
+ event);
+ else if (!old_filt_read && new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
+ event);
+ if (old_filt_write && !new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
+ event);
+ else if (!old_filt_write && new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
+ event);
+ }
+
+ /* For WL_SOCKET_READABLE -> WL_SOCKET_CLOSED, no change needed. */
+ if (count == 0)
+ return;
+
+ Assert(count <= 2);
+
+ rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL);
+
+ /*
+ * When adding the postmaster's pid, we have to consider that it might
+ * already have exited and perhaps even been replaced by another process
+ * with the same pid. If so, we have to defer reporting this as an event
+ * until the next call to WaitEventSetWaitBlock().
+ */
+
+ if (rc < 0)
+ {
+ if (event->events == WL_POSTMASTER_DEATH &&
+ (errno == ESRCH || errno == EACCES))
+ set->report_postmaster_not_running = true;
+ else
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "kevent")));
+ }
+ else if (event->events == WL_POSTMASTER_DEATH &&
+ PostmasterPid != getppid() &&
+ !PostmasterIsAlive())
+ {
+ /*
+ * The extra PostmasterIsAliveInternal() check prevents false alarms
+ * on systems that give a different value for getppid() while being
+ * traced by a debugger.
+ */
+ set->report_postmaster_not_running = true;
+ }
+}
+
+#endif
+
+#if defined(WAIT_USE_WIN32)
+static void
+WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
+{
+ HANDLE *handle = &set->handles[event->pos + 1];
+
+ if (event->events == WL_LATCH_SET)
+ {
+ Assert(set->latch != NULL);
+ *handle = set->latch->event;
+ }
+ else if (event->events == WL_POSTMASTER_DEATH)
+ {
+ *handle = PostmasterHandle;
+ }
+ else
+ {
+ int flags = FD_CLOSE; /* always check for errors/EOF */
+
+ if (event->events & WL_SOCKET_READABLE)
+ flags |= FD_READ;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ flags |= FD_WRITE;
+ if (event->events & WL_SOCKET_CONNECTED)
+ flags |= FD_CONNECT;
+ if (event->events & WL_SOCKET_ACCEPT)
+ flags |= FD_ACCEPT;
+
+ if (*handle == WSA_INVALID_EVENT)
+ {
+ *handle = WSACreateEvent();
+ if (*handle == WSA_INVALID_EVENT)
+ elog(ERROR, "failed to create event for socket: error code %d",
+ WSAGetLastError());
+ }
+ if (WSAEventSelect(event->fd, *handle, flags) != 0)
+ elog(ERROR, "failed to set up event for socket: error code %d",
+ WSAGetLastError());
+
+ Assert(event->fd != PGINVALID_SOCKET);
+ }
+}
+#endif
+
+/*
+ * Wait for events added to the set to happen, or until the timeout is
+ * reached. At most nevents occurred events are returned.
+ *
+ * If timeout = -1, block until an event occurs; if 0, check sockets for
+ * readiness, but don't block; if > 0, block for at most timeout milliseconds.
+ *
+ * Returns the number of events that occurred, or 0 if the timeout was reached.
+ *
+ * Returned events will have the fd, pos, user_data fields set to the
+ * values associated with the registered event.
+ */
+int
+WaitEventSetWait(WaitEventSet *set, long timeout,
+ WaitEvent *occurred_events, int nevents,
+ uint32 wait_event_info)
+{
+ int returned_events = 0;
+ instr_time start_time;
+ instr_time cur_time;
+ long cur_timeout = -1;
+
+ Assert(nevents > 0);
+
+ /*
+ * Initialize timeout if requested. We must record the current time so
+ * that we can determine the remaining timeout if interrupted.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ }
+ else
+ INSTR_TIME_SET_ZERO(start_time);
+
+ pgstat_report_wait_start(wait_event_info);
+
+#ifndef WIN32
+ waiting = true;
+#else
+ /* Ensure that signals are serviced even if latch is already set */
+ pgwin32_dispatch_queued_signals();
+#endif
+ while (returned_events == 0)
+ {
+ int rc;
+
+ /*
+ * Check if the latch is set already. If so, leave the loop
+ * immediately, avoid blocking again. We don't attempt to report any
+ * other events that might also be satisfied.
+ *
+ * If someone sets the latch between this and the
+ * WaitEventSetWaitBlock() below, the setter will write a byte to the
+ * pipe (or signal us and the signal handler will do that), and the
+ * readiness routine will return immediately.
+ *
+ * On Unix, if there's a pending byte in the self-pipe, we'll notice
+ * it whenever we block. Clearing the pipe only in that case avoids
+ * having to drain it every time WaitLatchOrSocket() is used. Should
+ * the pipe buffer fill up, we're still OK, because the pipe is in
+ * nonblocking mode. That is unlikely to happen anyway, since the
+ * self-pipe is only written to while we're blocking (waiting = true)
+ * or from inside a signal handler in latch_sigurg_handler().
+ *
+ * On Windows, we'll also notice if there's a pending event for the
+ * latch when blocking, but there's no danger of anything filling up,
+ * as "Setting an event that is already set has no effect."
+ *
+ * Note: we assume that the kernel calls involved in latch management
+ * will provide adequate synchronization on machines with weak memory
+ * ordering, so that we cannot miss seeing is_set if a notification
+ * has already been queued.
+ */
+ if (set->latch && !set->latch->is_set)
+ {
+ /* about to sleep on a latch */
+ set->latch->maybe_sleeping = true;
+ pg_memory_barrier();
+ /* and recheck */
+ }
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->pos = set->latch_pos;
+ occurred_events->user_data =
+ set->events[set->latch_pos].user_data;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+
+ /* could have been set above */
+ set->latch->maybe_sleeping = false;
+
+ break;
+ }
+
+ /*
+ * Wait for events using the readiness primitive chosen at the top of
+ * this file. If -1 is returned, a timeout has occurred; if 0, we have
+ * to retry; anything >= 1 is the number of returned events.
+ */
+ rc = WaitEventSetWaitBlock(set, cur_timeout,
+ occurred_events, nevents);
+
+ if (set->latch)
+ {
+ Assert(set->latch->maybe_sleeping);
+ set->latch->maybe_sleeping = false;
+ }
+
+ if (rc == -1)
+ break; /* timeout occurred */
+ else
+ returned_events = rc;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (returned_events == 0 && timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time);
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+ if (cur_timeout <= 0)
+ break;
+ }
+ }
+#ifndef WIN32
+ waiting = false;
+#endif
+
+ pgstat_report_wait_end();
+
+ return returned_events;
+}
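+
+/*
+ * Illustrative sketch, not part of this patch: asking for several events
+ * per call and dispatching on what actually fired.  "wes" and
+ * WAIT_EVENT_EXAMPLE are the hypothetical names used in earlier sketches,
+ * and the array size is arbitrary:
+ *
+ *		WaitEvent	events[8];
+ *		int			nevents;
+ *		int			i;
+ *
+ *		nevents = WaitEventSetWait(wes, 1000, events, lengthof(events),
+ *								   WAIT_EVENT_EXAMPLE);
+ *		if (nevents == 0)
+ *			... the 1000 ms timeout expired ...
+ *		for (i = 0; i < nevents; i++)
+ *		{
+ *			if (events[i].events & WL_LATCH_SET)
+ *				ResetLatch(MyLatch);
+ *			if (events[i].events & WL_SOCKET_READABLE)
+ *				... service events[i].fd or events[i].user_data ...
+ *		}
+ */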
+
+
+#if defined(WAIT_USE_EPOLL)
+
+/*
+ * Wait using linux's epoll_wait(2).
+ *
+ * This is the preferable wait method, as several readiness notifications are
+ * delivered, without having to iterate through all of set->events. The
+ * returned epoll_event structs contain a pointer to our events, making
+ * association easy.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct epoll_event *cur_epoll_event;
+
+ /* Sleep */
+ rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
+ Min(nevents, set->nevents_space), cur_timeout);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "epoll_wait")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ /*
+ * At least one event occurred, iterate over the returned epoll events
+ * until they're either all processed, or we've returned all the events
+ * the caller desired.
+ */
+ for (cur_epoll_event = set->epoll_ret_events;
+ cur_epoll_event < (set->epoll_ret_events + rc) &&
+ returned_events < nevents;
+ cur_epoll_event++)
+ {
+ /* epoll's data pointer is set to the associated WaitEvent */
+ cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+ {
+ /* Drain the signalfd. */
+ drain();
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+ {
+ /*
+ * We expect an EPOLLHUP when the remote end is closed, but
+ * because we don't expect the pipe to become readable or to have
+ * any errors either, treat those cases as postmaster death, too.
+ *
+ * Be paranoid about a spurious event signaling the postmaster as
+ * being dead. There have been reports about that happening with
+ * older primitives (select(2) to be specific), and a spurious
+ * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+ * cost much.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
+ {
+ Assert(cur_event->fd != PGINVALID_SOCKET);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
+ {
+ /* data available in socket, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
+ {
+ /* writable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_epoll_event->events & (EPOLLRDHUP | EPOLLERR | EPOLLHUP)))
+ {
+ /* remote peer shut down, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+
+ return returned_events;
+}
+
+#elif defined(WAIT_USE_KQUEUE)
+
+/*
+ * Wait using kevent(2) on BSD-family systems and macOS.
+ *
+ * For now this mirrors the epoll code, but in future it could modify the fd
+ * set in the same call to kevent as it uses for waiting instead of doing that
+ * with separate system calls.
+ */
+static int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct kevent *cur_kqueue_event;
+ struct timespec timeout;
+ struct timespec *timeout_p;
+
+ if (cur_timeout < 0)
+ timeout_p = NULL;
+ else
+ {
+ timeout.tv_sec = cur_timeout / 1000;
+ timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
+ timeout_p = &timeout;
+ }
+
+ /*
+ * Report postmaster events discovered by WaitEventAdjustKqueue() or an
+ * earlier call to WaitEventSetWait().
+ */
+ if (unlikely(set->report_postmaster_not_running))
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ return 1;
+ }
+
+ /* Sleep */
+ rc = kevent(set->kqueue_fd, NULL, 0,
+ set->kqueue_ret_events,
+ Min(nevents, set->nevents_space),
+ timeout_p);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "kevent")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ /*
+ * At least one event occurred, iterate over the returned kqueue events
+ * until they're either all processed, or we've returned all the events
+ * the caller desired.
+ */
+ for (cur_kqueue_event = set->kqueue_ret_events;
+ cur_kqueue_event < (set->kqueue_ret_events + rc) &&
+ returned_events < nevents;
+ cur_kqueue_event++)
+ {
+ /* kevent's udata points to the associated WaitEvent */
+ cur_event = AccessWaitEvent(cur_kqueue_event);
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ cur_kqueue_event->filter == EVFILT_SIGNAL)
+ {
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ cur_kqueue_event->filter == EVFILT_PROC &&
+ (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
+ {
+ /*
+ * The kernel will tell this kqueue object only once about the
+ * exit of the postmaster, so remember that for next time in order to
+ * provide level-triggered semantics.
+ */
+ set->report_postmaster_not_running = true;
+
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
+ {
+ Assert(cur_event->fd >= 0);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_kqueue_event->filter == EVFILT_READ))
+ {
+ /* readable, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_kqueue_event->filter == EVFILT_READ) &&
+ (cur_kqueue_event->flags & EV_EOF))
+ {
+ /* the remote peer has shut down */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_kqueue_event->filter == EVFILT_WRITE))
+ {
+ /* writable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+
+ return returned_events;
+}
+
+#elif defined(WAIT_USE_POLL)
+
+/*
+ * Wait using poll(2).
+ *
+ * This allows receiving readiness notifications for several events at once,
+ * but requires iterating through all of set->pollfds.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ int rc;
+ WaitEvent *cur_event;
+ struct pollfd *cur_pollfd;
+
+ /* Sleep */
+ rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
+
+ /* Check return code */
+ if (rc < 0)
+ {
+ /* EINTR is okay, otherwise complain */
+ if (errno != EINTR)
+ {
+ waiting = false;
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "poll")));
+ }
+ return 0;
+ }
+ else if (rc == 0)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ for (cur_event = set->events, cur_pollfd = set->pollfds;
+ cur_event < (set->events + set->nevents) &&
+ returned_events < nevents;
+ cur_event++, cur_pollfd++)
+ {
+ /* no activity on this FD, skip */
+ if (cur_pollfd->revents == 0)
+ continue;
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET &&
+ (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+ {
+ /* There's data in the self-pipe, clear it. */
+ drain();
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH &&
+ (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+ {
+ /*
+ * We expect a POLLHUP when the remote end is closed, but because
+ * we don't expect the pipe to become readable or to have any
+ * errors either, treat those cases as postmaster death, too.
+ *
+ * Be paranoid about a spurious event signaling the postmaster as
+ * being dead. There have been reports about that happening with
+ * older primitives (select(2) to be specific), and a spurious
+ * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't
+ * cost much.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & (WL_SOCKET_READABLE |
+ WL_SOCKET_WRITEABLE |
+ WL_SOCKET_CLOSED))
+ {
+ int errflags = POLLHUP | POLLERR | POLLNVAL;
+
+ Assert(cur_event->fd >= PGINVALID_SOCKET);
+
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (cur_pollfd->revents & (POLLIN | errflags)))
+ {
+ /* data available in socket, or EOF */
+ occurred_events->events |= WL_SOCKET_READABLE;
+ }
+
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (cur_pollfd->revents & (POLLOUT | errflags)))
+ {
+ /* writeable, or EOF */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+
+#ifdef POLLRDHUP
+ if ((cur_event->events & WL_SOCKET_CLOSED) &&
+ (cur_pollfd->revents & (POLLRDHUP | errflags)))
+ {
+ /* remote peer closed, or error */
+ occurred_events->events |= WL_SOCKET_CLOSED;
+ }
+#endif
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events->fd = cur_event->fd;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ }
+ return returned_events;
+}
+
+#elif defined(WAIT_USE_WIN32)
+
+/*
+ * Wait using Windows' WaitForMultipleObjects().
+ *
+ * Unfortunately this will only ever return a single readiness notification at
+ * a time. Note that while the official documentation for
+ * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
+ * with a single bWaitAll = FALSE call,
+ * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
+ * that only one event is "consumed".
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ DWORD rc;
+ WaitEvent *cur_event;
+
+ /* Reset any wait events that need it */
+ for (cur_event = set->events;
+ cur_event < (set->events + set->nevents);
+ cur_event++)
+ {
+ if (cur_event->reset)
+ {
+ WaitEventAdjustWin32(set, cur_event);
+ cur_event->reset = false;
+ }
+
+ /*
+ * Windows does not guarantee to log an FD_WRITE network event
+ * indicating that more data can be sent unless the previous send()
+ * failed with WSAEWOULDBLOCK. While our caller might well have made
+ * such a call, we cannot assume that here. Therefore, if waiting for
+ * write-ready, force the issue by doing a dummy send(). If the dummy
+ * send() succeeds, assume that the socket is in fact write-ready, and
+ * return immediately. Also, if it fails with something other than
+ * WSAEWOULDBLOCK, return a write-ready indication to let our caller
+ * deal with the error condition.
+ */
+ if (cur_event->events & WL_SOCKET_WRITEABLE)
+ {
+ char c;
+ WSABUF buf;
+ DWORD sent;
+ int r;
+
+ buf.buf = &c;
+ buf.len = 0;
+
+ r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
+ if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = WL_SOCKET_WRITEABLE;
+ occurred_events->fd = cur_event->fd;
+ return 1;
+ }
+ }
+ }
+
+ /*
+ * Sleep.
+ *
+ * We need to wait on ->nevents + 1 handles, because the signal handle is in [0].
+ */
+ rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
+ cur_timeout);
+
+ /* Check return code */
+ if (rc == WAIT_FAILED)
+ elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
+ GetLastError());
+ else if (rc == WAIT_TIMEOUT)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ if (rc == WAIT_OBJECT_0)
+ {
+ /* Service newly-arrived signals */
+ pgwin32_dispatch_queued_signals();
+ return 0; /* retry */
+ }
+
+ /*
+ * With an offset of one, due to the always present pgwin32_signal_event,
+ * the handle offset directly corresponds to a wait event.
+ */
+ cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET)
+ {
+ /*
+ * We cannot use set->latch->event to reset the fired event if we
+ * aren't waiting on this latch now.
+ */
+ if (!ResetEvent(set->handles[cur_event->pos + 1]))
+ elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Postmaster apparently died. Since the consequences of falsely
+ * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
+ * the trouble to positively verify this with PostmasterIsAlive(),
+ * even though there is no known reason to think that the event could
+ * be falsely set on Windows.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & WL_SOCKET_MASK)
+ {
+ WSANETWORKEVENTS resEvents;
+ HANDLE handle = set->handles[cur_event->pos + 1];
+
+ Assert(cur_event->fd);
+
+ occurred_events->fd = cur_event->fd;
+
+ ZeroMemory(&resEvents, sizeof(resEvents));
+ if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
+ elog(ERROR, "failed to enumerate network events: error code %d",
+ WSAGetLastError());
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (resEvents.lNetworkEvents & FD_READ))
+ {
+ /* data available in socket */
+ occurred_events->events |= WL_SOCKET_READABLE;
+
+ /*------
+ * WaitForMultipleObjects doesn't guarantee that a read event will
+ * be returned if the latch is set at the same time. Even if it
+ * did, the caller might drop that event expecting it to reoccur
+ * on next call. So, we must force the event to be reset if this
+ * WaitEventSet is used again in order to avoid an indefinite
+ * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
+ * for the behavior of socket events.
+ *------
+ */
+ cur_event->reset = true;
+ }
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (resEvents.lNetworkEvents & FD_WRITE))
+ {
+ /* writeable */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+ if ((cur_event->events & WL_SOCKET_CONNECTED) &&
+ (resEvents.lNetworkEvents & FD_CONNECT))
+ {
+ /* connected */
+ occurred_events->events |= WL_SOCKET_CONNECTED;
+ }
+ if ((cur_event->events & WL_SOCKET_ACCEPT) &&
+ (resEvents.lNetworkEvents & FD_ACCEPT))
+ {
+ /* incoming connection could be accepted */
+ occurred_events->events |= WL_SOCKET_ACCEPT;
+ }
+ if (resEvents.lNetworkEvents & FD_CLOSE)
+ {
+ /* EOF/error, so signal all caller-requested socket flags */
+ occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events++;
+ returned_events++;
+ }
+ }
+
+ return returned_events;
+}
+#endif
+
+/*
+ * Return whether the current build options can report WL_SOCKET_CLOSED.
+ */
+bool
+WaitEventSetCanReportClosed(void)
+{
+#if (defined(WAIT_USE_POLL) && defined(POLLRDHUP)) || \
+ defined(WAIT_USE_EPOLL) || \
+ defined(WAIT_USE_KQUEUE)
+ return true;
+#else
+ return false;
+#endif
+}
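+
+/*
+ * Illustrative sketch, not part of this patch: callers wanting early
+ * detection of connection loss can use this to degrade gracefully when the
+ * build's wait primitive cannot report WL_SOCKET_CLOSED:
+ *
+ *		if (WaitEventSetCanReportClosed())
+ *			AddWaitEventToSet(wes, WL_SOCKET_CLOSED, sock, NULL, NULL);
+ *		else
+ *			... rely on the next read/write noticing the broken connection ...
+ */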
+
+/*
+ * Get the number of wait events registered in a given WaitEventSet.
+ */
+int
+GetNumRegisteredWaitEvents(WaitEventSet *set)
+{
+ return set->nevents;
+}
+
+#if defined(WAIT_USE_SELF_PIPE)
+
+/*
+ * SetLatch uses SIGURG to wake up the process waiting on the latch.
+ *
+ * Wake up WaitLatch, if we're waiting.
+ */
+static void
+latch_sigurg_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ if (waiting)
+ sendSelfPipeByte();
+
+ errno = save_errno;
+}
+
+/* Send one byte to the self-pipe, to wake up WaitLatch */
+static void
+sendSelfPipeByte(void)
+{
+ int rc;
+ char dummy = 0;
+
+retry:
+ rc = write(selfpipe_writefd, &dummy, 1);
+ if (rc < 0)
+ {
+ /* If interrupted by signal, just retry */
+ if (errno == EINTR)
+ goto retry;
+
+ /*
+ * If the pipe is full, we don't need to retry, the data that's there
+ * already is enough to wake up WaitLatch.
+ */
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ return;
+
+ /*
+ * Oops, the write() failed for some other reason. We might be in a
+ * signal handler, so it's not safe to elog(). We have no choice but
+ * silently ignore the error.
+ */
+ return;
+ }
+}
+
+#endif
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+
+/*
+ * Read all available data from self-pipe or signalfd.
+ *
+ * Note: this is only called when waiting = true. If it fails and doesn't
+ * return, it must reset that flag first (though ideally, this will never
+ * happen).
+ */
+static void
+drain(void)
+{
+ char buf[1024];
+ int rc;
+ int fd;
+
+#ifdef WAIT_USE_SELF_PIPE
+ fd = selfpipe_readfd;
+#else
+ fd = signal_fd;
+#endif
+
+ for (;;)
+ {
+ rc = read(fd, buf, sizeof(buf));
+ if (rc < 0)
+ {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break; /* the descriptor is empty */
+ else if (errno == EINTR)
+ continue; /* retry */
+ else
+ {
+ waiting = false;
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "read() on self-pipe failed: %m");
+#else
+ elog(ERROR, "read() on signalfd failed: %m");
+#endif
+ }
+ }
+ else if (rc == 0)
+ {
+ waiting = false;
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "unexpected EOF on self-pipe");
+#else
+ elog(ERROR, "unexpected EOF on signalfd");
+#endif
+ }
+ else if (rc < sizeof(buf))
+ {
+ /* we successfully drained the pipe; no need to read() again */
+ break;
+ }
+ /* else buffer wasn't big enough, so read again */
+ }
+}
+
+#endif
diff --git a/src/backend/storage/ipc/meson.build b/src/backend/storage/ipc/meson.build
new file mode 100644
index 0000000..79a16d0
--- /dev/null
+++ b/src/backend/storage/ipc/meson.build
@@ -0,0 +1,21 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'barrier.c',
+ 'dsm.c',
+ 'dsm_impl.c',
+ 'ipc.c',
+ 'ipci.c',
+ 'latch.c',
+ 'pmsignal.c',
+ 'procarray.c',
+ 'procsignal.c',
+ 'shm_mq.c',
+ 'shm_toc.c',
+ 'shmem.c',
+ 'signalfuncs.c',
+ 'sinval.c',
+ 'sinvaladt.c',
+ 'standby.c',
+
+)
diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c
new file mode 100644
index 0000000..5dc2da6
--- /dev/null
+++ b/src/backend/storage/ipc/pmsignal.c
@@ -0,0 +1,462 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmsignal.c
+ * routines for signaling between the postmaster and its child processes
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/pmsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#include "miscadmin.h"
+#include "postmaster/postmaster.h"
+#include "replication/walsender.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+
+
+/*
+ * The postmaster is signaled by its children by sending SIGUSR1. The
+ * specific reason is communicated via flags in shared memory. We keep
+ * a boolean flag for each possible "reason", so that different reasons
+ * can be signaled by different backends at the same time. (However,
+ * if the same reason is signaled more than once simultaneously, the
+ * postmaster will observe it only once.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * In addition to the per-reason flags, we store a set of per-child-process
+ * flags that are currently used only for detecting whether a backend has
+ * exited without performing proper shutdown. The per-child-process flags
+ * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is
+ * available for assignment. An ASSIGNED slot is associated with a postmaster
+ * child process, but either the process has not touched shared memory yet,
+ * or it has successfully cleaned up after itself. An ACTIVE slot means the
+ * process is actively using shared memory. The slots are assigned to
+ * child processes at random, and postmaster.c is responsible for tracking
+ * which one goes with which PID.
+ *
+ * Actually there is a fourth state, WALSENDER. This is just like ACTIVE,
+ * but carries the extra information that the child is a WAL sender.
+ * WAL senders too start in ACTIVE state, but switch to WALSENDER once they
+ * start streaming the WAL (and they never go back to ACTIVE after that).
+ *
+ * We also have a shared-memory field that is used for communication in
+ * the opposite direction, from postmaster to children: it tells why the
+ * postmaster has broadcast SIGQUIT signals, if indeed it has done so.
+ */
+
+#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */
+#define PM_CHILD_ASSIGNED 1
+#define PM_CHILD_ACTIVE 2
+#define PM_CHILD_WALSENDER 3
+
+/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */
+struct PMSignalData
+{
+ /* per-reason flags for signaling the postmaster */
+ sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
+ /* global flags for signals from postmaster to children */
+ QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
+ /* per-child-process flags */
+ int num_child_flags; /* # of entries in PMChildFlags[] */
+ sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/* PMSignalState pointer is valid in both postmaster and child processes */
+NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL;
+
+/*
+ * These static variables are valid only in the postmaster. We keep a
+ * duplicative private array so that we can trust its state even if some
+ * failing child has clobbered the PMSignalData struct in shared memory.
+ */
+static int num_child_inuse; /* # of entries in PMChildInUse[] */
+static int next_child_inuse; /* next slot to try to assign */
+static bool *PMChildInUse; /* true if i'th flag slot is assigned */
+
+/*
+ * Signal handler to be notified if postmaster dies.
+ */
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+volatile sig_atomic_t postmaster_possibly_dead = false;
+
+static void
+postmaster_death_handler(SIGNAL_ARGS)
+{
+ postmaster_possibly_dead = true;
+}
+
+/*
+ * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already
+ * used for other things, so choose another one.
+ *
+ * Currently, we assume that we can always find a signal to use. That
+ * seems like a reasonable assumption for all platforms that are modern
+ * enough to have a parent-death signaling mechanism.
+ */
+#if defined(SIGINFO)
+#define POSTMASTER_DEATH_SIGNAL SIGINFO
+#elif defined(SIGPWR)
+#define POSTMASTER_DEATH_SIGNAL SIGPWR
+#else
+#error "cannot find a signal to use for postmaster death"
+#endif
+
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+
+/*
+ * PMSignalShmemSize
+ * Compute space needed for pmsignal.c's shared memory
+ */
+Size
+PMSignalShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(PMSignalData, PMChildFlags);
+ size = add_size(size, mul_size(MaxLivePostmasterChildren(),
+ sizeof(sig_atomic_t)));
+
+ return size;
+}
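+
+/*
+ * Editor's note: illustrative sketch of the sizing pattern used above, not
+ * part of the original patch.  A struct ending in a flexible array member is
+ * sized as offsetof(type, member) plus n elements, with add_size()/mul_size()
+ * guarding against Size overflow.  ExampleShared and ExampleSharedSize() are
+ * hypothetical names.
+ */
+#ifdef PMSIGNAL_EXAMPLE_SKETCH		/* never defined; illustration only */
+typedef struct ExampleShared
+{
+	int			nslots;
+	sig_atomic_t slots[FLEXIBLE_ARRAY_MEMBER];
+} ExampleShared;
+
+static Size
+ExampleSharedSize(int nslots)
+{
+	Size		size = offsetof(ExampleShared, slots);
+
+	return add_size(size, mul_size(nslots, sizeof(sig_atomic_t)));
+}
+#endif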
+
+/*
+ * PMSignalShmemInit - initialize during shared-memory creation
+ */
+void
+PMSignalShmemInit(void)
+{
+ bool found;
+
+ PMSignalState = (PMSignalData *)
+ ShmemInitStruct("PMSignalState", PMSignalShmemSize(), &found);
+
+ if (!found)
+ {
+ /* initialize all flags to zeroes */
+ MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize());
+ num_child_inuse = MaxLivePostmasterChildren();
+ PMSignalState->num_child_flags = num_child_inuse;
+
+ /*
+ * Also allocate postmaster's private PMChildInUse[] array. We
+ * might've already done that in a previous shared-memory creation
+ * cycle, in which case free the old array to avoid a leak. (Do it
+ * like this to support the possibility that MaxLivePostmasterChildren
+ * changed.) In a standalone backend, we do not need this.
+ */
+ if (PostmasterContext != NULL)
+ {
+ if (PMChildInUse)
+ pfree(PMChildInUse);
+ PMChildInUse = (bool *)
+ MemoryContextAllocZero(PostmasterContext,
+ num_child_inuse * sizeof(bool));
+ }
+ next_child_inuse = 0;
+ }
+}
+
+/*
+ * SendPostmasterSignal - signal the postmaster from a child process
+ */
+void
+SendPostmasterSignal(PMSignalReason reason)
+{
+ /* If called in a standalone backend, do nothing */
+ if (!IsUnderPostmaster)
+ return;
+ /* Atomically set the proper flag */
+ PMSignalState->PMSignalFlags[reason] = true;
+ /* Send signal to postmaster */
+ kill(PostmasterPid, SIGUSR1);
+}
+
+/*
+ * CheckPostmasterSignal - check to see if a particular reason has been
+ * signaled, and clear the signal flag. Should be called by postmaster
+ * after receiving SIGUSR1.
+ */
+bool
+CheckPostmasterSignal(PMSignalReason reason)
+{
+ /* Careful here --- don't clear flag if we haven't seen it set */
+ if (PMSignalState->PMSignalFlags[reason])
+ {
+ PMSignalState->PMSignalFlags[reason] = false;
+ return true;
+ }
+ return false;
+}
+
+/*
+ * SetQuitSignalReason - broadcast the reason for a system shutdown.
+ * Should be called by postmaster before sending SIGQUIT to children.
+ *
+ * Note: in a crash-and-restart scenario, the "reason" field gets cleared
+ * as a part of rebuilding shared memory; the postmaster need not do it
+ * explicitly.
+ */
+void
+SetQuitSignalReason(QuitSignalReason reason)
+{
+ PMSignalState->sigquit_reason = reason;
+}
+
+/*
+ * GetQuitSignalReason - obtain the reason for a system shutdown.
+ * Called by child processes when they receive SIGQUIT.
+ * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT.
+ */
+QuitSignalReason
+GetQuitSignalReason(void)
+{
+ /* This is called in signal handlers, so be extra paranoid. */
+ if (!IsUnderPostmaster || PMSignalState == NULL)
+ return PMQUIT_NOT_SENT;
+ return PMSignalState->sigquit_reason;
+}
+
+
+/*
+ * AssignPostmasterChildSlot - select an unused slot for a new postmaster
+ * child process, and set its state to ASSIGNED. Returns a slot number
+ * (one to N).
+ *
+ * Only the postmaster is allowed to execute this routine, so we need no
+ * special locking.
+ */
+int
+AssignPostmasterChildSlot(void)
+{
+ int slot = next_child_inuse;
+ int n;
+
+ /*
+ * Scan for a free slot. Notice that we trust nothing about the contents
+ * of PMSignalState, but use only postmaster-local data for this decision.
+ * We track the last slot assigned so as not to waste time repeatedly
+ * rescanning low-numbered slots.
+ */
+ for (n = num_child_inuse; n > 0; n--)
+ {
+ if (--slot < 0)
+ slot = num_child_inuse - 1;
+ if (!PMChildInUse[slot])
+ {
+ PMChildInUse[slot] = true;
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
+ next_child_inuse = slot;
+ return slot + 1;
+ }
+ }
+
+ /* Out of slots ... should never happen, else postmaster.c messed up */
+ elog(FATAL, "no free slots in PMChildFlags array");
+ return 0; /* keep compiler quiet */
+}
+
+/*
+ * ReleasePostmasterChildSlot - release a slot after death of a postmaster
+ * child process. This must be called in the postmaster process.
+ *
+ * Returns true if the slot had been in ASSIGNED state (the expected case),
+ * false otherwise (implying that the child failed to clean itself up).
+ */
+bool
+ReleasePostmasterChildSlot(int slot)
+{
+ bool result;
+
+ Assert(slot > 0 && slot <= num_child_inuse);
+ slot--;
+
+ /*
+ * Note: the slot state might already be unused, because the logic in
+ * postmaster.c is such that this might get called twice when a child
+ * crashes. So we don't try to Assert anything about the state.
+ */
+ result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED;
+ PMChildInUse[slot] = false;
+ return result;
+}
+
+/*
+ * IsPostmasterChildWalSender - check if given slot is in use by a
+ * walsender process. This is called only by the postmaster.
+ */
+bool
+IsPostmasterChildWalSender(int slot)
+{
+ Assert(slot > 0 && slot <= num_child_inuse);
+ slot--;
+
+ if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER)
+ return true;
+ else
+ return false;
+}
+
+/*
+ * MarkPostmasterChildActive - mark a postmaster child as about to begin
+ * actively using shared memory. This is called in the child process.
+ */
+void
+MarkPostmasterChildActive(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE;
+}
+
+/*
+ * MarkPostmasterChildWalSender - mark a postmaster child as a WAL sender
+ * process. This is called in the child process, sometime after marking the
+ * child as active.
+ */
+void
+MarkPostmasterChildWalSender(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(am_walsender);
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_WALSENDER;
+}
+
+/*
+ * MarkPostmasterChildInactive - mark a postmaster child as done using
+ * shared memory. This is called in the child process.
+ */
+void
+MarkPostmasterChildInactive(void)
+{
+ int slot = MyPMChildSlot;
+
+ Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+ slot--;
+ Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE ||
+ PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER);
+ PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED;
+}
+
+
+/*
+ * PostmasterIsAliveInternal - check whether postmaster process is still alive
+ *
+ * This is the slow path of PostmasterIsAlive(), where the caller has already
+ * checked 'postmaster_possibly_dead'. (On platforms that don't support
+ * a signal for parent death, PostmasterIsAlive() is just an alias for this.)
+ */
+bool
+PostmasterIsAliveInternal(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ /*
+ * Reset the flag before checking, so that we don't miss a signal if
+ * postmaster dies right after the check. If postmaster was indeed dead,
+ * we'll re-arm it before returning to caller.
+ */
+ postmaster_possibly_dead = false;
+#endif
+
+#ifndef WIN32
+ {
+ char c;
+ ssize_t rc;
+
+ rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1);
+
+ /*
+ * In the usual case, the postmaster is still alive, and there is no
+ * data in the pipe.
+ */
+ if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return true;
+ else
+ {
+ /*
+ * Postmaster is dead, or something went wrong with the read()
+ * call.
+ */
+
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true;
+#endif
+
+ if (rc < 0)
+ elog(FATAL, "read on postmaster death monitoring pipe failed: %m");
+ else if (rc > 0)
+ elog(FATAL, "unexpected data in postmaster death monitoring pipe");
+
+ return false;
+ }
+ }
+
+#else /* WIN32 */
+ if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT)
+ return true;
+ else
+ {
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true;
+#endif
+ return false;
+ }
+#endif /* WIN32 */
+}
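+
+/*
+ * Editor's note: illustrative sketch of the pipe-based liveness check used
+ * above, not part of the original patch.  The parent keeps the write end of
+ * a pipe open and never writes to it; each child holds a non-blocking read
+ * end.  While the parent lives, read() fails with EAGAIN; once the parent
+ * exits, the write end is closed and read() returns 0 (EOF).  All example_*
+ * names are hypothetical.
+ */
+#ifdef PMSIGNAL_EXAMPLE_SKETCH		/* never defined; illustration only */
+#include <fcntl.h>
+
+static int	example_alive_fds[2];
+
+/* called once in the parent, before forking any children */
+static void
+example_parent_setup(void)
+{
+	if (pipe(example_alive_fds) < 0)
+		elog(FATAL, "pipe() failed: %m");
+	/* children only ever read; make that end non-blocking */
+	fcntl(example_alive_fds[0], F_SETFL, O_NONBLOCK);
+}
+
+/* called from a child whenever it wants to know whether the parent is gone */
+static bool
+example_parent_is_alive(void)
+{
+	char		c;
+	ssize_t		rc = read(example_alive_fds[0], &c, 1);
+
+	if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+		return true;			/* no data and no EOF: parent still alive */
+	return false;				/* EOF, unexpected data, or error */
+}
+#endif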
+
+/*
+ * PostmasterDeathSignalInit - request signal on postmaster death if possible
+ */
+void
+PostmasterDeathSignalInit(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ int signum = POSTMASTER_DEATH_SIGNAL;
+
+ /* Register our signal handler. */
+ pqsignal(signum, postmaster_death_handler);
+
+ /* Request a signal on parent exit. */
+#if defined(PR_SET_PDEATHSIG)
+ if (prctl(PR_SET_PDEATHSIG, signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#elif defined(PROC_PDEATHSIG_CTL)
+ if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#else
+#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal"
+#endif
+
+ /*
+ * Just in case the parent was gone already and we missed it, we'd better
+ * check the slow way on the first call.
+ */
+ postmaster_possibly_dead = true;
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+}
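+
+/*
+ * Editor's note: illustrative sketch, not part of the original patch.  The
+ * comment on PostmasterIsAliveInternal() above refers to a fast path in the
+ * caller; that caller lives in storage/pmsignal.h, which is outside this
+ * hunk.  The sketch below shows only the intended shape of that split: on
+ * platforms with a parent-death signal, the common case is a cheap flag
+ * test, and the pipe/handle check runs only after the flag has been raised.
+ */
+#ifdef PMSIGNAL_EXAMPLE_SKETCH		/* never defined; illustration only */
+static inline bool
+example_postmaster_is_alive(void)
+{
+#ifndef USE_POSTMASTER_DEATH_SIGNAL
+	return PostmasterIsAliveInternal();
+#else
+	if (!postmaster_possibly_dead)
+		return true;			/* cheap check, no system call */
+	return PostmasterIsAliveInternal();	/* slow path; may re-arm the flag */
+#endif
+}
+#endif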
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
new file mode 100644
index 0000000..e630302
--- /dev/null
+++ b/src/backend/storage/ipc/procarray.c
@@ -0,0 +1,5224 @@
+/*-------------------------------------------------------------------------
+ *
+ * procarray.c
+ * POSTGRES process array code.
+ *
+ *
+ * This module maintains arrays of PGPROC substructures, as well as associated
+ * arrays in ProcGlobal, for all active backends. Although there are several
+ * uses for this, the principal one is as a means of determining the set of
+ * currently running transactions.
+ *
+ * Because of various subtle race conditions it is critical that a backend
+ * hold the correct locks while setting or clearing its xid (in
+ * ProcGlobal->xids[]/MyProc->xid). See notes in
+ * src/backend/access/transam/README.
+ *
+ * The process arrays now also include structures representing prepared
+ * transactions. The xid and subxids fields of these are valid, as are the
+ * myProcLocks lists. They can be distinguished from regular backend PGPROCs
+ * at need by checking for pid == 0.
+ *
+ * During hot standby, we also keep a list of XIDs representing transactions
+ * that are known to be running on the primary (or more precisely, were running
+ * as of the current point in the WAL stream). This list is kept in the
+ * KnownAssignedXids array, and is updated by watching the sequence of
+ * arriving XIDs. This is necessary because if we leave those XIDs out of
+ * snapshots taken for standby queries, then they will appear to be already
+ * complete, leading to MVCC failures. Note that in hot standby, the PGPROC
+ * array represents standby processes, which by definition are not running
+ * transactions that have XIDs.
+ *
+ * It is perhaps possible for a backend on the primary to terminate without
+ * writing an abort record for its transaction. While that shouldn't really
+ * happen, it would tie up KnownAssignedXids indefinitely, so we protect
+ * ourselves by pruning the array when a valid list of running XIDs arrives.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procarray.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "access/clog.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlogutils.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_authid.h"
+#include "commands/dbcommands.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/pg_lfind.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var))))
+
+/* Our shared memory area */
+typedef struct ProcArrayStruct
+{
+ int numProcs; /* number of valid procs entries */
+ int maxProcs; /* allocated size of procs array */
+
+ /*
+ * Known assigned XIDs handling
+ */
+ int maxKnownAssignedXids; /* allocated size of array */
+ int numKnownAssignedXids; /* current # of valid entries */
+ int tailKnownAssignedXids; /* index of oldest valid element */
+ int headKnownAssignedXids; /* index of newest element, + 1 */
+ slock_t known_assigned_xids_lck; /* protects head/tail pointers */
+
+ /*
+ * Highest subxid that has been removed from KnownAssignedXids array to
+ * prevent overflow; or InvalidTransactionId if none. We track this for
+ * similar reasons to tracking overflowing cached subxids in PGPROC
+ * entries. Must hold exclusive ProcArrayLock to change this, and shared
+ * lock to read it.
+ */
+ TransactionId lastOverflowedXid;
+
+ /* oldest xmin of any replication slot */
+ TransactionId replication_slot_xmin;
+ /* oldest catalog xmin of any replication slot */
+ TransactionId replication_slot_catalog_xmin;
+
+ /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
+ int pgprocnos[FLEXIBLE_ARRAY_MEMBER];
+} ProcArrayStruct;
+
+/*
+ * State for the GlobalVisTest* family of functions. Those functions can
+ * e.g. be used to decide if a deleted row can be removed without violating
+ * MVCC semantics: If the deleted row's xmax is not considered to be running
+ * by anyone, the row can be removed.
+ *
+ * To avoid slowing down GetSnapshotData(), we don't calculate a precise
+ * cutoff XID while building a snapshot (looking at the frequently changing
+ * xmins scales badly). Instead we compute two boundaries while building the
+ * snapshot:
+ *
+ * 1) definitely_needed, indicating that rows deleted by XIDs >=
+ * definitely_needed are definitely still visible.
+ *
+ * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
+ * definitely be removed
+ *
+ * When testing an XID that falls in between the two (i.e. XID >= maybe_needed
+ * && XID < definitely_needed), the boundaries can be recomputed (using
+ * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
+ * maintaining an accurate value all the time.
+ *
+ * As it is not cheap to compute accurate boundaries, we limit the number of
+ * times that happens in short succession. See GlobalVisTestShouldUpdate().
+ *
+ *
+ * There are four backend lifetime instances of this struct, optimized for
+ * different types of relations. As e.g. a normal user defined table in one
+ * database is inaccessible to backends connected to another database, a test
+ * specific to a relation can be more aggressive than a test for a shared
+ * relation. Currently we track four different states:
+ *
+ * 1) GlobalVisSharedRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in any database, nor a
+ * replication slot's xmin, nor a replication slot's catalog_xmin might
+ * still consider XID as running.
+ *
+ * 2) GlobalVisCatalogRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin, nor a replication slot's
+ * catalog_xmin might still consider XID as running.
+ *
+ * I.e. the difference to GlobalVisSharedRels is that
+ * snapshots in other databases are ignored.
+ *
+ * 3) GlobalVisDataRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin consider XID as running.
+ *
+ * I.e. the difference to GlobalVisCatalogRels is that
+ * replication slot's catalog_xmin is not taken into account.
+ *
+ * 4) GlobalVisTempRels, which only considers the current session, as temp
+ * tables are not visible to other sessions.
+ *
+ * GlobalVisTestFor(relation) returns the appropriate state
+ * for the relation.
+ *
+ * The boundaries are FullTransactionIds instead of TransactionIds to avoid
+ * wraparound dangers. There e.g. would otherwise exist no procarray state to
+ * prevent maybe_needed from becoming old enough after the GetSnapshotData()
+ * call.
+ *
+ * The typedef is in the header.
+ */
+struct GlobalVisState
+{
+ /* XIDs >= are considered running by some backend */
+ FullTransactionId definitely_needed;
+
+ /* XIDs < are not considered to be running by any backend */
+ FullTransactionId maybe_needed;
+};
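+
+/*
+ * Editor's note: illustrative sketch of how a caller consumes the two
+ * boundaries above; it is not part of the original patch, and the real test
+ * functions appear later in this file.  The expensive recomputation is
+ * represented by a hypothetical example_refresh_boundaries() so that only
+ * the control flow is shown: below maybe_needed the deleting XID is
+ * certainly not needed by anyone, at or above definitely_needed it certainly
+ * is, and only the band in between forces a recomputation.
+ */
+#ifdef PROCARRAY_EXAMPLE_SKETCH		/* never defined; illustration only */
+static bool
+example_xid_is_removable(GlobalVisState *state, FullTransactionId fxid)
+{
+	if (FullTransactionIdPrecedes(fxid, state->maybe_needed))
+		return true;			/* nobody can still consider it running */
+	if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed))
+		return false;			/* somebody may still consider it running */
+
+	/* in between: recompute accurate boundaries, then test once more */
+	example_refresh_boundaries(state); /* hypothetical; cf. ComputeXidHorizons() */
+	return FullTransactionIdPrecedes(fxid, state->maybe_needed);
+}
+#endif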
+
+/*
+ * Result of ComputeXidHorizons().
+ */
+typedef struct ComputeXidHorizonsResult
+{
+ /*
+ * The value of ShmemVariableCache->latestCompletedXid when
+ * ComputeXidHorizons() held ProcArrayLock.
+ */
+ FullTransactionId latest_completed;
+
+ /*
+ * The same for procArray->replication_slot_xmin and
+ * procArray->replication_slot_catalog_xmin.
+ */
+ TransactionId slot_xmin;
+ TransactionId slot_catalog_xmin;
+
+ /*
+ * Oldest xid that any backend might still consider running. This needs to
+ * include processes running VACUUM, in contrast to the normal visibility
+ * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when
+ * determining visibility, but doesn't care about rows above its xmin to
+ * be removed.
+ *
+ * This likely should only be needed to determine whether pg_subtrans can
+ * be truncated. It currently includes the effects of replication slots,
+ * for historical reasons. But that could likely be changed.
+ */
+ TransactionId oldest_considered_running;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in shared
+ * tables.
+ *
+ * This includes the effects of replication slots. If that's not desired,
+ * look at shared_oldest_nonremovable_raw.
+ */
+ TransactionId shared_oldest_nonremovable;
+
+ /*
+ * Oldest xid that may be necessary to retain in shared tables. This is
+ * the same as shared_oldest_nonremovable, except that it is not affected by
+ * replication slot's catalog_xmin.
+ *
+ * This is mainly useful to be able to send the catalog_xmin to upstream
+ * streaming replication servers via hot_standby_feedback, so they can
+ * apply the limit only when accessing catalog tables.
+ */
+ TransactionId shared_oldest_nonremovable_raw;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in non-shared
+ * catalog tables.
+ */
+ TransactionId catalog_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in normal user
+ * defined tables.
+ */
+ TransactionId data_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in this
+ * session's temporary tables.
+ */
+ TransactionId temp_oldest_nonremovable;
+} ComputeXidHorizonsResult;
+
+/*
+ * Return value for GlobalVisHorizonKindForRel().
+ */
+typedef enum GlobalVisHorizonKind
+{
+ VISHORIZON_SHARED,
+ VISHORIZON_CATALOG,
+ VISHORIZON_DATA,
+ VISHORIZON_TEMP
+} GlobalVisHorizonKind;
+
+/*
+ * Reason codes for KnownAssignedXidsCompress().
+ */
+typedef enum KAXCompressReason
+{
+ KAX_NO_SPACE, /* need to free up space at array end */
+ KAX_PRUNE, /* we just pruned old entries */
+ KAX_TRANSACTION_END, /* we just committed/removed some XIDs */
+ KAX_STARTUP_PROCESS_IDLE /* startup process is about to sleep */
+} KAXCompressReason;
+
+
+static ProcArrayStruct *procArray;
+
+static PGPROC *allProcs;
+
+/*
+ * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress()
+ */
+static TransactionId cachedXidIsNotInProgress = InvalidTransactionId;
+
+/*
+ * Bookkeeping for tracking emulated transactions in recovery
+ */
+static TransactionId *KnownAssignedXids;
+static bool *KnownAssignedXidsValid;
+static TransactionId latestObservedXid = InvalidTransactionId;
+
+/*
+ * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
+ * the highest xid that might still be running that we don't have in
+ * KnownAssignedXids.
+ */
+static TransactionId standbySnapshotPendingXmin;
+
+/*
+ * State for visibility checks on different types of relations. See struct
+ * GlobalVisState for details. As shared, catalog, normal and temporary
+ * relations can have different horizons, one such state exists for each.
+ */
+static GlobalVisState GlobalVisSharedRels;
+static GlobalVisState GlobalVisCatalogRels;
+static GlobalVisState GlobalVisDataRels;
+static GlobalVisState GlobalVisTempRels;
+
+/*
+ * This backend's RecentXmin at the last time the accurate xmin horizon was
+ * recomputed, or InvalidTransactionId if it has not. Used to limit how many
+ * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate().
+ */
+static TransactionId ComputeXidHorizonsResultLastXmin;
+
+#ifdef XIDCACHE_DEBUG
+
+/* counters for XidCache measurement */
+static long xc_by_recent_xmin = 0;
+static long xc_by_known_xact = 0;
+static long xc_by_my_xact = 0;
+static long xc_by_latest_xid = 0;
+static long xc_by_main_xid = 0;
+static long xc_by_child_xid = 0;
+static long xc_by_known_assigned = 0;
+static long xc_no_overflow = 0;
+static long xc_slow_answer = 0;
+
+#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
+#define xc_by_known_xact_inc() (xc_by_known_xact++)
+#define xc_by_my_xact_inc() (xc_by_my_xact++)
+#define xc_by_latest_xid_inc() (xc_by_latest_xid++)
+#define xc_by_main_xid_inc() (xc_by_main_xid++)
+#define xc_by_child_xid_inc() (xc_by_child_xid++)
+#define xc_by_known_assigned_inc() (xc_by_known_assigned++)
+#define xc_no_overflow_inc() (xc_no_overflow++)
+#define xc_slow_answer_inc() (xc_slow_answer++)
+
+static void DisplayXidCache(void);
+#else /* !XIDCACHE_DEBUG */
+
+#define xc_by_recent_xmin_inc() ((void) 0)
+#define xc_by_known_xact_inc() ((void) 0)
+#define xc_by_my_xact_inc() ((void) 0)
+#define xc_by_latest_xid_inc() ((void) 0)
+#define xc_by_main_xid_inc() ((void) 0)
+#define xc_by_child_xid_inc() ((void) 0)
+#define xc_by_known_assigned_inc() ((void) 0)
+#define xc_no_overflow_inc() ((void) 0)
+#define xc_slow_answer_inc() ((void) 0)
+#endif /* XIDCACHE_DEBUG */
+
+/* Primitives for KnownAssignedXids array handling for standby */
+static void KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock);
+static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock);
+static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
+static bool KnownAssignedXidExists(TransactionId xid);
+static void KnownAssignedXidsRemove(TransactionId xid);
+static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+ TransactionId *subxids);
+static void KnownAssignedXidsRemovePreceding(TransactionId removeXid);
+static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
+static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
+ TransactionId *xmin,
+ TransactionId xmax);
+static TransactionId KnownAssignedXidsGetOldestXmin(void);
+static void KnownAssignedXidsDisplay(int trace_level);
+static void KnownAssignedXidsReset(void);
+static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid);
+static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
+static void MaintainLatestCompletedXid(TransactionId latestXid);
+static void MaintainLatestCompletedXidRecovery(TransactionId latestXid);
+
+static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel,
+ TransactionId xid);
+static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons);
+
+/*
+ * Report shared-memory space needed by CreateSharedProcArray.
+ */
+Size
+ProcArrayShmemSize(void)
+{
+ Size size;
+
+ /* Size of the ProcArray structure itself */
+#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
+
+ size = offsetof(ProcArrayStruct, pgprocnos);
+ size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS));
+
+ /*
+ * During Hot Standby processing we have a data structure called
+ * KnownAssignedXids, created in shared memory. Local data structures are
+ * also created in various backends during GetSnapshotData(),
+ * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the
+ * main structures created in those functions must be identically sized,
+ * since we may at times copy the whole of the data structures around. We
+ * refer to this size as TOTAL_MAX_CACHED_SUBXIDS.
+ *
+ * Ideally we'd only create this structure if we were actually doing hot
+ * standby in the current run, but we don't know that yet at the time
+ * shared memory is being set up.
+ */
+#define TOTAL_MAX_CACHED_SUBXIDS \
+ ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+
+ if (EnableHotStandby)
+ {
+ size = add_size(size,
+ mul_size(sizeof(TransactionId),
+ TOTAL_MAX_CACHED_SUBXIDS));
+ size = add_size(size,
+ mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
+ }
+
+ return size;
+}
+
+/*
+ * Initialize the shared PGPROC array during postmaster startup.
+ */
+void
+CreateSharedProcArray(void)
+{
+ bool found;
+
+ /* Create or attach to the ProcArray shared structure */
+ procArray = (ProcArrayStruct *)
+ ShmemInitStruct("Proc Array",
+ add_size(offsetof(ProcArrayStruct, pgprocnos),
+ mul_size(sizeof(int),
+ PROCARRAY_MAXPROCS)),
+ &found);
+
+ if (!found)
+ {
+ /*
+ * We're the first - initialize.
+ */
+ procArray->numProcs = 0;
+ procArray->maxProcs = PROCARRAY_MAXPROCS;
+ procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
+ procArray->numKnownAssignedXids = 0;
+ procArray->tailKnownAssignedXids = 0;
+ procArray->headKnownAssignedXids = 0;
+ SpinLockInit(&procArray->known_assigned_xids_lck);
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ procArray->replication_slot_xmin = InvalidTransactionId;
+ procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+ ShmemVariableCache->xactCompletionCount = 1;
+ }
+
+ allProcs = ProcGlobal->allProcs;
+
+ /* Create or attach to the KnownAssignedXids arrays too, if needed */
+ if (EnableHotStandby)
+ {
+ KnownAssignedXids = (TransactionId *)
+ ShmemInitStruct("KnownAssignedXids",
+ mul_size(sizeof(TransactionId),
+ TOTAL_MAX_CACHED_SUBXIDS),
+ &found);
+ KnownAssignedXidsValid = (bool *)
+ ShmemInitStruct("KnownAssignedXidsValid",
+ mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
+ &found);
+ }
+}
+
+/*
+ * Add the specified PGPROC to the shared array.
+ */
+void
+ProcArrayAdd(PGPROC *proc)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+ int movecount;
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ if (arrayP->numProcs >= arrayP->maxProcs)
+ {
+ /*
+ * Oops, no room. (This really shouldn't happen, since there is a
+ * fixed supply of PGPROC structs too, and so we should have failed
+ * earlier.)
+ */
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Keep the procs array sorted by (PGPROC *) so that we can utilize
+ * locality of references much better. This is useful while traversing the
+ * ProcArray because there is an increased likelihood of finding the next
+ * PGPROC structure in the cache.
+ *
+ * Since the occurrence of adding/removing a proc is much lower than the
+ * access to the ProcArray itself, the overhead should be marginal
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index);
+
+ /* If we have found our right position in the array, break */
+ if (arrayP->pgprocnos[index] > proc->pgprocno)
+ break;
+ }
+
+ movecount = arrayP->numProcs - index;
+ memmove(&arrayP->pgprocnos[index + 1],
+ &arrayP->pgprocnos[index],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[index + 1],
+ &ProcGlobal->xids[index],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[index + 1],
+ &ProcGlobal->subxidStates[index],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[index + 1],
+ &ProcGlobal->statusFlags[index],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[index] = proc->pgprocno;
+ proc->pgxactoff = index;
+ ProcGlobal->xids[index] = proc->xid;
+ ProcGlobal->subxidStates[index] = proc->subxidStatus;
+ ProcGlobal->statusFlags[index] = proc->statusFlags;
+
+ arrayP->numProcs++;
+
+ /* adjust pgxactoff for all following PGPROCs */
+ index++;
+ for (; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index - 1);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
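+
+/*
+ * Editor's note: generic, illustrative sketch of the insertion pattern used
+ * above, not part of the original patch.  The slot is found by comparison,
+ * every parallel dense array is shifted up by one element with memmove(),
+ * and the new entry is written into the hole.  The keys[]/payload[] names
+ * are hypothetical, and the caller is assumed to have checked capacity.
+ */
+#ifdef PROCARRAY_EXAMPLE_SKETCH		/* never defined; illustration only */
+static void
+example_sorted_insert(int *keys, int *payload, int *nused, int key, int value)
+{
+	int			pos;
+	int			movecount;
+
+	/* find the first slot whose key is larger than the new one */
+	for (pos = 0; pos < *nused; pos++)
+	{
+		if (keys[pos] > key)
+			break;
+	}
+
+	/* shift both parallel arrays up by one element */
+	movecount = *nused - pos;
+	memmove(&keys[pos + 1], &keys[pos], movecount * sizeof(*keys));
+	memmove(&payload[pos + 1], &payload[pos], movecount * sizeof(*payload));
+
+	keys[pos] = key;
+	payload[pos] = value;
+	(*nused)++;
+}
+#endif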
+
+/*
+ * Remove the specified PGPROC from the shared array.
+ *
+ * When latestXid is a valid XID, we are removing a live 2PC gxact from the
+ * array, and thus causing it to appear as "not running" anymore. In this
+ * case we must advance latestCompletedXid. (This is essentially the same
+ * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
+ * the ProcArrayLock only once, and don't damage the content of the PGPROC;
+ * twophase.c depends on the latter.)
+ */
+void
+ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int myoff;
+ int movecount;
+
+#ifdef XIDCACHE_DEBUG
+ /* dump stats at backend shutdown, but not prepared-xact end */
+ if (proc->pid != 0)
+ DisplayXidCache();
+#endif
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ myoff = proc->pgxactoff;
+
+ Assert(myoff >= 0 && myoff < arrayP->numProcs);
+ Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff);
+
+ if (TransactionIdIsValid(latestXid))
+ {
+ Assert(TransactionIdIsValid(ProcGlobal->xids[myoff]));
+
+ /* Advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ ProcGlobal->xids[myoff] = InvalidTransactionId;
+ ProcGlobal->subxidStates[myoff].overflowed = false;
+ ProcGlobal->subxidStates[myoff].count = 0;
+ }
+ else
+ {
+ /* Shouldn't be trying to remove a live transaction here */
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ }
+
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ Assert(ProcGlobal->subxidStates[myoff].count == 0);
+ Assert(ProcGlobal->subxidStates[myoff].overflowed == false);
+
+ ProcGlobal->statusFlags[myoff] = 0;
+
+ /* Keep the PGPROC array sorted. See notes above */
+ movecount = arrayP->numProcs - myoff - 1;
+ memmove(&arrayP->pgprocnos[myoff],
+ &arrayP->pgprocnos[myoff + 1],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[myoff],
+ &ProcGlobal->xids[myoff + 1],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[myoff],
+ &ProcGlobal->subxidStates[myoff + 1],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[myoff],
+ &ProcGlobal->statusFlags[myoff + 1],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */
+ arrayP->numProcs--;
+
+ /*
+ * Adjust pgxactoff of following procs for removed PGPROC (note that
+ * numProcs already has been decremented).
+ */
+ for (int index = myoff; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff - 1 == index);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * ProcArrayEndTransaction -- mark a transaction as no longer running
+ *
+ * This is used interchangeably for commit and abort cases. The transaction
+ * commit/abort must already be reported to WAL and pg_xact.
+ *
+ * proc is currently always MyProc, but we pass it explicitly for flexibility.
+ * latestXid is the latest Xid among the transaction's main XID and
+ * subtransactions, or InvalidTransactionId if it has no XID. (We must ask
+ * the caller to pass latestXid, instead of computing it from the PGPROC's
+ * contents, because the subxid information in the PGPROC might be
+ * incomplete.)
+ */
+void
+ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
+{
+ if (TransactionIdIsValid(latestXid))
+ {
+ /*
+ * We must lock ProcArrayLock while clearing our advertised XID, so
+ * that we do not exit the set of "running" transactions while someone
+ * else is taking a snapshot. See discussion in
+ * src/backend/access/transam/README.
+ */
+ Assert(TransactionIdIsValid(proc->xid));
+
+ /*
+ * If we can immediately acquire ProcArrayLock, we clear our own XID
+ * and release the lock. If not, use group XID clearing to improve
+ * efficiency.
+ */
+ if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
+ {
+ ProcArrayEndTransactionInternal(proc, latestXid);
+ LWLockRelease(ProcArrayLock);
+ }
+ else
+ ProcArrayGroupClearXid(proc, latestXid);
+ }
+ else
+ {
+ /*
+ * If we have no XID, we don't need to lock, since we won't affect
+ * anyone else's calculation of a snapshot. We might change their
+ * estimate of global xmin, but that's OK.
+ */
+ Assert(!TransactionIdIsValid(proc->xid));
+ Assert(proc->subxidStatus.count == 0);
+ Assert(!proc->subxidStatus.overflowed);
+
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+
+ /* be sure this is cleared in abort */
+ proc->delayChkptFlags = 0;
+
+ proc->recoveryConflictPending = false;
+
+ /* must be cleared with xid/xmin: */
+ /* avoid unnecessarily dirtying shared cachelines */
+ if (proc->statusFlags & PROC_VACUUM_STATE_MASK)
+ {
+ Assert(!LWLockHeldByMe(ProcArrayLock));
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]);
+ proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+ ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
+ LWLockRelease(ProcArrayLock);
+ }
+ }
+}
+
+/*
+ * Mark a write transaction as no longer running.
+ *
+ * We don't do any locking here; caller must handle that.
+ */
+static inline void
+ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
+{
+ int pgxactoff = proc->pgxactoff;
+
+ /*
+ * Note: we need exclusive lock here because we're going to change other
+ * processes' PGPROC entries.
+ */
+ Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
+ Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff]));
+ Assert(ProcGlobal->xids[pgxactoff] == proc->xid);
+
+ ProcGlobal->xids[pgxactoff] = InvalidTransactionId;
+ proc->xid = InvalidTransactionId;
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+
+ /* be sure this is cleared in abort */
+ proc->delayChkptFlags = 0;
+
+ proc->recoveryConflictPending = false;
+
+ /* must be cleared with xid/xmin: */
+ /* avoid unnecessarily dirtying shared cachelines */
+ if (proc->statusFlags & PROC_VACUUM_STATE_MASK)
+ {
+ proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+ ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
+ }
+
+ /* Clear the subtransaction-XID cache too while holding the lock */
+ Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
+ ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);
+ if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed)
+ {
+ ProcGlobal->subxidStates[pgxactoff].count = 0;
+ ProcGlobal->subxidStates[pgxactoff].overflowed = false;
+ proc->subxidStatus.count = 0;
+ proc->subxidStatus.overflowed = false;
+ }
+
+ /* Also advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+}
+
+/*
+ * ProcArrayGroupClearXid -- group XID clearing
+ *
+ * When we cannot immediately acquire ProcArrayLock in exclusive mode at
+ * commit time, add ourselves to a list of processes that need their XIDs
+ * cleared. The first process to add itself to the list will acquire
+ * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
+ * on behalf of all group members. This avoids a great deal of contention
+ * around ProcArrayLock when many processes are trying to commit at once,
+ * since the lock need not be repeatedly handed off from one committing
+ * process to the next.
+ */
+static void
+ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
+{
+ PROC_HDR *procglobal = ProcGlobal;
+ uint32 nextidx;
+ uint32 wakeidx;
+
+ /* We should definitely have an XID to clear. */
+ Assert(TransactionIdIsValid(proc->xid));
+
+ /* Add ourselves to the list of processes needing a group XID clear. */
+ proc->procArrayGroupMember = true;
+ proc->procArrayGroupMemberXid = latestXid;
+ nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
+ while (true)
+ {
+ pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
+
+ if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
+ &nextidx,
+ (uint32) proc->pgprocno))
+ break;
+ }
+
+ /*
+ * If the list was not empty, the leader will clear our XID. It is
+ * impossible to have followers without a leader because the first process
+ * that has added itself to the list will always have nextidx as
+ * INVALID_PGPROCNO.
+ */
+ if (nextidx != INVALID_PGPROCNO)
+ {
+ int extraWaits = 0;
+
+ /* Sleep until the leader clears our XID. */
+ pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE);
+ for (;;)
+ {
+ /* acts as a read barrier */
+ PGSemaphoreLock(proc->sem);
+ if (!proc->procArrayGroupMember)
+ break;
+ extraWaits++;
+ }
+ pgstat_report_wait_end();
+
+ Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Fix semaphore count for any absorbed wakeups */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+ return;
+ }
+
+ /* We are the leader. Acquire the lock on behalf of everyone. */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Now that we've got the lock, clear the list of processes waiting for
+ * group XID clearing, saving a pointer to the head of the list. Trying
+ * to pop elements one at a time could lead to an ABA problem.
+ */
+ nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst,
+ INVALID_PGPROCNO);
+
+ /* Remember head of list so we can perform wakeups after dropping lock. */
+ wakeidx = nextidx;
+
+ /* Walk the list and clear all XIDs. */
+ while (nextidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[nextidx];
+
+ ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid);
+
+ /* Move to next proc in list. */
+ nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ }
+
+ /* We're done with the lock now. */
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Now that we've released the lock, go back and wake everybody up. We
+ * don't do this under the lock so as to keep lock hold times to a
+ * minimum. The system calls we need to perform to wake other processes
+ * up are probably much slower than the simple memory writes we did while
+ * holding the lock.
+ */
+ while (wakeidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[wakeidx];
+
+ wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO);
+
+ /* ensure all previous writes are visible before follower continues. */
+ pg_write_barrier();
+
+ nextproc->procArrayGroupMember = false;
+
+ if (nextproc != MyProc)
+ PGSemaphoreUnlock(nextproc->sem);
+ }
+}
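+
+/*
+ * Editor's note: condensed, illustrative sketch of the leader/follower
+ * pattern implemented above; it is not part of the original patch and elides
+ * the sleeping, waking and the actual work.  Waiters push themselves onto a
+ * lock-free intrusive list with compare-and-exchange; whoever finds the list
+ * empty becomes the leader, detaches the whole list with one atomic exchange
+ * (avoiding ABA problems), and performs the batched work under the lock for
+ * everyone at once.
+ */
+#ifdef PROCARRAY_EXAMPLE_SKETCH		/* never defined; illustration only */
+static void
+example_group_operation(PGPROC *proc)
+{
+	uint32		nextidx = pg_atomic_read_u32(&ProcGlobal->procArrayGroupFirst);
+
+	/* push ourselves onto the list; retry until the CAS succeeds */
+	do
+	{
+		pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
+	} while (!pg_atomic_compare_exchange_u32(&ProcGlobal->procArrayGroupFirst,
+											 &nextidx,
+											 (uint32) proc->pgprocno));
+
+	if (nextidx != INVALID_PGPROCNO)
+		return;					/* follower: sleep until the leader is done */
+
+	/* leader: detach the entire list at once, then walk and process it */
+	nextidx = pg_atomic_exchange_u32(&ProcGlobal->procArrayGroupFirst,
+									 INVALID_PGPROCNO);
+	/* ... perform the batched work and wake each follower ... */
+}
+#endif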
+
+/*
+ * ProcArrayClearTransaction -- clear the transaction fields
+ *
+ * This is used after successfully preparing a 2-phase transaction. We are
+ * not actually reporting the transaction's XID as no longer running --- it
+ * will still appear as running because the 2PC's gxact is in the ProcArray
+ * too. We just have to clear out our own PGPROC.
+ */
+void
+ProcArrayClearTransaction(PGPROC *proc)
+{
+ int pgxactoff;
+
+ /*
+ * Currently we need to lock ProcArrayLock exclusively here, as we
+ * increment xactCompletionCount below. We also need it at least in shared
+ * mode for pgproc->pgxactoff to stay the same below.
+ *
+ * We could however, as this action does not actually change anyone's view
+ * of the set of running XIDs (our entry duplicates the gxact that
+ * has already been inserted into the ProcArray), lower the lock level to
+ * shared if we were to make xactCompletionCount an atomic variable. But
+ * that doesn't seem worth it currently, as a 2PC commit is heavyweight
+ * enough for this not to be the bottleneck. If it ever becomes a
+ * bottleneck, it may also be worth considering combining this with the
+ * subsequent ProcArrayRemove().
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ pgxactoff = proc->pgxactoff;
+
+ ProcGlobal->xids[pgxactoff] = InvalidTransactionId;
+ proc->xid = InvalidTransactionId;
+
+ proc->lxid = InvalidLocalTransactionId;
+ proc->xmin = InvalidTransactionId;
+ proc->recoveryConflictPending = false;
+
+ Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK));
+ Assert(!proc->delayChkptFlags);
+
+ /*
+ * Need to increment completion count even though transaction hasn't
+ * really committed yet. The reason for that is that GetSnapshotData()
+ * omits the xid of the current transaction, thus without the increment we
+ * otherwise could end up reusing the snapshot later. Which would be bad,
+ * because it might not count the prepared transaction as running.
+ */
+ ShmemVariableCache->xactCompletionCount++;
+
+ /* Clear the subtransaction-XID cache too */
+ Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count &&
+ ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed);
+ if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed)
+ {
+ ProcGlobal->subxidStates[pgxactoff].count = 0;
+ ProcGlobal->subxidStates[pgxactoff].overflowed = false;
+ proc->subxidStatus.count = 0;
+ proc->subxidStatus.overflowed = false;
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * Update ShmemVariableCache->latestCompletedXid to point to latestXid if
+ * currently older.
+ */
+static void
+MaintainLatestCompletedXid(TransactionId latestXid)
+{
+ FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid;
+
+ Assert(FullTransactionIdIsValid(cur_latest));
+ Assert(!RecoveryInProgress());
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid))
+ {
+ ShmemVariableCache->latestCompletedXid =
+ FullXidRelativeTo(cur_latest, latestXid);
+ }
+
+ Assert(IsBootstrapProcessingMode() ||
+ FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
+
+/*
+ * Same as MaintainLatestCompletedXid, except for use during WAL replay.
+ */
+static void
+MaintainLatestCompletedXidRecovery(TransactionId latestXid)
+{
+ FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid;
+ FullTransactionId rel;
+
+ Assert(AmStartupProcess() || !IsUnderPostmaster);
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Need a FullTransactionId to compare latestXid with. Can't rely on
+ * latestCompletedXid to be initialized in recovery. But in recovery it's
+ * safe to access nextXid without a lock for the startup process.
+ */
+ rel = ShmemVariableCache->nextXid;
+ Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+ if (!FullTransactionIdIsValid(cur_latest) ||
+ TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid))
+ {
+ ShmemVariableCache->latestCompletedXid =
+ FullXidRelativeTo(rel, latestXid);
+ }
+
+ Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
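+
+/*
+ * Editor's note: illustrative sketch, not part of the original patch.  Both
+ * functions above rely on FullXidRelativeTo() (defined later in this file)
+ * to widen a 32-bit xid to 64 bits relative to a known-recent reference.
+ * The arithmetic below shows one way such a widening is typically done:
+ * provided the xid lies within 2^31 of the reference, adding the signed
+ * 32-bit difference selects the correct epoch automatically, wraparound
+ * included.  The example_* name and plain integer types are assumptions.
+ */
+#ifdef PROCARRAY_EXAMPLE_SKETCH		/* never defined; illustration only */
+static inline uint64
+example_widen_xid(uint64 ref_full, uint32 xid)
+{
+	uint32		ref_xid = (uint32) ref_full;
+
+	/* the (int32) cast makes the delta signed, so "older" xids move back */
+	return ref_full + (int32) (xid - ref_xid);
+}
+#endif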
+
+/*
+ * ProcArrayInitRecovery -- initialize recovery xid mgmt environment
+ *
+ * Remember up to where the startup process initialized the CLOG and subtrans
+ * so we can ensure it's initialized gaplessly up to the point where necessary
+ * while in recovery.
+ */
+void
+ProcArrayInitRecovery(TransactionId initializedUptoXID)
+{
+ Assert(standbyState == STANDBY_INITIALIZED);
+ Assert(TransactionIdIsNormal(initializedUptoXID));
+
+ /*
+ * we set latestObservedXid to the xid SUBTRANS has been initialized up
+ * to, so we can extend it from that point onwards in
+ * RecordKnownAssignedTransactionIds, and when we get consistent in
+ * ProcArrayApplyRecoveryInfo().
+ */
+ latestObservedXid = initializedUptoXID;
+ TransactionIdRetreat(latestObservedXid);
+}
+
+/*
+ * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
+ *
+ * Takes us through 3 states: Initialized, Pending and Ready.
+ * Normal case is to go all the way to Ready straight away, though there
+ * are atypical cases where we need to take it in steps.
+ *
+ * Use the data about running transactions on the primary to create the initial
+ * state of KnownAssignedXids. We also use these records to regularly prune
+ * KnownAssignedXids because we know it is possible that some transactions
+ * with FATAL errors fail to write abort records, which could cause eventual
+ * overflow.
+ *
+ * See comments for LogStandbySnapshot().
+ */
+void
+ProcArrayApplyRecoveryInfo(RunningTransactions running)
+{
+ TransactionId *xids;
+ int nxids;
+ int i;
+
+ Assert(standbyState >= STANDBY_INITIALIZED);
+ Assert(TransactionIdIsValid(running->nextXid));
+ Assert(TransactionIdIsValid(running->oldestRunningXid));
+ Assert(TransactionIdIsNormal(running->latestCompletedXid));
+
+ /*
+ * Remove stale transactions, if any.
+ */
+ ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
+
+ /*
+ * Remove stale locks, if any.
+ */
+ StandbyReleaseOldLocks(running->oldestRunningXid);
+
+ /*
+ * If our snapshot is already valid, nothing else to do...
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ return;
+
+ /*
+ * If our initial RunningTransactionsData had an overflowed snapshot then
+ * we knew we were missing some subxids from our snapshot. If we continue
+ * to see overflowed snapshots then we might never be able to start up, so
+ * we make another test to see if our snapshot is now valid. We know that
+ * the missing subxids are equal to or earlier than nextXid. After we
+ * initialise we continue to apply changes during recovery, so once the
+ * oldestRunningXid is later than the nextXid from the initial snapshot we
+ * know that we no longer have missing information and can mark the
+ * snapshot as valid.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_PENDING)
+ {
+ /*
+ * If the snapshot isn't overflowed or if it's empty, we can reset our
+ * pending state and use this snapshot instead.
+ */
+ if (!running->subxid_overflow || running->xcnt == 0)
+ {
+ /*
+ * If we have already collected known assigned xids, we need to
+ * throw them away before we apply the recovery snapshot.
+ */
+ KnownAssignedXidsReset();
+ standbyState = STANDBY_INITIALIZED;
+ }
+ else
+ {
+ if (TransactionIdPrecedes(standbySnapshotPendingXmin,
+ running->oldestRunningXid))
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshots are now enabled");
+ }
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+ return;
+ }
+ }
+
+ Assert(standbyState == STANDBY_INITIALIZED);
+
+ /*
+ * NB: this can be reached at least twice, so make sure new code can deal
+ * with that.
+ */
+
+ /*
+ * Nobody else is running yet, but take locks anyhow
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * KnownAssignedXids is sorted so we cannot just add the xids, we have to
+ * sort them first.
+ *
+ * Some of the new xids are top-level xids and some are subtransactions.
+ * We don't call SubTransSetParent because it doesn't matter yet. If we
+ * aren't overflowed then all xids will fit in snapshot and so we don't
+ * need subtrans. If we later overflow, an xid assignment record will add
+ * xids to subtrans. If RunningTransactionsData is overflowed then we
+ * don't have enough information to correctly update subtrans anyway.
+ */
+
+ /*
+ * Allocate a temporary array to avoid modifying the array passed as
+ * argument.
+ */
+ xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
+
+ /*
+ * Add to the temp array any xids which have not already completed.
+ */
+ nxids = 0;
+ for (i = 0; i < running->xcnt + running->subxcnt; i++)
+ {
+ TransactionId xid = running->xids[i];
+
+ /*
+ * The running-xacts snapshot can contain xids that were still visible
+ * in the procarray when the snapshot was taken, but were already
+ * WAL-logged as completed. They're not running anymore, so ignore
+ * them.
+ */
+ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ continue;
+
+ xids[nxids++] = xid;
+ }
+
+ if (nxids > 0)
+ {
+ if (procArray->numKnownAssignedXids != 0)
+ {
+ LWLockRelease(ProcArrayLock);
+ elog(ERROR, "KnownAssignedXids is not empty");
+ }
+
+ /*
+ * Sort the array so that we can add them safely into
+ * KnownAssignedXids.
+ *
+ * We have to sort them logically, because in KnownAssignedXidsAdd we
+ * call TransactionIdFollowsOrEquals and so on. But we know these XIDs
+ * come from RUNNING_XACTS, which means there are only normal XIDs
+ * from the same epoch, so this is safe.
+ */
+ qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator);
+
+ /*
+ * Add the sorted snapshot into KnownAssignedXids. The running-xacts
+ * snapshot may include duplicated xids because of prepared
+ * transactions, so ignore them.
+ */
+ for (i = 0; i < nxids; i++)
+ {
+ if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i]))
+ {
+ elog(DEBUG1,
+ "found duplicated transaction %u for KnownAssignedXids insertion",
+ xids[i]);
+ continue;
+ }
+ KnownAssignedXidsAdd(xids[i], xids[i], true);
+ }
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ }
+
+ pfree(xids);
+
+ /*
+ * latestObservedXid is at least set to the point where SUBTRANS was
+ * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid
+ * RecordKnownAssignedTransactionIds() was called for. Initialize
+ * subtrans from thereon, up to nextXid - 1.
+ *
+ * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
+ * because we've just added xids to the known assigned xids machinery that
+ * haven't gone through RecordKnownAssignedTransactionId().
+ */
+ Assert(TransactionIdIsNormal(latestObservedXid));
+ TransactionIdAdvance(latestObservedXid);
+ while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
+ {
+ ExtendSUBTRANS(latestObservedXid);
+ TransactionIdAdvance(latestObservedXid);
+ }
+ TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */
+
+ /* ----------
+ * Now we've got the running xids we need to set the global values that
+ * are used to track snapshots as they evolve further.
+ *
+ * - latestCompletedXid which will be the xmax for snapshots
+ * - lastOverflowedXid which shows whether snapshots overflow
+ * - nextXid
+ *
+ * If the snapshot overflowed, then we still initialise with what we know,
+ * but the recovery snapshot isn't fully valid yet because we know there
+ * are some subxids missing. We don't know the specific subxids that are
+ * missing, so conservatively assume the last one is latestObservedXid.
+ * ----------
+ */
+ if (running->subxid_overflow)
+ {
+ standbyState = STANDBY_SNAPSHOT_PENDING;
+
+ standbySnapshotPendingXmin = latestObservedXid;
+ procArray->lastOverflowedXid = latestObservedXid;
+ }
+ else
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+
+ standbySnapshotPendingXmin = InvalidTransactionId;
+ }
+
+ /*
+ * If a transaction wrote a commit record in the gap between taking and
+ * logging the snapshot then latestCompletedXid may already be higher than
+ * the value from the snapshot, so check before we use the incoming value.
+ * It also might not yet be set at all.
+ */
+ MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+
+ /*
+ * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+ * nobody can see it yet.
+ */
+
+ LWLockRelease(ProcArrayLock);
+
+ /* ShmemVariableCache->nextXid must be beyond any observed xid. */
+ AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+
+ Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+}
+
+/*
+ * ProcArrayApplyXidAssignment
+ * Process an XLOG_XACT_ASSIGNMENT WAL record
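+ *
+ * (Background note, not a normative statement of the WAL format: the primary
+ * emits such a record once a transaction has accumulated roughly
+ * PGPROC_MAX_CACHED_SUBXIDS unreported subxids, so that a standby can fold
+ * them into pg_subtrans and prune them from KnownAssignedXids; see the NOTE
+ * below about the record size limit.)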
+ */
+void
+ProcArrayApplyXidAssignment(TransactionId topxid,
+ int nsubxids, TransactionId *subxids)
+{
+ TransactionId max_xid;
+ int i;
+
+ Assert(standbyState >= STANDBY_INITIALIZED);
+
+ max_xid = TransactionIdLatest(topxid, nsubxids, subxids);
+
+ /*
+ * Mark all the subtransactions as observed.
+ *
+ * NOTE: This will fail if the subxids contain too many previously
+ * unobserved xids to fit into known-assigned-xids. That shouldn't happen
+ * as the code stands, because xid-assignment records should never contain
+ * more than PGPROC_MAX_CACHED_SUBXIDS entries.
+ */
+ RecordKnownAssignedTransactionIds(max_xid);
+
+ /*
+ * Notice that we update pg_subtrans with the top-level xid, rather than
+ * the parent xid. This is a difference between normal processing and
+ * recovery, yet is still correct in all cases. The reason is that
+ * subtransaction commit is not marked in clog until commit processing, so
+ * all aborted subtransactions have already been clearly marked in clog.
+ * As a result we are able to refer directly to the top-level
+ * transaction's state rather than skipping through all the intermediate
+ * states in the subtransaction tree. This should be the first time we
+ * have attempted to SubTransSetParent().
+ */
+ for (i = 0; i < nsubxids; i++)
+ SubTransSetParent(subxids[i], topxid);
+
+ /* KnownAssignedXids isn't maintained yet, so we're done for now */
+ if (standbyState == STANDBY_INITIALIZED)
+ return;
+
+ /*
+ * Uses same locking as transaction commit
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove subxids from known-assigned-xacts.
+ */
+ KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
+
+ /*
+ * Advance lastOverflowedXid to be at least the last of these subxids.
+ */
+ if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid))
+ procArray->lastOverflowedXid = max_xid;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * TransactionIdIsInProgress -- is given transaction running in some backend
+ *
+ * Aside from some shortcuts such as checking RecentXmin and our own Xid,
+ * there are four possibilities for finding a running transaction:
+ *
+ * 1. The given Xid is a main transaction Id. We will find this out cheaply
+ * by looking at ProcGlobal->xids.
+ *
+ * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
+ * We can find this out cheaply too.
+ *
+ * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
+ * if the Xid is running on the primary.
+ *
+ * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
+ * if that is running according to ProcGlobal->xids[] or KnownAssignedXids.
+ * This is the slowest way, but sadly it has to be done always if the others
+ * failed, unless we see that the cached subxact sets are complete (none have
+ * overflowed).
+ *
+ * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
+ * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
+ * This buys back some concurrency (and we can't retrieve the main Xids from
+ * ProcGlobal->xids[] again anyway; see GetNewTransactionId).
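+ *
+ * As a worked example (illustrative, not exhaustive): if the xid being probed
+ * is a subtransaction whose parent's subxid cache has overflowed, steps 1-3
+ * find nothing; step 4 then maps the xid to its topmost parent via
+ * SubTransGetTopmostTransaction() and checks that parent against the main
+ * Xids remembered while scanning the array.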
+ */
+bool
+TransactionIdIsInProgress(TransactionId xid)
+{
+ static TransactionId *xids = NULL;
+ static TransactionId *other_xids;
+ XidCacheStatus *other_subxidstates;
+ int nxids = 0;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId topxid;
+ TransactionId latestCompletedXid;
+ int mypgxactoff;
+ int numProcs;
+ int j;
+
+ /*
+ * Don't bother checking a transaction older than RecentXmin; it could not
+ * possibly still be running. (Note: in particular, this guarantees that
+ * we reject InvalidTransactionId, FrozenTransactionId, etc as not
+ * running.)
+ */
+ if (TransactionIdPrecedes(xid, RecentXmin))
+ {
+ xc_by_recent_xmin_inc();
+ return false;
+ }
+
+ /*
+ * We may have just checked the status of this transaction, so if it is
+ * already known to be completed, we can fall out without any access to
+ * shared memory.
+ */
+ if (TransactionIdEquals(cachedXidIsNotInProgress, xid))
+ {
+ xc_by_known_xact_inc();
+ return false;
+ }
+
+ /*
+ * Also, we can handle our own transaction (and subtransactions) without
+ * any access to shared memory.
+ */
+ if (TransactionIdIsCurrentTransactionId(xid))
+ {
+ xc_by_my_xact_inc();
+ return true;
+ }
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead.
+ */
+ if (xids == NULL)
+ {
+ /*
+ * In hot standby mode, reserve enough space to hold all xids in the
+ * known-assigned list. If we later finish recovery, we no longer need
+ * the bigger array, but we don't bother to shrink it.
+ */
+ int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
+
+ xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
+ if (xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ other_xids = ProcGlobal->xids;
+ other_subxidstates = ProcGlobal->subxidStates;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ /*
+ * Now that we have the lock, we can check latestCompletedXid; if the
+ * target Xid is after that, it's surely still running.
+ */
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ if (TransactionIdPrecedes(latestCompletedXid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_latest_xid_inc();
+ return true;
+ }
+
+ /* No shortcuts, gotta grovel through the array */
+ mypgxactoff = MyProc->pgxactoff;
+ numProcs = arrayP->numProcs;
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ int pgprocno;
+ PGPROC *proc;
+ TransactionId pxid;
+ int pxids;
+
+ /* Ignore ourselves --- dealt with it above */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+
+ if (!TransactionIdIsValid(pxid))
+ continue;
+
+ /*
+ * Step 1: check the main Xid
+ */
+ if (TransactionIdEquals(pxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_main_xid_inc();
+ return true;
+ }
+
+ /*
+ * We can ignore main Xids that are younger than the target Xid, since
+ * the target could not possibly be their child.
+ */
+ if (TransactionIdPrecedes(xid, pxid))
+ continue;
+
+ /*
+ * Step 2: check the cached child-Xids arrays
+ */
+ pxids = other_subxidstates[pgxactoff].count;
+ pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */
+ pgprocno = arrayP->pgprocnos[pgxactoff];
+ proc = &allProcs[pgprocno];
+ for (j = pxids - 1; j >= 0; j--)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]);
+
+ if (TransactionIdEquals(cxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_child_xid_inc();
+ return true;
+ }
+ }
+
+ /*
+ * Save the main Xid for step 4. We only need to remember main Xids
+ * that have uncached children. (Note: there is no race condition
+ * here because the overflowed flag cannot be cleared, only set, while
+ * we hold ProcArrayLock. So we can't miss an Xid that we need to
+ * worry about.)
+ */
+ if (other_subxidstates[pgxactoff].overflowed)
+ xids[nxids++] = pxid;
+ }
+
+ /*
+ * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs
+ * in the list must be treated as running.
+ */
+ if (RecoveryInProgress())
+ {
+ /* none of the PGPROC entries should have XIDs in hot standby mode */
+ Assert(nxids == 0);
+
+ if (KnownAssignedXidExists(xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_known_assigned_inc();
+ return true;
+ }
+
+ /*
+ * If the KnownAssignedXids overflowed, we have to check pg_subtrans
+ * too. Fetch all xids from KnownAssignedXids that are lower than
+ * xid, since if xid is a subtransaction its parent will always have a
+ * lower value. Note we will collect both main and subXIDs here, but
+ * there's no help for it.
+ */
+ if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
+ nxids = KnownAssignedXidsGet(xids, xid);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * If none of the relevant caches overflowed, we know the Xid is not
+ * running without even looking at pg_subtrans.
+ */
+ if (nxids == 0)
+ {
+ xc_no_overflow_inc();
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * Step 4: have to check pg_subtrans.
+ *
+ * At this point, we know it's either a subtransaction of one of the Xids
+ * in xids[], or it's not running. If it's an already-failed
+ * subtransaction, we want to say "not running" even though its parent may
+ * still be running. So first, check pg_xact to see if it's been aborted.
+ */
+ xc_slow_answer_inc();
+
+ if (TransactionIdDidAbort(xid))
+ {
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * It isn't aborted, so check whether the transaction tree it belongs to
+ * is still running (or, more precisely, whether it was running when we
+ * held ProcArrayLock).
+ */
+ topxid = SubTransGetTopmostTransaction(xid);
+ Assert(TransactionIdIsValid(topxid));
+ if (!TransactionIdEquals(topxid, xid) &&
+ pg_lfind32(topxid, xids, nxids))
+ return true;
+
+ cachedXidIsNotInProgress = xid;
+ return false;
+}
+
+/*
+ * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ *
+ * This differs from TransactionIdIsInProgress in that it ignores prepared
+ * transactions, as well as transactions running on the primary if we're in
+ * hot standby. Also, we ignore subtransactions since that's not needed
+ * for current uses.
+ */
+bool
+TransactionIdIsActive(TransactionId xid)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ int i;
+
+ /*
+ * Don't bother checking a transaction older than RecentXmin; it could not
+ * possibly still be running.
+ */
+ if (TransactionIdPrecedes(xid, RecentXmin))
+ return false;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (i = 0; i < arrayP->numProcs; i++)
+ {
+ int pgprocno = arrayP->pgprocnos[i];
+ PGPROC *proc = &allProcs[pgprocno];
+ TransactionId pxid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ pxid = UINT32_ACCESS_ONCE(other_xids[i]);
+
+ if (!TransactionIdIsValid(pxid))
+ continue;
+
+ if (proc->pid == 0)
+ continue; /* ignore prepared transactions */
+
+ if (TransactionIdEquals(pxid, xid))
+ {
+ result = true;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+
+/*
+ * Determine XID horizons.
+ *
+ * This is used by wrapper functions like GetOldestNonRemovableTransactionId()
+ * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as
+ * well as "internally" by GlobalVisUpdate() (see comment above struct
+ * GlobalVisState).
+ *
+ * See the definition of ComputeXidHorizonsResult for the various computed
+ * horizons.
+ *
+ * For VACUUM, separate horizons (used to decide which deleted tuples must
+ * be preserved) are computed for shared and non-shared tables. For shared
+ * relations, backends in all databases must be considered, but for non-shared
+ * relations that's not required, since only backends in my own database could
+ * ever see the tuples in them. Also, we can ignore concurrently running lazy
+ * VACUUMs because (a) they must be working on other tables, and (b) they
+ * don't need to do snapshot-based lookups.
+ *
+ * This also computes a horizon used to truncate pg_subtrans. For that
+ * backends in all databases have to be considered, and concurrently running
+ * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans
+ * accesses.
+ *
+ * Note: we include all currently running xids in the set of considered xids.
+ * This ensures that if a just-started xact has not yet set its snapshot,
+ * when it does set the snapshot it cannot set xmin less than what we compute.
+ * See notes in src/backend/access/transam/README.
+ *
+ * Note: despite the above, it's possible for the calculated values to move
+ * backwards on repeated calls. The calculated values are conservative, so
+ * that anything older is definitely not considered as running by anyone
+ * anymore, but the exact values calculated depend on a number of things. For
+ * example, if there are no transactions running in the current database, the
+ * horizon for normal tables will be latestCompletedXid. If a transaction
+ * begins after that, its xmin will include in-progress transactions in other
+ * databases that started earlier, so another call will return a lower value.
+ * Nonetheless it is safe to vacuum a table in the current database with the
+ * first result. There are also replication-related effects: a walsender
+ * process can set its xmin based on transactions that are no longer running
+ * on the primary but are still being replayed on the standby, thus possibly
+ * making the values go backwards. In this case there is a possibility that
+ * we lose data that the standby would like to have, but unless the standby
+ * uses a replication slot to make its xmin persistent there is little we can
+ * do about that --- data is only protected if the walsender runs continuously
+ * while queries are executed on the standby. (The Hot Standby code deals
+ * with such cases by failing standby queries that needed to access
+ * already-removed data, so there's no integrity bug.)
+ *
+ * Note: the approximate horizons (see definition of GlobalVisState) are
+ * updated by the computations done here. That's currently required for
+ * correctness and a small optimization. Without doing so it's possible that
+ * heap vacuum's call to heap_page_prune() uses a more conservative horizon
+ * than later when deciding which tuples can be removed - which the code
+ * doesn't expect (breaking HOT).
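+ *
+ * In summary, the computed horizons always satisfy the invariants enforced by
+ * the Asserts at the end of this function: oldest_considered_running precedes
+ * or equals every other computed horizon, and shared_oldest_nonremovable
+ * precedes or equals both the catalog and data horizons.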
+ */
+static void
+ComputeXidHorizons(ComputeXidHorizonsResult *h)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId kaxmin;
+ bool in_recovery = RecoveryInProgress();
+ TransactionId *other_xids = ProcGlobal->xids;
+
+ /* inferred after ProcArrayLock is released */
+ h->catalog_oldest_nonremovable = InvalidTransactionId;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ h->latest_completed = ShmemVariableCache->latestCompletedXid;
+
+ /*
+ * We initialize the MIN() calculation with latestCompletedXid + 1. This
+ * is a lower bound for the XIDs that might appear in the ProcArray later,
+ * and so protects us against overestimating the result due to future
+ * additions.
+ */
+ {
+ TransactionId initial;
+
+ initial = XidFromFullTransactionId(h->latest_completed);
+ Assert(TransactionIdIsValid(initial));
+ TransactionIdAdvance(initial);
+
+ h->oldest_considered_running = initial;
+ h->shared_oldest_nonremovable = initial;
+ h->data_oldest_nonremovable = initial;
+
+ /*
+ * Only modifications made by this backend affect the horizon for
+ * temporary relations. Instead of a check in each iteration of the
+ * loop over all PGPROCs, it is cheaper to just initialize to the
+ * current top-level xid, if any.
+ *
+ * Without an assigned xid we could use a horizon as aggressive as
+ * GetNewTransactionId(), but we can get away with the much cheaper
+ * latestCompletedXid + 1: if this backend has no xid, then by definition
+ * there can't be any newer changes in its temp tables than
+ * latestCompletedXid.
+ */
+ if (TransactionIdIsValid(MyProc->xid))
+ h->temp_oldest_nonremovable = MyProc->xid;
+ else
+ h->temp_oldest_nonremovable = initial;
+ }
+
+ /*
+ * Fetch slot horizons while ProcArrayLock is held - the
+ * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside
+ * the lock.
+ */
+ h->slot_xmin = procArray->replication_slot_xmin;
+ h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ for (int index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int8 statusFlags = ProcGlobal->statusFlags[index];
+ TransactionId xid;
+ TransactionId xmin;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+ xmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * Consider both the transaction's Xmin, and its Xid.
+ *
+ * We must check both because a transaction might have an Xmin but not
+ * (yet) an Xid; conversely, if it has an Xid, that could determine
+ * some not-yet-set Xmin.
+ */
+ xmin = TransactionIdOlder(xmin, xid);
+
+ /* if neither is set, this proc doesn't influence the horizon */
+ if (!TransactionIdIsValid(xmin))
+ continue;
+
+ /*
+ * Don't ignore any procs when determining which transactions might be
+ * considered running. While slots should ensure logical decoding
+ * backends are protected even without this check, it can't hurt to
+ * include them here as well.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, xmin);
+
+ /*
+ * Skip over backends either vacuuming (which is ok with rows being
+ * removed, as long as pg_subtrans is not truncated) or doing logical
+ * decoding (which manages xmin separately, check below).
+ */
+ if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+ continue;
+
+ /* shared tables need to take backends in all databases into account */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, xmin);
+
+ /*
+ * Normally sessions in other databases are ignored for anything but
+ * the shared horizon.
+ *
+ * However, include them when MyDatabaseId is not (yet) set. A
+ * backend in the process of starting up must not compute a "too
+ * aggressive" horizon, otherwise we could end up using it to prune
+ * still-needed data away. If the current backend never connects to a
+ * database this is harmless, because data_oldest_nonremovable will
+ * never be utilized.
+ *
+ * Also, sessions marked with PROC_AFFECTS_ALL_HORIZONS should always
+ * be included. (This flag is used for hot standby feedback, which
+ * can't be tied to a specific database.)
+ *
+ * Also, while in recovery we cannot compute an accurate per-database
+ * horizon, as all xids are managed via the KnownAssignedXids
+ * machinery.
+ */
+ if (proc->databaseId == MyDatabaseId ||
+ MyDatabaseId == InvalidOid ||
+ (statusFlags & PROC_AFFECTS_ALL_HORIZONS) ||
+ in_recovery)
+ {
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, xmin);
+ }
+ }
+
+ /*
+ * If in recovery fetch oldest xid in KnownAssignedXids, will be applied
+ * after lock is released.
+ */
+ if (in_recovery)
+ kaxmin = KnownAssignedXidsGetOldestXmin();
+
+ /*
+ * No other information from shared state is needed, release the lock
+ * immediately. The rest of the computations can be done without a lock.
+ */
+ LWLockRelease(ProcArrayLock);
+
+ if (in_recovery)
+ {
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, kaxmin);
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, kaxmin);
+ /* temp relations cannot be accessed in recovery */
+ }
+
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->shared_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->data_oldest_nonremovable));
+
+ /*
+ * Check whether there are replication slots requiring an older xmin.
+ */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin);
+
+ /*
+ * The only difference between catalog / data horizons is that the slot's
+ * catalog xmin is applied to the catalog one (so catalogs can be accessed
+ * for logical decoding). Initialize with data horizon, and then back up
+ * further if necessary. Have to back up the shared horizon as well, since
+ * shared relations also include catalogs.
+ */
+ h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable;
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable,
+ h->slot_catalog_xmin);
+ h->catalog_oldest_nonremovable = h->data_oldest_nonremovable;
+ h->catalog_oldest_nonremovable =
+ TransactionIdOlder(h->catalog_oldest_nonremovable,
+ h->slot_catalog_xmin);
+
+ /*
+ * It's possible that slots backed up the horizons further than
+ * oldest_considered_running. Fix.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->shared_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->data_oldest_nonremovable);
+
+ /*
+ * shared horizons have to be at least as old as the oldest visible in
+ * current db
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->catalog_oldest_nonremovable));
+
+ /*
+ * Horizons need to ensure that pg_subtrans access is still possible for
+ * the relevant backends.
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->shared_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->temp_oldest_nonremovable));
+ Assert(!TransactionIdIsValid(h->slot_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_xmin));
+ Assert(!TransactionIdIsValid(h->slot_catalog_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_catalog_xmin));
+
+ /* update approximate horizons with the computed horizons */
+ GlobalVisUpdateApply(h);
+}
+
+/*
+ * Determine what kind of visibility horizon needs to be used for a
+ * relation. If rel is NULL, the most conservative horizon is used.
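+ *
+ * For example (illustrative, and ignoring the recovery case where everything
+ * falls back to the shared horizon): a shared catalog such as pg_database
+ * maps to VISHORIZON_SHARED, a plain catalog such as pg_class to
+ * VISHORIZON_CATALOG, an ordinary user table to VISHORIZON_DATA, and a
+ * temporary table to VISHORIZON_TEMP.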
+ */
+static inline GlobalVisHorizonKind
+GlobalVisHorizonKindForRel(Relation rel)
+{
+ /*
+ * Other relkinds currently don't contain xids, nor do they always carry
+ * the necessary logical decoding markers.
+ */
+ Assert(!rel ||
+ rel->rd_rel->relkind == RELKIND_RELATION ||
+ rel->rd_rel->relkind == RELKIND_MATVIEW ||
+ rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+ if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress())
+ return VISHORIZON_SHARED;
+ else if (IsCatalogRelation(rel) ||
+ RelationIsAccessibleInLogicalDecoding(rel))
+ return VISHORIZON_CATALOG;
+ else if (!RELATION_IS_LOCAL(rel))
+ return VISHORIZON_DATA;
+ else
+ return VISHORIZON_TEMP;
+}
+
+/*
+ * Return the oldest XID for which deleted tuples must be preserved in the
+ * passed table.
+ *
+ * If rel is not NULL the horizon may be considerably more recent than
+ * otherwise (i.e. more tuples will be removable). In the NULL case a horizon
+ * that is correct (but not optimal) for all relations will be returned.
+ *
+ * This is used by VACUUM to decide which deleted tuples must be preserved in
+ * the passed in table.
+ */
+TransactionId
+GetOldestNonRemovableTransactionId(Relation rel)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ switch (GlobalVisHorizonKindForRel(rel))
+ {
+ case VISHORIZON_SHARED:
+ return horizons.shared_oldest_nonremovable;
+ case VISHORIZON_CATALOG:
+ return horizons.catalog_oldest_nonremovable;
+ case VISHORIZON_DATA:
+ return horizons.data_oldest_nonremovable;
+ case VISHORIZON_TEMP:
+ return horizons.temp_oldest_nonremovable;
+ }
+
+ /* just to prevent compiler warnings */
+ return InvalidTransactionId;
+}
+
+/*
+ * Return the oldest transaction id any currently running backend might still
+ * consider running. This should not be used for visibility / pruning
+ * determinations (see GetOldestNonRemovableTransactionId()), but for
+ * decisions like up to where pg_subtrans can be truncated.
+ */
+TransactionId
+GetOldestTransactionIdConsideredRunning(void)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ return horizons.oldest_considered_running;
+}
+
+/*
+ * Return the visibility horizons for a hot standby feedback message.
+ */
+void
+GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin)
+{
+ ComputeXidHorizonsResult horizons;
+
+ ComputeXidHorizons(&horizons);
+
+ /*
+ * Don't want to use shared_oldest_nonremovable here, as that contains the
+ * effect of replication slot's catalog_xmin. We want to send a separate
+ * feedback for the catalog horizon, so the primary can remove data table
+ * contents more aggressively.
+ */
+ *xmin = horizons.shared_oldest_nonremovable_raw;
+ *catalog_xmin = horizons.slot_catalog_xmin;
+}
+
+/*
+ * GetMaxSnapshotXidCount -- get max size for snapshot XID array
+ *
+ * We have to export this for use by snapmgr.c.
+ */
+int
+GetMaxSnapshotXidCount(void)
+{
+ return procArray->maxProcs;
+}
+
+/*
+ * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array
+ *
+ * We have to export this for use by snapmgr.c.
+ */
+int
+GetMaxSnapshotSubxidCount(void)
+{
+ return TOTAL_MAX_CACHED_SUBXIDS;
+}
+
+/*
+ * Initialize old_snapshot_threshold specific parts of a newly built snapshot.
+ */
+static void
+GetSnapshotDataInitOldSnapshot(Snapshot snapshot)
+{
+ if (!OldSnapshotThresholdActive())
+ {
+ /*
+ * If not using "snapshot too old" feature, fill related fields with
+ * dummy values that don't require any locking.
+ */
+ snapshot->lsn = InvalidXLogRecPtr;
+ snapshot->whenTaken = 0;
+ }
+ else
+ {
+ /*
+ * Capture the current time and WAL stream location in case this
+ * snapshot becomes old enough to need to fall back on the special
+ * "old snapshot" logic.
+ */
+ snapshot->lsn = GetXLogInsertRecPtr();
+ snapshot->whenTaken = GetSnapshotCurrentTimestamp();
+ MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin);
+ }
+}
+
+/*
+ * Helper function for GetSnapshotData() that checks if the bulk of the
+ * visibility information in the snapshot is still valid. If so, it updates
+ * the fields that need to change and returns true. Otherwise it returns
+ * false.
+ *
+ * This very likely can be evolved to not need ProcArrayLock held (at very
+ * least in the case we already hold a snapshot), but that's for another day.
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+ uint64 curXactCompletionCount;
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ if (unlikely(snapshot->snapXactCompletionCount == 0))
+ return false;
+
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+ if (curXactCompletionCount != snapshot->snapXactCompletionCount)
+ return false;
+
+ /*
+ * If the current xactCompletionCount is still the same as it was at the
+ * time the snapshot was built, we can be sure that rebuilding the
+ * contents of the snapshot the hard way would result in the same snapshot
+ * contents:
+ *
+ * As explained in transam/README, the set of xids considered running by
+ * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot
+ * contents only depend on transactions with xids and xactCompletionCount
+ * is incremented whenever a transaction with an xid finishes (while
+ * holding ProcArrayLock exclusively). Thus the xactCompletionCount check
+ * ensures we would detect if the snapshot would have changed.
+ *
+ * As the snapshot contents are the same as they were before, it is safe to
+ * re-enter the snapshot's xmin into the PGPROC array. None of the rows
+ * visible under the snapshot could already have been removed (that'd
+ * require the set of running transactions to change) and it fulfills the
+ * requirement that concurrent GetSnapshotData() calls yield the same
+ * xmin.
+ */
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+ RecentXmin = snapshot->xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->curcid = GetCurrentCommandId(false);
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return true;
+}
+
+/*
+ * GetSnapshotData -- returns information about running transactions.
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (highest completed xact ID + 1), and a list of running xact IDs
+ * in the range xmin <= xid < xmax. It is used as follows:
+ * All xact IDs < xmin are considered finished.
+ * All xact IDs >= xmax are considered still running.
+ * For an xact ID xmin <= xid < xmax, consult list to see whether
+ * it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
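+ *
+ * As an illustrative example of these rules: with xmin = 100, xmax = 105 and
+ * a list containing {101, 103}, xids 99 and below are seen as finished, 101
+ * and 103 as running, 102 and 104 as finished (in range but not listed), and
+ * 105 or anything later as still running.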
+ *
+ * All running top-level XIDs are included in the snapshot, except for lazy
+ * VACUUM processes. We also try to include running subtransaction XIDs,
+ * but since PGPROC has only a limited cache area for subxact XIDs, full
+ * information may not be available. If we find any overflowed subxid arrays,
+ * we have to mark the snapshot's subxid data as overflowed, and extra work
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()).
+ *
+ * We also update the following backend-global variables:
+ * TransactionXmin: the oldest xmin of any snapshot in use in the
+ * current transaction (this is the same as MyProc->xmin).
+ * RecentXmin: the xmin computed for the most recent snapshot. XIDs
+ * older than this are known not running any more.
+ *
+ * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels
+ * for the benefit of the GlobalVisTest* family of functions.
+ *
+ * Note: this function should probably not be called with an argument that's
+ * not statically allocated (see xip allocation below).
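+ *
+ * A minimal calling sketch (hedged; MySnapshotData is just an illustrative
+ * name, and real callers normally go through snapmgr.c rather than calling
+ * this directly):
+ *
+ *		static SnapshotData MySnapshotData = {SNAPSHOT_MVCC};
+ *		...
+ *		Snapshot snap = GetSnapshotData(&MySnapshotData);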
+ */
+Snapshot
+GetSnapshotData(Snapshot snapshot)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ TransactionId xmin;
+ TransactionId xmax;
+ int count = 0;
+ int subcount = 0;
+ bool suboverflowed = false;
+ FullTransactionId latest_completed;
+ TransactionId oldestxid;
+ int mypgxactoff;
+ TransactionId myxid;
+ uint64 curXactCompletionCount;
+
+ TransactionId replication_slot_xmin = InvalidTransactionId;
+ TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+
+ Assert(snapshot != NULL);
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * This does open a possibility for avoiding repeated malloc/free: since
+ * maxProcs does not change at runtime, we can simply reuse the previous
+ * xip arrays if any. (This relies on the fact that all callers pass
+ * static SnapshotData structs.)
+ */
+ if (snapshot->xip == NULL)
+ {
+ /*
+ * First call for this snapshot. Snapshot is same size whether or not
+ * we are in recovery, see later comments.
+ */
+ snapshot->xip = (TransactionId *)
+ malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ Assert(snapshot->subxip == NULL);
+ snapshot->subxip = (TransactionId *)
+ malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
+ if (snapshot->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /*
+ * It is sufficient to get shared lock on ProcArrayLock, even if we are
+ * going to set MyProc->xmin.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (GetSnapshotDataReuse(snapshot))
+ {
+ LWLockRelease(ProcArrayLock);
+ return snapshot;
+ }
+
+ latest_completed = ShmemVariableCache->latestCompletedXid;
+ mypgxactoff = MyProc->pgxactoff;
+ myxid = other_xids[mypgxactoff];
+ Assert(myxid == MyProc->xid);
+
+ oldestxid = ShmemVariableCache->oldestXid;
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+
+ /* xmax is always latestCompletedXid + 1 */
+ xmax = XidFromFullTransactionId(latest_completed);
+ TransactionIdAdvance(xmax);
+ Assert(TransactionIdIsNormal(xmax));
+
+ /* initialize xmin calculation with xmax */
+ xmin = xmax;
+
+ /* take own xid into account, saves a check inside the loop */
+ if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin))
+ xmin = myxid;
+
+ snapshot->takenDuringRecovery = RecoveryInProgress();
+
+ if (!snapshot->takenDuringRecovery)
+ {
+ int numProcs = arrayP->numProcs;
+ TransactionId *xip = snapshot->xip;
+ int *pgprocnos = arrayP->pgprocnos;
+ XidCacheStatus *subxidStates = ProcGlobal->subxidStates;
+ uint8 *allStatusFlags = ProcGlobal->statusFlags;
+
+ /*
+ * First collect set of pgxactoff/xids that need to be included in the
+ * snapshot.
+ */
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+ uint8 statusFlags;
+
+ Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff);
+
+ /*
+ * If the transaction has no XID assigned, we can skip it; it
+ * won't have sub-XIDs either.
+ */
+ if (likely(xid == InvalidTransactionId))
+ continue;
+
+ /*
+ * We don't include our own XIDs (if any) in the snapshot. It
+ * needs to be included in the xmin computation, but we did so
+ * outside the loop.
+ */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /*
+ * The only way we are able to get here with a non-normal xid is
+ * during bootstrap - with this backend using
+ * BootstrapTransactionId. But the above test should filter that
+ * out.
+ */
+ Assert(TransactionIdIsNormal(xid));
+
+ /*
+ * If the XID is >= xmax, we can skip it; such transactions will
+ * be treated as running anyway (and any sub-XIDs will also be >=
+ * xmax).
+ */
+ if (!NormalTransactionIdPrecedes(xid, xmax))
+ continue;
+
+ /*
+ * Skip over backends doing logical decoding which manages xmin
+ * separately (check below) and ones running LAZY VACUUM.
+ */
+ statusFlags = allStatusFlags[pgxactoff];
+ if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM))
+ continue;
+
+ if (NormalTransactionIdPrecedes(xid, xmin))
+ xmin = xid;
+
+ /* Add XID to snapshot. */
+ xip[count++] = xid;
+
+ /*
+ * Save subtransaction XIDs if possible (if we've already
+ * overflowed, there's no point). Note that the subxact XIDs must
+ * be later than their parent, so no need to check them against
+ * xmin. We could filter against xmax, but it seems better not to
+ * do that much work while holding the ProcArrayLock.
+ *
+ * The other backend can add more subxids concurrently, but cannot
+ * remove any. Hence it's important to fetch nxids just once.
+ * Should be safe to use memcpy, though. (We needn't worry about
+ * missing any xids added concurrently, because they must postdate
+ * xmax.)
+ *
+ * Again, our own XIDs are not included in the snapshot.
+ */
+ if (!suboverflowed)
+ {
+ if (subxidStates[pgxactoff].overflowed)
+ suboverflowed = true;
+ else
+ {
+ int nsubxids = subxidStates[pgxactoff].count;
+
+ if (nsubxids > 0)
+ {
+ int pgprocno = pgprocnos[pgxactoff];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(snapshot->subxip + subcount,
+ proc->subxids.xids,
+ nsubxids * sizeof(TransactionId));
+ subcount += nsubxids;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ /*
+ * We're in hot standby, so get XIDs from KnownAssignedXids.
+ *
+ * We store all xids directly into subxip[]. Here's why:
+ *
+ * In recovery we don't know which xids are top-level and which are
+ * subxacts, a design choice that greatly simplifies xid processing.
+ *
+ * It seems like we would want to try to put xids into xip[] only, but
+ * that is fairly small. We would either need to make that bigger or
+ * to increase the rate at which we WAL-log xid assignment; neither is
+ * an appealing choice.
+ *
+ * We could try to store xids into xip[] first and then into subxip[]
+ * if there are too many xids. That only works if the snapshot doesn't
+ * overflow because we do not search subxip[] in that case. A simpler
+ * way is to just store all xids in the subxip array because this is
+ * by far the bigger array. We just leave the xip array empty.
+ *
+ * Either way we need to change the way XidInMVCCSnapshot() works
+ * depending upon when the snapshot was taken, or change normal
+ * snapshot processing so it matches.
+ *
+ * Note: It is possible for recovery to end before we finish taking
+ * the snapshot, and for newly assigned transaction ids to be added to
+ * the ProcArray. xmax cannot change while we hold ProcArrayLock, so
+ * those newly added transaction ids would be filtered away, so we
+ * need not be concerned about them.
+ */
+ subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
+ xmax);
+
+ if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
+ suboverflowed = true;
+ }
+
+ /*
+ * Fetch into local variable while ProcArrayLock is held - the
+ * LWLockRelease below is a barrier, ensuring this happens inside the
+ * lock.
+ */
+ replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = xmin;
+
+ LWLockRelease(ProcArrayLock);
+
+ /* maintain state for GlobalVis* */
+ {
+ TransactionId def_vis_xid;
+ TransactionId def_vis_xid_data;
+ FullTransactionId def_vis_fxid;
+ FullTransactionId def_vis_fxid_data;
+ FullTransactionId oldestfxid;
+
+ /*
+ * Converting oldestXid is only safe when xid horizon cannot advance,
+ * i.e. holding locks. While we don't hold the lock anymore, all the
+ * necessary data has been gathered with lock held.
+ */
+ oldestfxid = FullXidRelativeTo(latest_completed, oldestxid);
+
+ /* Check whether there's a replication slot requiring an older xmin. */
+ def_vis_xid_data =
+ TransactionIdOlder(xmin, replication_slot_xmin);
+
+ /*
+ * Rows in non-shared, non-catalog tables possibly could be vacuumed
+ * if older than this xid.
+ */
+ def_vis_xid = def_vis_xid_data;
+
+ /*
+ * Check whether there's a replication slot requiring an older catalog
+ * xmin.
+ */
+ def_vis_xid =
+ TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid);
+
+ def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid);
+ def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data);
+
+ /*
+ * Check if we can increase upper bound. As a previous
+ * GlobalVisUpdate() might have computed more aggressive values, don't
+ * overwrite them if so.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid_data,
+ GlobalVisDataRels.definitely_needed);
+ /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */
+ if (TransactionIdIsNormal(myxid))
+ GlobalVisTempRels.definitely_needed =
+ FullXidRelativeTo(latest_completed, myxid);
+ else
+ {
+ GlobalVisTempRels.definitely_needed = latest_completed;
+ FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed);
+ }
+
+ /*
+ * Check if we know that we can initialize or increase the lower
+ * bound. Currently the only cheap way to do so is to use
+ * ShmemVariableCache->oldestXid as input.
+ *
+ * We should definitely be able to do better. We could e.g. put a
+ * global lower bound value into ShmemVariableCache.
+ */
+ GlobalVisSharedRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ oldestfxid);
+ GlobalVisCatalogRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ oldestfxid);
+ GlobalVisDataRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ oldestfxid);
+ /* accurate value known */
+ GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed;
+ }
+
+ RecentXmin = xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->xmin = xmin;
+ snapshot->xmax = xmax;
+ snapshot->xcnt = count;
+ snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
+ snapshot->snapXactCompletionCount = curXactCompletionCount;
+
+ snapshot->curcid = GetCurrentCommandId(false);
+
+ /*
+ * This is a new snapshot, so set both refcounts to zero, and mark it as
+ * not copied in persistent memory.
+ */
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return snapshot;
+}
+
+/*
+ * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin
+ *
+ * This is called when installing a snapshot imported from another
+ * transaction. To ensure that OldestXmin doesn't go backwards, we must
+ * check that the source transaction is still running, and we'd better do
+ * that atomically with installing the new xmin.
+ *
+ * Returns true if successful, false if source xact is no longer running.
+ */
+bool
+ProcArrayInstallImportedXmin(TransactionId xmin,
+ VirtualTransactionId *sourcevxid)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ Assert(TransactionIdIsNormal(xmin));
+ if (!sourcevxid)
+ return false;
+
+ /* Get lock so source xact can't end while we're doing this */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int statusFlags = ProcGlobal->statusFlags[index];
+ TransactionId xid;
+
+ /* Ignore procs running LAZY VACUUM */
+ if (statusFlags & PROC_IN_VACUUM)
+ continue;
+
+ /* We are only interested in the specific virtual transaction. */
+ if (proc->backendId != sourcevxid->backendId)
+ continue;
+ if (proc->lxid != sourcevxid->localTransactionId)
+ continue;
+
+ /*
+ * We check the transaction's database ID for paranoia's sake: if it's
+ * in another DB then its xmin does not cover us. Caller should have
+ * detected this already, so we just treat any funny cases as
+ * "transaction not found".
+ */
+ if (proc->databaseId != MyDatabaseId)
+ continue;
+
+ /*
+ * Likewise, let's just make real sure its xmin does cover us.
+ */
+ xid = UINT32_ACCESS_ONCE(proc->xmin);
+ if (!TransactionIdIsNormal(xid) ||
+ !TransactionIdPrecedesOrEquals(xid, xmin))
+ continue;
+
+ /*
+ * We're good. Install the new xmin. As in GetSnapshotData, set
+ * TransactionXmin too. (Note that because snapmgr.c called
+ * GetSnapshotData first, we'll be overwriting a valid xmin here, so
+ * we don't check that.)
+ */
+ MyProc->xmin = TransactionXmin = xmin;
+
+ result = true;
+ break;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin
+ *
+ * This is like ProcArrayInstallImportedXmin, but we have a pointer to the
+ * PGPROC of the transaction from which we imported the snapshot, rather than
+ * an XID.
+ *
+ * Note that this function also copies statusFlags from the source `proc` in
+ * order to avoid the case where MyProc's xmin needs to be skipped for
+ * computing xid horizon.
+ *
+ * Returns true if successful, false if source xact is no longer running.
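+ *
+ * (Hedged note: the expected caller is the code path that restores a
+ * serialized snapshot in a parallel worker, via snapmgr.c; this routine
+ * itself doesn't care who calls it.)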
+ */
+bool
+ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
+{
+ bool result = false;
+ TransactionId xid;
+
+ Assert(TransactionIdIsNormal(xmin));
+ Assert(proc != NULL);
+
+ /*
+ * Get an exclusive lock so that we can copy statusFlags from source proc.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Be certain that the referenced PGPROC has an advertised xmin which is
+ * no later than the one we're installing, so that the system-wide xmin
+ * can't go backwards. Also, make sure it's running in the same database,
+ * so that the per-database xmin cannot go backwards.
+ */
+ xid = UINT32_ACCESS_ONCE(proc->xmin);
+ if (proc->databaseId == MyDatabaseId &&
+ TransactionIdIsNormal(xid) &&
+ TransactionIdPrecedesOrEquals(xid, xmin))
+ {
+ /*
+ * Install xmin and propagate the statusFlags that affect how the
+ * value is interpreted by vacuum.
+ */
+ MyProc->xmin = TransactionXmin = xmin;
+ MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) |
+ (proc->statusFlags & PROC_XMIN_FLAGS);
+ ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+
+ result = true;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returns more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes and
+ * prepared transactions.
+ *
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot, and releases the
+ * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
+ * lock is released.
+ *
+ * The returned data structure is statically allocated; caller should not
+ * modify it, and must not assume it is valid past the next call.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * Dummy PGPROCs from prepared transactions are included, meaning that this
+ * may return entries with duplicated TransactionId values coming from
+ * transactions that are finishing their prepare. Nothing is done about
+ * duplicate entries here, so as not to hold ProcArrayLock longer than
+ * necessary.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ *
+ * Note that if any transaction has overflowed its cached subtransactions
+ * then there is no real need to include any subtransactions.
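+ *
+ * The returned data is intended to be WAL-logged as a running-xacts record,
+ * which a standby later feeds into ProcArrayApplyRecoveryInfo() to prime its
+ * KnownAssignedXids machinery.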
+ */
+RunningTransactions
+GetRunningTransactionData(void)
+{
+ /* result workspace */
+ static RunningTransactionsData CurrentRunningXactsData;
+
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
+ TransactionId latestCompletedXid;
+ TransactionId oldestRunningXid;
+ TransactionId *xids;
+ int index;
+ int count;
+ int subcount;
+ bool suboverflowed;
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * Should only be allocated in bgwriter, since only ever executed during
+ * checkpoints.
+ */
+ if (CurrentRunningXacts->xids == NULL)
+ {
+ /*
+ * First call
+ */
+ CurrentRunningXacts->xids = (TransactionId *)
+ malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+ if (CurrentRunningXacts->xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ xids = CurrentRunningXacts->xids;
+
+ count = subcount = 0;
+ suboverflowed = false;
+
+ /*
+ * Ensure that no xids enter or leave the procarray while we obtain
+ * snapshot.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ LWLockAcquire(XidGenLock, LW_SHARED);
+
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ oldestRunningXid =
+ XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * Spin over procArray collecting all xids
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ /*
+ * We don't need to store transactions that don't have a TransactionId
+ * yet because they will not show as running on a standby server.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ /*
+ * Be careful not to exclude any xids before calculating the values of
+ * oldestRunningXid and suboverflowed, since these are used to clean
+ * up transaction information held on standbys.
+ */
+ if (TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ if (ProcGlobal->subxidStates[index].overflowed)
+ suboverflowed = true;
+
+ /*
+ * If we wished to exclude xids this would be the right place for it.
+ * Procs with the PROC_IN_VACUUM flag set don't usually assign xids,
+ * but they do during truncation at the end when they get the lock and
+ * truncate, so it is not much of a problem to include them if they
+ * are seen and it is cleaner to include them.
+ */
+
+ xids[count++] = xid;
+ }
+
+ /*
+ * Spin over procArray collecting all subxids, but only if there hasn't
+ * been a suboverflow.
+ */
+ if (!suboverflowed)
+ {
+ XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int nsubxids;
+
+ /*
+ * Save subtransaction XIDs. Other backends can't add or remove
+ * entries while we're holding XidGenLock.
+ */
+ nsubxids = other_subxidstates[index].count;
+ if (nsubxids > 0)
+ {
+ /* barrier not really required, as XidGenLock is held, but ... */
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(&xids[count], proc->subxids.xids,
+ nsubxids * sizeof(TransactionId));
+ count += nsubxids;
+ subcount += nsubxids;
+
+ /*
+ * Top-level XID of a transaction is always less than any of
+ * its subxids, so we don't need to check if any of the
+ * subxids are smaller than oldestRunningXid
+ */
+ }
+ }
+ }
+
+ /*
+ * It's important *not* to include the limits set by slots here because
+ * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
+ * were to be included here the initial value could never increase because
+ * of a circular dependency where slots only increase their limits when
+ * running xacts increases oldestRunningXid and running xacts only
+ * increases if slots do.
+ */
+
+ CurrentRunningXacts->xcnt = count - subcount;
+ CurrentRunningXacts->subxcnt = subcount;
+ CurrentRunningXacts->subxid_overflow = suboverflowed;
+ CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+
+ Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
+ Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
+ Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
+
+ /* We don't release the locks here, the caller is responsible for that */
+
+ return CurrentRunningXacts;
+}
+
+/*
+ * GetOldestActiveTransactionId()
+ *
+ * Similar to GetSnapshotData but returns just oldestActiveXid. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes.
+ * We look at all databases, though there is no need to include WALSender
+ * since this has no effect on hot standby conflicts.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ */
+TransactionId
+GetOldestActiveTransactionId(void)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ TransactionId oldestRunningXid;
+ int index;
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Read nextXid, as the upper bound of what's still active.
+ *
+ * Reading a TransactionId is atomic, but we must grab the lock to make
+ * sure that all XIDs < nextXid are already present in the proc array (or
+ * have already completed), when we spin over it.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ LWLockRelease(XidGenLock);
+
+ /*
+ * Spin over procArray collecting all xids and subxids.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ /*
+ * Top-level XID of a transaction is always less than any of its
+ * subxids, so we don't need to check if any of the subxids are
+ * smaller than oldestRunningXid
+ */
+ }
+ LWLockRelease(ProcArrayLock);
+
+ return oldestRunningXid;
+}
+
+/*
+ * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
+ *
+ * Returns the oldest xid that we can guarantee not to have been affected by
+ * vacuum, i.e. no rows >= that xid have been vacuumed away unless the
+ * transaction aborted. Note that the value can (and most of the time will) be
+ * much more conservative than what really has been affected by vacuum, but we
+ * currently don't have better data available.
+ *
+ * This is useful to initialize the cutoff xid after which a new changeset
+ * extraction replication slot can start decoding changes.
+ *
+ * Must be called with ProcArrayLock held either shared or exclusively,
+ * although most callers will want to use exclusive mode since it is expected
+ * that the caller will immediately use the xid to peg the xmin horizon.
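+ *
+ * A minimal usage sketch (illustrative only; xmin_horizon is just an
+ * illustrative variable name, and the real slot-creation code does more
+ * bookkeeping while the lock is held):
+ *
+ *		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ *		xmin_horizon = GetOldestSafeDecodingTransactionId(true);
+ *		(peg the slot's catalog xmin to xmin_horizon before releasing)
+ *		LWLockRelease(ProcArrayLock);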
+ */
+TransactionId
+GetOldestSafeDecodingTransactionId(bool catalogOnly)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId oldestSafeXid;
+ int index;
+ bool recovery_in_progress = RecoveryInProgress();
+
+ Assert(LWLockHeldByMe(ProcArrayLock));
+
+ /*
+ * Acquire XidGenLock, so no transactions can acquire an xid while we're
+ * running. Otherwise, even if no transaction with an xid were running
+ * concurrently, a newly assigned xid could influence RecentXmin et al.
+ *
+ * We initialize the computation to nextXid since that's guaranteed to be
+ * a safe, albeit pessimal, value.
+ */
+ LWLockAcquire(XidGenLock, LW_SHARED);
+ oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If there's already a slot pegging the xmin horizon, we can start with
+ * that value, it's guaranteed to be safe since it's computed by this
+ * routine initially and has been enforced since. We can always use the
+ * slot's general xmin horizon, but the catalog horizon is only usable
+ * when only catalog data is going to be looked at.
+ */
+ if (TransactionIdIsValid(procArray->replication_slot_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_xmin;
+
+ if (catalogOnly &&
+ TransactionIdIsValid(procArray->replication_slot_catalog_xmin) &&
+ TransactionIdPrecedes(procArray->replication_slot_catalog_xmin,
+ oldestSafeXid))
+ oldestSafeXid = procArray->replication_slot_catalog_xmin;
+
+ /*
+ * If we're not in recovery, we walk over the procarray and collect the
+ * lowest xid. Since we're called with ProcArrayLock held and have
+ * acquired XidGenLock, no entries can vanish concurrently, since
+ * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared
+ * with ProcArrayLock held.
+ *
+ * In recovery we can't lower the safe value besides what we've computed
+ * above, so we'll have to wait a bit longer there. We unfortunately can
+ * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids
+ * machinery can miss values and return an older value than is safe.
+ */
+ if (!recovery_in_progress)
+ {
+ TransactionId *other_xids = ProcGlobal->xids;
+
+ /*
+ * Spin over procArray collecting min(ProcGlobal->xids[i])
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ if (!TransactionIdIsNormal(xid))
+ continue;
+
+ if (TransactionIdPrecedes(xid, oldestSafeXid))
+ oldestSafeXid = xid;
+ }
+ }
+
+ LWLockRelease(XidGenLock);
+
+ return oldestSafeXid;
+}
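/*
 * Illustrative sketch (hypothetical helper, not used anywhere): the locking
 * pattern a caller of GetOldestSafeDecodingTransactionId() is expected to
 * follow.  The cutoff must be installed as an xmin horizon before
 * ProcArrayLock is released, otherwise it can become stale; here
 * "slot_catalog_xmin" merely stands in for a replication slot's catalog
 * xmin field.
 */
static void
sketch_peg_decoding_horizon(TransactionId *slot_catalog_xmin)
{
	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);

	/* compute the cutoff and install it while still holding the lock */
	*slot_catalog_xmin = GetOldestSafeDecodingTransactionId(true);

	LWLockRelease(ProcArrayLock);
}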
+
+/*
+ * GetVirtualXIDsDelayingChkpt -- Get the VXIDs of transactions that are
+ * delaying checkpoint because they have critical actions in progress.
+ *
+ * Constructs an array of VXIDs of transactions that are currently in commit
+ * critical sections, as shown by having specified delayChkptFlags bits set
+ * in their PGPROC.
+ *
+ * Returns a palloc'd array that should be freed by the caller.
+ * *nvxids is the number of valid entries.
+ *
+ * Note that because backends set or clear delayChkptFlags without holding any
+ * lock, the result is somewhat indeterminate, but we don't really care. Even
+ * in a multiprocessor with delayed writes to shared memory, it should be
+ * certain that the setting of delayChkptFlags will propagate to shared
+ * memory when the backend takes a lock, so we cannot fail to see a virtual
+ * xact as delaying the checkpoint if it has already inserted its commit
+ * record. Whether it
+ * takes a little while for clearing of delayChkptFlags to propagate is
+ * unimportant for correctness.
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkpt(int *nvxids, int type)
+{
+ VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ Assert(type != 0);
+
+ /* allocate what's certainly enough result space */
+ vxids = (VirtualTransactionId *)
+ palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if ((proc->delayChkptFlags & type) != 0)
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ *nvxids = count;
+ return vxids;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying?
+ *
+ * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any
+ * of the specified VXIDs are still in critical sections of code.
+ *
+ * Note: this is O(N^2) in the number of vxacts that are/were delaying, but
+ * those numbers should be small enough for it not to be a problem.
+ */
+bool
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids, int type)
+{
+ bool result = false;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ Assert(type != 0);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if ((proc->delayChkptFlags & type) != 0 &&
+ VirtualTransactionIdIsValid(vxid))
+ {
+ int i;
+
+ for (i = 0; i < nvxids; i++)
+ {
+ if (VirtualTransactionIdEquals(vxid, vxids[i]))
+ {
+ result = true;
+ break;
+ }
+ }
+ if (result)
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
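/*
 * Illustrative sketch (hypothetical helper, not used anywhere) of how the two
 * functions above are meant to be paired: snapshot the set of delaying VXIDs
 * once, then poll until none of them is delaying any longer.  The
 * DELAY_CHKPT_START flag and the 10ms sleep are illustrative choices, not
 * requirements.
 */
static void
sketch_wait_for_delaying_vxids(void)
{
	VirtualTransactionId *vxids;
	int			nvxids;

	vxids = GetVirtualXIDsDelayingChkpt(&nvxids, DELAY_CHKPT_START);

	while (nvxids > 0 &&
		   HaveVirtualXIDsDelayingChkpt(vxids, nvxids, DELAY_CHKPT_START))
		pg_usleep(10000L);		/* recheck every 10ms */

	pfree(vxids);
}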
+
+/*
+ * BackendPidGetProc -- get a backend's PGPROC given its PID
+ *
+ * Returns NULL if not found. Note that it is up to the caller to be
+ * sure that the question remains meaningful for long enough for the
+ * answer to be used ...
+ */
+PGPROC *
+BackendPidGetProc(int pid)
+{
+ PGPROC *result;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ result = BackendPidGetProcWithLock(pid);
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID
+ *
+ * Same as above, except caller must be holding ProcArrayLock. The found
+ * entry, if any, can be assumed to be valid as long as the lock remains held.
+ */
+PGPROC *
+BackendPidGetProcWithLock(int pid)
+{
+ PGPROC *result = NULL;
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ PGPROC *proc = &allProcs[arrayP->pgprocnos[index]];
+
+ if (proc->pid == pid)
+ {
+ result = proc;
+ break;
+ }
+ }
+
+ return result;
+}
+
+/*
+ * BackendXidGetPid -- get a backend's pid given its XID
+ *
+ * Returns 0 if not found or it's a prepared transaction. Note that
+ * it is up to the caller to be sure that the question remains
+ * meaningful for long enough for the answer to be used ...
+ *
+ * Only main transaction Ids are considered. This function is mainly
+ * useful for determining what backend owns a lock.
+ *
+ * Beware that not every xact has an XID assigned. However, as long as you
+ * only call this using an XID found on disk, you're safe.
+ */
+int
+BackendXidGetPid(TransactionId xid)
+{
+ int result = 0;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ int index;
+
+ if (xid == InvalidTransactionId) /* never match invalid xid */
+ return 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (other_xids[index] == xid)
+ {
+ result = proc->pid;
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return result;
+}
+
+/*
+ * IsBackendPid -- is a given pid a running backend
+ *
+ * This is not called by the backend, but is called by external modules.
+ */
+bool
+IsBackendPid(int pid)
+{
+ return (BackendPidGetProc(pid) != NULL);
+}
+
+
+/*
+ * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is palloc'd. The number of valid entries is returned into *nvxids.
+ *
+ * The arguments allow filtering the set of VXIDs returned. Our own process
+ * is always skipped. In addition:
+ * If limitXmin is not InvalidTransactionId, skip processes with
+ * xmin > limitXmin.
+ * If excludeXmin0 is true, skip processes with xmin = 0.
+ * If allDbs is false, skip processes attached to other databases.
+ * If excludeVacuum isn't zero, skip processes for which
+ * (statusFlags & excludeVacuum) is not zero.
+ *
+ * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
+ * allow skipping backends whose oldest live snapshot is no older than
+ * some snapshot we have. Since we examine the procarray with only shared
+ * lock, there are race conditions: a backend could set its xmin just after
+ * we look. Indeed, on multiprocessors with weak memory ordering, the
+ * other backend could have set its xmin *before* we look. We know however
+ * that such a backend must have held shared ProcArrayLock overlapping our
+ * own hold of ProcArrayLock, else we would see its xmin update. Therefore,
+ * any snapshot the other backend is taking concurrently with our scan cannot
+ * consider any transactions as still running that we think are committed
+ * (since backends must hold ProcArrayLock exclusive to commit).
+ */
+VirtualTransactionId *
+GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
+ bool allDbs, int excludeVacuum,
+ int *nvxids)
+{
+ VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /* allocate what's certainly enough result space */
+ vxids = (VirtualTransactionId *)
+ palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs);
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+
+ if (proc == MyProc)
+ continue;
+
+ if (excludeVacuum & statusFlags)
+ continue;
+
+ if (allDbs || proc->databaseId == MyDatabaseId)
+ {
+ /* Fetch xmin just once - might change on us */
+ TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ if (excludeXmin0 && !TransactionIdIsValid(pxmin))
+ continue;
+
+ /*
+ * InvalidTransactionId precedes all other XIDs, so a proc that
+ * hasn't set xmin yet will not be rejected by this test.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ TransactionIdPrecedesOrEquals(pxmin, limitXmin))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ *nvxids = count;
+ return vxids;
+}
+
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * Usage is limited to conflict resolution during recovery on standby servers.
+ * limitXmin is supplied as either a cutoff with snapshotConflictHorizon
+ * semantics, or InvalidTransactionId in cases where caller cannot accurately
+ * determine a safe snapshotConflictHorizon value.
+ *
+ * If limitXmin is InvalidTransactionId then we want to kill everybody,
+ * so we're not worried if they have a snapshot or not, nor does it really
+ * matter what type of lock we hold. Caller must avoid calling here with
+ * snapshotConflictHorizon style cutoffs that were set to InvalidTransactionId
+ * during original execution, since that actually indicates that there is
+ * definitely no need for a recovery conflict (the snapshotConflictHorizon
+ * convention for InvalidTransactionId values is the opposite of our own!).
+ *
+ * All callers that check xmins now always supply a valid and useful value
+ * for limitXmin. The limitXmin is always lower than the lowest
+ * numbered KnownAssignedXid that is not already a FATAL error. This is
+ * because we only care about cleanup records that are cleaning up tuple
+ * versions from committed transactions. In that case they will only occur
+ * at the point where the record is less than the lowest running xid. That
+ * allows us to say that if any backend takes a snapshot concurrently with
+ * us then the conflict assessment made here would never include the snapshot
+ * that is being derived. So we take LW_SHARED on the ProcArray and allow
+ * concurrent snapshots when limitXmin is valid. We might think about adding
+ * Assert(limitXmin < lowest(KnownAssignedXids))
+ * but that would not hold when FATAL-errored entries are lagging in the
+ * array; since we already know those are bogus anyway, we skip that test.
+ *
+ * If dbOid is valid we skip backends attached to other databases.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
+{
+ static VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+	 * malloc it permanently to avoid repeated palloc/pfree overhead. Allocate
+	 * enough result space, remembering to leave room for a terminator.
+ */
+ if (vxids == NULL)
+ {
+ vxids = (VirtualTransactionId *)
+ malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+ if (vxids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ /* Exclude prepared transactions */
+ if (proc->pid == 0)
+ continue;
+
+ if (!OidIsValid(dbOid) ||
+ proc->databaseId == dbOid)
+ {
+ /* Fetch xmin just once - can't change on us, but good coding */
+ TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * We ignore an invalid pxmin because this means that backend has
+ * no snapshot currently. We hold a Share lock to avoid contention
+ * with users taking snapshots. That is not a problem because the
+ * current xmin is always at least one higher than the latest
+ * removed xid, so any new snapshot would never conflict with the
+ * test here.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /* add the terminator */
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+ return vxids;
+}
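/*
 * Illustrative sketch (hypothetical helper, not used anywhere) of consuming
 * the terminator-ended array returned by GetConflictingVirtualXIDs(): walk it
 * until the invalid-VXID terminator and signal each backend.  Real recovery
 * conflict handling also waits for the signaled backends and escalates;
 * PROCSIG_RECOVERY_CONFLICT_SNAPSHOT is just one possible signal reason.
 */
static void
sketch_signal_conflicting_vxids(TransactionId limitXmin, Oid dbOid)
{
	VirtualTransactionId *vxids = GetConflictingVirtualXIDs(limitXmin, dbOid);

	while (VirtualTransactionIdIsValid(*vxids))
	{
		(void) CancelVirtualTransaction(*vxids,
										PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
		vxids++;				/* do not pfree: the result array is reused */
	}
}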
+
+/*
+ * CancelVirtualTransaction - used in recovery conflict processing
+ *
+ * Returns pid of the process signaled, or 0 if not found.
+ */
+pid_t
+CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
+{
+ return SignalVirtualTransaction(vxid, sigmode, true);
+}
+
+pid_t
+SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
+ bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+ pid_t pid = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ VirtualTransactionId procvxid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ if (procvxid.backendId == vxid.backendId &&
+ procvxid.localTransactionId == vxid.localTransactionId)
+ {
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, vxid.backendId);
+ }
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return pid;
+}
+
+/*
+ * MinimumActiveBackends --- count backends (other than myself) that are
+ * in active transactions. Return true if the count reaches the
+ * minimum threshold passed. This is used as a heuristic to decide if
+ * a pre-XLOG-flush delay is worthwhile during commit.
+ *
+ * Do not count backends that are blocked waiting for locks, since they are
+ * not going to get to run until someone else commits.
+ */
+bool
+MinimumActiveBackends(int min)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /* Quick short-circuit if no minimum is specified */
+ if (min == 0)
+ return true;
+
+ /*
+ * Note: for speed, we don't acquire ProcArrayLock. This is a little bit
+ * bogus, but since we are only testing fields for zero or nonzero, it
+ * should be OK. The result is only used for heuristic purposes anyway...
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ /*
+ * Since we're not holding a lock, need to be prepared to deal with
+ * garbage, as someone could have incremented numProcs but not yet
+ * filled the structure.
+ *
+ * If someone just decremented numProcs, 'proc' could also point to a
+ * PGPROC entry that's no longer in the array. It still points to a
+ * PGPROC struct, though, because freed PGPROC entries just go to the
+ * free list and are recycled. Its contents are nonsense in that case,
+ * but that's acceptable for this function.
+ */
+ if (pgprocno == -1)
+ continue; /* do not count deleted entries */
+ if (proc == MyProc)
+ continue; /* do not count myself */
+ if (proc->xid == InvalidTransactionId)
+ continue; /* do not count if no XID assigned */
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->waitLock != NULL)
+ continue; /* do not count if blocked on a lock */
+ count++;
+ if (count >= min)
+ break;
+ }
+
+ return count >= min;
+}
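/*
 * Illustrative sketch (hypothetical helper, not used anywhere) of the kind of
 * caller MinimumActiveBackends() is written for: deciding whether a short
 * pre-WAL-flush sleep is worthwhile at commit.  The commit_delay and
 * commit_siblings values are passed in here rather than read from the real
 * settings.
 */
static void
sketch_maybe_delay_before_flush(int commit_delay, int commit_siblings)
{
	/* sleep only if enough other active backends might piggyback on us */
	if (commit_delay > 0 && MinimumActiveBackends(commit_siblings))
		pg_usleep(commit_delay);
}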
+
+/*
+ * CountDBBackends --- count backends that are using specified database
+ */
+int
+CountDBBackends(Oid databaseid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (!OidIsValid(databaseid) ||
+ proc->databaseId == databaseid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CountDBConnections --- counts database backends ignoring any background
+ * worker processes
+ */
+int
+CountDBConnections(Oid databaseid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->isBackgroundWorker)
+ continue; /* do not count background workers */
+ if (!OidIsValid(databaseid) ||
+ proc->databaseId == databaseid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CancelDBBackends --- cancel backends that are using specified database
+ */
+void
+CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ /* tell all backends to die */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (databaseid == InvalidOid || proc->databaseId == databaseid)
+ {
+ VirtualTransactionId procvxid;
+ pid_t pid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, procvxid.backendId);
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * CountUserBackends --- count backends that are used by specified user
+ */
+int
+CountUserBackends(Oid roleid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->pid == 0)
+ continue; /* do not count prepared xacts */
+ if (proc->isBackgroundWorker)
+ continue; /* do not count background workers */
+ if (proc->roleId == roleid)
+ count++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return count;
+}
+
+/*
+ * CountOtherDBBackends -- check for other backends running in the given DB
+ *
+ * If there are other backends in the DB, we will wait a maximum of 5 seconds
+ * for them to exit. Autovacuum backends are encouraged to exit early by
+ * sending them SIGTERM, but normal user backends are just waited for.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * Returns true if there are (still) other backends in the DB, false if not.
+ * Also, *nbackends and *nprepared are set to the number of other backends
+ * and prepared transactions in the DB, respectively.
+ *
+ * This function is used to interlock DROP DATABASE and related commands
+ * against there being any active backends in the target DB --- dropping the
+ * DB while active backends remain would be a Bad Thing. Note that we cannot
+ * detect here the possibility of a newly-started backend that is trying to
+ * connect to the doomed database, so additional interlocking is needed during
+ * backend startup. The caller should normally hold an exclusive lock on the
+ * target DB before calling this, which is one reason we mustn't wait
+ * indefinitely.
+ */
+bool
+CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
+{
+ ProcArrayStruct *arrayP = procArray;
+
+#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
+ int autovac_pids[MAXAUTOVACPIDS];
+ int tries;
+
+ /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
+ for (tries = 0; tries < 50; tries++)
+ {
+ int nautovacs = 0;
+ bool found = false;
+ int index;
+
+ CHECK_FOR_INTERRUPTS();
+
+ *nbackends = *nprepared = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+
+ if (proc->databaseId != databaseId)
+ continue;
+ if (proc == MyProc)
+ continue;
+
+ found = true;
+
+ if (proc->pid == 0)
+ (*nprepared)++;
+ else
+ {
+ (*nbackends)++;
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ nautovacs < MAXAUTOVACPIDS)
+ autovac_pids[nautovacs++] = proc->pid;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ if (!found)
+ return false; /* no conflicting backends, so done */
+
+ /*
+ * Send SIGTERM to any conflicting autovacuums before sleeping. We
+ * postpone this step until after the loop because we don't want to
+ * hold ProcArrayLock while issuing kill(). We have no idea what might
+ * block kill() inside the kernel...
+ */
+ for (index = 0; index < nautovacs; index++)
+ (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */
+
+ /* sleep, then try again */
+ pg_usleep(100 * 1000L); /* 100ms */
+ }
+
+ return true; /* timed out, still conflicts */
+}
+
+/*
+ * Terminate existing connections to the specified database. This routine
+ * is used by the DROP DATABASE command when user has asked to forcefully
+ * drop the database.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * It refuses to terminate any connections if even one backend in the
+ * target database has a prepared transaction.
+ */
+void
+TerminateOtherDBBackends(Oid databaseId)
+{
+ ProcArrayStruct *arrayP = procArray;
+ List *pids = NIL;
+ int nprepared = 0;
+ int i;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (i = 0; i < procArray->numProcs; i++)
+ {
+ int pgprocno = arrayP->pgprocnos[i];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (proc->databaseId != databaseId)
+ continue;
+ if (proc == MyProc)
+ continue;
+
+ if (proc->pid != 0)
+ pids = lappend_int(pids, proc->pid);
+ else
+ nprepared++;
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ if (nprepared > 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_IN_USE),
+ errmsg("database \"%s\" is being used by prepared transactions",
+ get_database_name(databaseId)),
+ errdetail_plural("There is %d prepared transaction using the database.",
+ "There are %d prepared transactions using the database.",
+ nprepared,
+ nprepared)));
+
+ if (pids)
+ {
+ ListCell *lc;
+
+ /*
+ * Check whether we have the necessary rights to terminate other
+ * sessions. We don't terminate any session until we ensure that we
+ * have rights on all the sessions to be terminated. These checks are
+ * the same as we do in pg_terminate_backend.
+ *
+		 * In this case we don't raise warnings such as "PID %d is not a
+		 * PostgreSQL server process", because an already-finished session is
+		 * not a problem for us.
+ */
+ foreach(lc, pids)
+ {
+ int pid = lfirst_int(lc);
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ /* Only allow superusers to signal superuser-owned backends. */
+ if (superuser_arg(proc->roleId) && !superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to terminate process"),
+ errdetail("Only roles with the %s attribute may terminate processes of roles with the %s attribute.",
+ "SUPERUSER", "SUPERUSER")));
+
+ /* Users can signal backends they have role membership in. */
+ if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+ !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to terminate process"),
+ errdetail("Only roles with privileges of the role whose process is being terminated or with privileges of the \"%s\" role may terminate this process.",
+ "pg_signal_backend")));
+ }
+ }
+
+ /*
+ * There's a race condition here: once we release the ProcArrayLock,
+ * it's possible for the session to exit before we issue kill. That
+ * race condition possibility seems too unlikely to worry about. See
+ * pg_signal_backend.
+ */
+ foreach(lc, pids)
+ {
+ int pid = lfirst_int(lc);
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ if (proc != NULL)
+ {
+ /*
+ * If we have setsid(), signal the backend's whole process
+ * group
+ */
+#ifdef HAVE_SETSID
+ (void) kill(-pid, SIGTERM);
+#else
+ (void) kill(pid, SIGTERM);
+#endif
+ }
+ }
+ }
+}
+
+/*
+ * ProcArraySetReplicationSlotXmin
+ *
+ * Install limits to future computations of the xmin horizon to prevent vacuum
+ * and HOT pruning from removing affected rows still needed by clients with
+ * replication slots.
+ */
+void
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
+ bool already_locked)
+{
+ Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
+
+ if (!already_locked)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ procArray->replication_slot_xmin = xmin;
+ procArray->replication_slot_catalog_xmin = catalog_xmin;
+
+ if (!already_locked)
+ LWLockRelease(ProcArrayLock);
+
+ elog(DEBUG1, "xmin required by slots: data %u, catalog %u",
+ xmin, catalog_xmin);
+}
+
+/*
+ * ProcArrayGetReplicationSlotXmin
+ *
+ * Return the current slot xmin limits. That's useful to be able to remove
+ * data that's older than those limits.
+ */
+void
+ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+ TransactionId *catalog_xmin)
+{
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (xmin != NULL)
+ *xmin = procArray->replication_slot_xmin;
+
+ if (catalog_xmin != NULL)
+ *catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * XidCacheRemoveRunningXids
+ *
+ * Remove a bunch of TransactionIds from the list of known-running
+ * subtransactions for my backend. Both the specified xid and those in
+ * the xids[] array (of length nxids) are removed from the subxids cache.
+ * latestXid must be the latest XID among the group.
+ */
+void
+XidCacheRemoveRunningXids(TransactionId xid,
+ int nxids, const TransactionId *xids,
+ TransactionId latestXid)
+{
+ int i,
+ j;
+ XidCacheStatus *mysubxidstat;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /*
+ * We must hold ProcArrayLock exclusively in order to remove transactions
+ * from the PGPROC array. (See src/backend/access/transam/README.) It's
+ * possible this could be relaxed since we know this routine is only used
+ * to abort subtransactions, but pending closer analysis we'd best be
+ * conservative.
+ *
+ * Note that we do not have to be careful about memory ordering of our own
+ * reads wrt. GetNewTransactionId() here - only this process can modify
+ * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be
+ * careful about our own writes being well ordered.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff];
+
+ /*
+ * Under normal circumstances xid and xids[] will be in increasing order,
+ * as will be the entries in subxids. Scan backwards to avoid O(N^2)
+ * behavior when removing a lot of xids.
+ */
+ for (i = nxids - 1; i >= 0; i--)
+ {
+ TransactionId anxid = xids[i];
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+
+ /*
+ * Ordinarily we should have found it, unless the cache has
+ * overflowed. However it's also possible for this routine to be
+ * invoked multiple times for the same subtransaction, in case of an
+ * error during AbortSubTransaction. So instead of Assert, emit a
+ * debug warning.
+ */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", anxid);
+ }
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+ /* Ordinarily we should have found it, unless the cache has overflowed */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", xid);
+
+ /* Also advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+#ifdef XIDCACHE_DEBUG
+
+/*
+ * Print stats about effectiveness of XID cache
+ */
+static void
+DisplayXidCache(void)
+{
+ fprintf(stderr,
+ "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
+ xc_by_recent_xmin,
+ xc_by_known_xact,
+ xc_by_my_xact,
+ xc_by_latest_xid,
+ xc_by_main_xid,
+ xc_by_child_xid,
+ xc_by_known_assigned,
+ xc_no_overflow,
+ xc_slow_answer);
+}
+#endif /* XIDCACHE_DEBUG */
+
+/*
+ * If rel != NULL, return test state appropriate for relation, otherwise
+ * return state usable for all relations. The latter may consider XIDs as
+ * not-yet-visible-to-everyone that a state for a specific relation would
+ * already consider visible-to-everyone.
+ *
+ * This needs to be called while a snapshot is active or registered, otherwise
+ * there are wraparound and other dangers.
+ *
+ * See comment for GlobalVisState for details.
+ */
+GlobalVisState *
+GlobalVisTestFor(Relation rel)
+{
+ GlobalVisState *state = NULL;
+
+ /* XXX: we should assert that a snapshot is pushed or registered */
+ Assert(RecentXmin);
+
+ switch (GlobalVisHorizonKindForRel(rel))
+ {
+ case VISHORIZON_SHARED:
+ state = &GlobalVisSharedRels;
+ break;
+ case VISHORIZON_CATALOG:
+ state = &GlobalVisCatalogRels;
+ break;
+ case VISHORIZON_DATA:
+ state = &GlobalVisDataRels;
+ break;
+ case VISHORIZON_TEMP:
+ state = &GlobalVisTempRels;
+ break;
+ }
+
+ Assert(FullTransactionIdIsValid(state->definitely_needed) &&
+ FullTransactionIdIsValid(state->maybe_needed));
+
+ return state;
+}
+
+/*
+ * Return true if it's worth updating the accurate maybe_needed boundary.
+ *
+ * As it is somewhat expensive to determine xmin horizons, we don't want to
+ * repeatedly do so when there is a low likelihood of it being beneficial.
+ *
+ * The current heuristic is that we update only if RecentXmin has changed
+ * since the last update. If the oldest currently running transaction has not
+ * finished, it is unlikely that recomputing the horizon would be useful.
+ */
+static bool
+GlobalVisTestShouldUpdate(GlobalVisState *state)
+{
+ /* hasn't been updated yet */
+ if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin))
+ return true;
+
+ /*
+ * If the maybe_needed/definitely_needed boundaries are the same, it's
+ * unlikely to be beneficial to refresh boundaries.
+ */
+ if (FullTransactionIdFollowsOrEquals(state->maybe_needed,
+ state->definitely_needed))
+ return false;
+
+ /* does the last snapshot built have a different xmin? */
+ return RecentXmin != ComputeXidHorizonsResultLastXmin;
+}
+
+static void
+GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons)
+{
+ GlobalVisSharedRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->shared_oldest_nonremovable);
+ GlobalVisCatalogRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->catalog_oldest_nonremovable);
+ GlobalVisDataRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->data_oldest_nonremovable);
+ GlobalVisTempRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->temp_oldest_nonremovable);
+
+ /*
+ * In longer running transactions it's possible that transactions we
+ * previously needed to treat as running aren't around anymore. So update
+ * definitely_needed to not be earlier than maybe_needed.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ GlobalVisDataRels.definitely_needed);
+ GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed;
+
+ ComputeXidHorizonsResultLastXmin = RecentXmin;
+}
+
+/*
+ * Update boundaries in GlobalVis{Shared,Catalog,Data,Temp}Rels
+ * using ComputeXidHorizons().
+ */
+static void
+GlobalVisUpdate(void)
+{
+ ComputeXidHorizonsResult horizons;
+
+ /* updates the horizons as a side-effect */
+ ComputeXidHorizons(&horizons);
+}
+
+/*
+ * Return true if no snapshot still considers fxid to be running.
+ *
+ * The state passed needs to have been initialized for the relation fxid is
+ * from (NULL is also OK), otherwise the result may not be correct.
+ *
+ * See comment for GlobalVisState for details.
+ */
+bool
+GlobalVisTestIsRemovableFullXid(GlobalVisState *state,
+ FullTransactionId fxid)
+{
+ /*
+ * If fxid is older than maybe_needed bound, it definitely is visible to
+ * everyone.
+ */
+ if (FullTransactionIdPrecedes(fxid, state->maybe_needed))
+ return true;
+
+ /*
+ * If fxid is >= definitely_needed bound, it is very likely to still be
+ * considered running.
+ */
+ if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed))
+ return false;
+
+ /*
+ * fxid is between maybe_needed and definitely_needed, i.e. there might or
+ * might not exist a snapshot considering fxid running. If it makes sense,
+ * update boundaries and recheck.
+ */
+ if (GlobalVisTestShouldUpdate(state))
+ {
+ GlobalVisUpdate();
+
+ Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed));
+
+ return FullTransactionIdPrecedes(fxid, state->maybe_needed);
+ }
+ else
+ return false;
+}
+
+/*
+ * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids.
+ *
+ * It is crucial that this only gets called for xids from a source that
+ * protects against xid wraparounds (e.g. from a table and thus protected by
+ * relfrozenxid).
+ */
+bool
+GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid)
+{
+ FullTransactionId fxid;
+
+ /*
+ * Convert 32 bit argument to FullTransactionId. We can do so safely
+ * because we know the xid has to, at the very least, be between
+	 * [oldestXid, nextXid), i.e. within 2 billion XIDs of those bounds. To
+	 * avoid taking a lock to determine either, we can just compare with
+	 * state->definitely_needed, which was based on those values at the time
+ * the current snapshot was built.
+ */
+ fxid = FullXidRelativeTo(state->definitely_needed, xid);
+
+ return GlobalVisTestIsRemovableFullXid(state, fxid);
+}
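/*
 * Illustrative sketch (hypothetical helper, not used anywhere) of the
 * intended calling pattern for the GlobalVis machinery: set up a test state
 * once per relation, then cheaply test many xids against it while scanning.
 * "dead_xmax" stands in for whatever xid the caller needs to judge; real
 * pruning code keeps the state around for the whole scan instead of
 * rebuilding it per tuple.
 */
static bool
sketch_xid_is_removable(Relation rel, TransactionId dead_xmax)
{
	GlobalVisState *vistest = GlobalVisTestFor(rel);

	/* cheap test; may internally refresh the horizons if inconclusive */
	return GlobalVisTestIsRemovableXid(vistest, dead_xmax);
}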
+
+/*
+ * Return FullTransactionId below which all transactions are not considered
+ * running anymore.
+ *
+ * Note: This is less efficient than testing with
+ * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate
+ * cutoff, even in the case all the XIDs compared with the cutoff are outside
+ * [maybe_needed, definitely_needed).
+ */
+FullTransactionId
+GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state)
+{
+ /* acquire accurate horizon if not already done */
+ if (GlobalVisTestShouldUpdate(state))
+ GlobalVisUpdate();
+
+ return state->maybe_needed;
+}
+
+/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */
+TransactionId
+GlobalVisTestNonRemovableHorizon(GlobalVisState *state)
+{
+ FullTransactionId cutoff;
+
+ cutoff = GlobalVisTestNonRemovableFullHorizon(state);
+
+ return XidFromFullTransactionId(cutoff);
+}
+
+/*
+ * Convenience wrapper around GlobalVisTestFor() and
+ * GlobalVisTestIsRemovableFullXid(), see their comments.
+ */
+bool
+GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
+{
+ GlobalVisState *state;
+
+ state = GlobalVisTestFor(rel);
+
+ return GlobalVisTestIsRemovableFullXid(state, fxid);
+}
+
+/*
+ * Convenience wrapper around GlobalVisTestFor() and
+ * GlobalVisTestIsRemovableXid(), see their comments.
+ */
+bool
+GlobalVisCheckRemovableXid(Relation rel, TransactionId xid)
+{
+ GlobalVisState *state;
+
+ state = GlobalVisTestFor(rel);
+
+ return GlobalVisTestIsRemovableXid(state, xid);
+}
+
+/*
+ * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it
+ * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel).
+ *
+ * Be very careful about when to use this function. It can only safely be used
+ * when there is a guarantee that xid is within MaxTransactionId / 2 xids of
+ * rel. That can be guaranteed, e.g., if the caller ensures a snapshot is
+ * held by the backend and xid is from a table (where vacuum/freezing ensures
+ * the xid has to be within that range), or if xid is from the procarray,
+ * which prevents xid wraparound that way.
+ */
+static inline FullTransactionId
+FullXidRelativeTo(FullTransactionId rel, TransactionId xid)
+{
+ TransactionId rel_xid = XidFromFullTransactionId(rel);
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(TransactionIdIsValid(rel_xid));
+
+ /* not guaranteed to find issues, but likely to catch mistakes */
+ AssertTransactionIdInAllowableRange(xid);
+
+ return FullTransactionIdFromU64(U64FromFullTransactionId(rel)
+ + (int32) (xid - rel_xid));
+}
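/*
 * Worked example of the arithmetic above, with illustrative values: let rel
 * be epoch 3, xid 100, i.e. the 64 bit value 3 * 2^32 + 100, so rel_xid is
 * 100.  For xid = 4294967290 (2^32 - 6), the unsigned difference
 * xid - rel_xid is 4294967190, which reinterpreted as an int32 is -106;
 * adding that to rel's 64 bit value gives 3 * 2^32 - 6, i.e. epoch 2,
 * xid 4294967290.  XIDs "just behind" the reference point thus land in the
 * previous epoch and XIDs "just ahead" in the same or next one, which is
 * exactly right as long as xid stays within 2^31 of rel.
 */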
+
+
+/* ----------------------------------------------
+ * KnownAssignedTransactionIds sub-module
+ * ----------------------------------------------
+ */
+
+/*
+ * In Hot Standby mode, we maintain a list of transactions that are (or were)
+ * running on the primary at the current point in WAL. These XIDs must be
+ * treated as running by standby transactions, even though they are not in
+ * the standby server's PGPROC array.
+ *
+ * We record all XIDs that we know have been assigned. That includes all the
+ * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
+ * been assigned. We can deduce the existence of unobserved XIDs because we
+ * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids
+ * list expands as new XIDs are observed or inferred, and contracts when
+ * transaction completion records arrive.
+ *
+ * During hot standby we do not fret too much about the distinction between
+ * top-level XIDs and subtransaction XIDs. We store both together in the
+ * KnownAssignedXids list. In backends, this is copied into snapshots in
+ * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
+ * doesn't care about the distinction either. Subtransaction XIDs are
+ * effectively treated as top-level XIDs and in the typical case pg_subtrans
+ * links are *not* maintained (which does not affect visibility).
+ *
+ * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
+ * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must
+ * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
+ * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these
+ * records, we mark the subXIDs as children of the top XID in pg_subtrans,
+ * and then remove them from KnownAssignedXids. This prevents overflow of
+ * KnownAssignedXids and snapshots, at the cost that status checks for these
+ * subXIDs will take a slower path through TransactionIdIsInProgress().
+ * This means that KnownAssignedXids is not necessarily complete for subXIDs,
+ * though it should be complete for top-level XIDs; this is the same situation
+ * that holds with respect to the PGPROC entries in normal running.
+ *
+ * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
+ * that, similarly to tracking overflow of a PGPROC's subxids array. We do
+ * that by remembering the lastOverflowedXid, ie the last thrown-away subXID.
+ * As long as that is within the range of interesting XIDs, we have to assume
+ * that subXIDs are missing from snapshots. (Note that subXID overflow occurs
+ * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
+ * subXID arrives - that is not an error.)
+ *
+ * Should a backend on primary somehow disappear before it can write an abort
+ * record, then we just leave those XIDs in KnownAssignedXids. They actually
+ * aborted but we think they were running; the distinction is irrelevant
+ * because either way any changes done by the transaction are not visible to
+ * backends in the standby. We prune KnownAssignedXids when
+ * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
+ * array due to such dead XIDs.
+ */
+
+/*
+ * RecordKnownAssignedTransactionIds
+ * Record the given XID in KnownAssignedXids, as well as any preceding
+ * unobserved XIDs.
+ *
+ * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
+ * associated with a transaction. Must be called for each record after we
+ * have executed StartupCLOG() et al, since we must ExtendCLOG() etc..
+ *
+ * Called during recovery in analogy with and in place of GetNewTransactionId()
+ */
+void
+RecordKnownAssignedTransactionIds(TransactionId xid)
+{
+ Assert(standbyState >= STANDBY_INITIALIZED);
+ Assert(TransactionIdIsValid(xid));
+ Assert(TransactionIdIsValid(latestObservedXid));
+
+ elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
+ xid, latestObservedXid);
+
+ /*
+ * When a newly observed xid arrives, it is frequently the case that it is
+ * *not* the next xid in sequence. When this occurs, we must treat the
+ * intervening xids as running also.
+ */
+ if (TransactionIdFollows(xid, latestObservedXid))
+ {
+ TransactionId next_expected_xid;
+
+ /*
+ * Extend subtrans like we do in GetNewTransactionId() during normal
+ * operation using individual extend steps. Note that we do not need
+ * to extend clog since its extensions are WAL logged.
+ *
+ * This part has to be done regardless of standbyState since we
+ * immediately start assigning subtransactions to their toplevel
+ * transactions.
+ */
+ next_expected_xid = latestObservedXid;
+ while (TransactionIdPrecedes(next_expected_xid, xid))
+ {
+ TransactionIdAdvance(next_expected_xid);
+ ExtendSUBTRANS(next_expected_xid);
+ }
+ Assert(next_expected_xid == xid);
+
+ /*
+ * If the KnownAssignedXids machinery isn't up yet, there's nothing
+ * more to do since we don't track assigned xids yet.
+ */
+ if (standbyState <= STANDBY_INITIALIZED)
+ {
+ latestObservedXid = xid;
+ return;
+ }
+
+ /*
+ * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
+ */
+ next_expected_xid = latestObservedXid;
+ TransactionIdAdvance(next_expected_xid);
+ KnownAssignedXidsAdd(next_expected_xid, xid, false);
+
+ /*
+ * Now we can advance latestObservedXid
+ */
+ latestObservedXid = xid;
+
+ /* ShmemVariableCache->nextXid must be beyond any observed xid */
+ AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+ }
+}
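/*
 * Concrete example of the gap-filling above: if latestObservedXid is 100 and
 * a WAL record arrives for xid 105, then xids 101..104 must have been
 * assigned on the primary even though we never saw records for them.
 * ExtendSUBTRANS() is run for 101..105 and KnownAssignedXidsAdd(101, 105)
 * records all five as running, after which latestObservedXid becomes 105.
 */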
+
+/*
+ * ExpireTreeKnownAssignedTransactionIds
+ * Remove the given XIDs from KnownAssignedXids.
+ *
+ * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
+ */
+void
+ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TransactionId max_xid)
+{
+ Assert(standbyState >= STANDBY_INITIALIZED);
+
+ /*
+ * Uses same locking as transaction commit
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
+
+ /* As in ProcArrayEndTransaction, advance latestCompletedXid */
+ MaintainLatestCompletedXidRecovery(max_xid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireAllKnownAssignedTransactionIds
+ * Remove all entries in KnownAssignedXids and reset lastOverflowedXid.
+ */
+void
+ExpireAllKnownAssignedTransactionIds(void)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ KnownAssignedXidsRemovePreceding(InvalidTransactionId);
+
+ /*
+ * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after
+	 * the call of this function. But do this for consistency with what
+	 * ExpireOldKnownAssignedTransactionIds() does.
+ */
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireOldKnownAssignedTransactionIds
+ * Remove KnownAssignedXids entries preceding the given XID and
+ * potentially reset lastOverflowedXid.
+ */
+void
+ExpireOldKnownAssignedTransactionIds(TransactionId xid)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+	 * Reset lastOverflowedXid if we know that all transactions that might
+	 * have been running are now gone. Not doing so could leave an incorrect
+	 * lastOverflowedXid value, which would cause extra snapshots to be marked
+	 * as suboverflowed.
+ */
+ if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid))
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ KnownAssignedXidsRemovePreceding(xid);
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * KnownAssignedTransactionIdsIdleMaintenance
+ * Opportunistically do maintenance work when the startup process
+ * is about to go idle.
+ */
+void
+KnownAssignedTransactionIdsIdleMaintenance(void)
+{
+ KnownAssignedXidsCompress(KAX_STARTUP_PROCESS_IDLE, false);
+}
+
+
+/*
+ * Private module functions to manipulate KnownAssignedXids
+ *
+ * There are 5 main uses of the KnownAssignedXids data structure:
+ *
+ * * backends taking snapshots - all valid XIDs need to be copied out
+ * * backends seeking to determine presence of a specific XID
+ * * startup process adding new known-assigned XIDs
+ * * startup process removing specific XIDs as transactions end
+ * * startup process pruning array when special WAL records arrive
+ *
+ * This data structure is known to be a hot spot during Hot Standby, so we
+ * go to some lengths to make these operations as efficient and as concurrent
+ * as possible.
+ *
+ * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
+ * order, to be exact --- to allow binary search for specific XIDs. Note:
+ * in general TransactionIdPrecedes would not provide a total order, but
+ * we know that the entries present at any instant should not extend across
+ * a large enough fraction of XID space to wrap around (the primary would
+ * shut down for fear of XID wrap long before that happens). So it's OK to
+ * use TransactionIdPrecedes as a binary-search comparator.
+ *
+ * It's cheap to maintain the sortedness during insertions, since new known
+ * XIDs are always reported in XID order; we just append them at the right.
+ *
+ * To keep individual deletions cheap, we need to allow gaps in the array.
+ * This is implemented by marking array elements as valid or invalid using
+ * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done
+ * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
+ * XID entry itself. This preserves the property that the XID entries are
+ * sorted, so we can do binary searches easily. Periodically we compress
+ * out the unused entries; that's much cheaper than having to compress the
+ * array immediately on every deletion.
+ *
+ * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
+ * are those with indexes tail <= i < head; items outside this subscript range
+ * have unspecified contents. When head reaches the end of the array, we
+ * force compression of unused entries rather than wrapping around, since
+ * allowing wraparound would greatly complicate the search logic. We maintain
+ * an explicit tail pointer so that pruning of old XIDs can be done without
+ * immediately moving the array contents. In most cases only a small fraction
+ * of the array contains valid entries at any instant.
+ *
+ * Although only the startup process can ever change the KnownAssignedXids
+ * data structure, we still need interlocking so that standby backends will
+ * not observe invalid intermediate states. The convention is that backends
+ * must hold shared ProcArrayLock to examine the array. To remove XIDs from
+ * the array, the startup process must hold ProcArrayLock exclusively, for
+ * the usual transactional reasons (compare commit/abort of a transaction
+ * during normal running). Compressing unused entries out of the array
+ * likewise requires exclusive lock. To add XIDs to the array, we just insert
+ * them into slots to the right of the head pointer and then advance the head
+ * pointer. This wouldn't require any lock at all, except that on machines
+ * with weak memory ordering we need to be careful that other processors
+ * see the array element changes before they see the head pointer change.
+ * We handle this by using a spinlock to protect reads and writes of the
+ * head/tail pointers. (We could dispense with the spinlock if we were to
+ * create suitable memory access barrier primitives and use those instead.)
+ * The spinlock must be taken to read or write the head/tail pointers unless
+ * the caller holds ProcArrayLock exclusively.
+ *
+ * Algorithmic analysis:
+ *
+ * If we have a maximum of M slots, with N XIDs currently spread across
+ * S elements then we have N <= S <= M always.
+ *
+ * * Adding a new XID is O(1) and needs little locking (unless compression
+ * must happen)
+ * * Compressing the array is O(S) and requires exclusive lock
+ * * Removing an XID is O(logS) and requires exclusive lock
+ * * Taking a snapshot is O(S) and requires shared lock
+ * * Checking for an XID is O(logS) and requires shared lock
+ *
+ * In comparison, using a hash table for KnownAssignedXids would mean that
+ * taking snapshots would be O(M). If we can maintain S << M then the
+ * sorted array technique will deliver significantly faster snapshots.
+ * If we try to keep S too small then we will spend too much time compressing,
+ * so there is an optimal point for any workload mix. We use a heuristic to
+ * decide when to compress the array, though trimming also helps reduce
+ * frequency of compressing. The heuristic requires us to track the number of
+ * currently valid XIDs in the array (N). Except in special cases, we'll
+ * compress when S >= 2N. Bounding S at 2N in turn bounds the time for
+ * taking a snapshot to be O(N), which it would have to be anyway.
+ */
+
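/*
 * A concrete illustration of the layout described above (values arbitrary):
 * with tail = 1 and head = 6, the interesting region is indexes 1..5.
 *
 *   index:              0     1     2     3     4     5
 *   KnownAssignedXids:  -   1001  1002  1003  1004  1005
 *   ...Valid:           -     t     f     t     f     t
 *
 * Here N = 3 valid XIDs (1001, 1003, 1005) are spread over S = 5 slots.
 * Binary search still works because even the invalidated entries keep their
 * sorted XID values.  Since S < 2N, a KAX_TRANSACTION_END compression would
 * be skipped; once S reaches 2N, compression moves 1001, 1003 and 1005 down
 * to indexes 0..2 and resets tail = 0, head = 3.
 */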
+
+/*
+ * Compress KnownAssignedXids by shifting valid data down to the start of the
+ * array, removing any gaps.
+ *
+ * A compression step is forced if "reason" is KAX_NO_SPACE, otherwise
+ * we do it only if a heuristic indicates it's a good time to do it.
+ *
+ * Compression requires holding ProcArrayLock in exclusive mode.
+ * Caller must pass haveLock = true if it already holds the lock.
+ */
+static void
+KnownAssignedXidsCompress(KAXCompressReason reason, bool haveLock)
+{
+ ProcArrayStruct *pArray = procArray;
+ int head,
+ tail,
+ nelements;
+ int compress_index;
+ int i;
+
+ /* Counters for compression heuristics */
+ static unsigned int transactionEndsCounter;
+ static TimestampTz lastCompressTs;
+
+ /* Tuning constants */
+#define KAX_COMPRESS_FREQUENCY 128 /* in transactions */
+#define KAX_COMPRESS_IDLE_INTERVAL 1000 /* in ms */
+
+ /*
+ * Since only the startup process modifies the head/tail pointers, we
+ * don't need a lock to read them here.
+ */
+ head = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+ nelements = head - tail;
+
+ /*
+ * If we can choose whether to compress, use a heuristic to avoid
+ * compressing too often or not often enough. "Compress" here simply
+ * means moving the values to the beginning of the array, so it is not as
+ * complex or costly as typical data compression algorithms.
+ */
+ if (nelements == pArray->numKnownAssignedXids)
+ {
+ /*
+ * When there are no gaps between head and tail, don't bother to
+ * compress, except in the KAX_NO_SPACE case where we must compress to
+ * create some space after the head.
+ */
+ if (reason != KAX_NO_SPACE)
+ return;
+ }
+ else if (reason == KAX_TRANSACTION_END)
+ {
+ /*
+ * Consider compressing only once every so many commits. Frequency
+ * determined by benchmarks.
+ */
+ if ((transactionEndsCounter++) % KAX_COMPRESS_FREQUENCY != 0)
+ return;
+
+ /*
+ * Furthermore, compress only if the used part of the array is less
+ * than 50% full (see comments above).
+ */
+ if (nelements < 2 * pArray->numKnownAssignedXids)
+ return;
+ }
+ else if (reason == KAX_STARTUP_PROCESS_IDLE)
+ {
+ /*
+ * We're about to go idle for lack of new WAL, so we might as well
+ * compress. But not too often, to avoid ProcArray lock contention
+ * with readers.
+ */
+ if (lastCompressTs != 0)
+ {
+ TimestampTz compress_after;
+
+ compress_after = TimestampTzPlusMilliseconds(lastCompressTs,
+ KAX_COMPRESS_IDLE_INTERVAL);
+ if (GetCurrentTimestamp() < compress_after)
+ return;
+ }
+ }
+
+ /* Need to compress, so get the lock if we don't have it. */
+ if (!haveLock)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * We compress the array by reading the valid values from tail to head,
+ * re-aligning data to 0th element.
+ */
+ compress_index = 0;
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ KnownAssignedXids[compress_index] = KnownAssignedXids[i];
+ KnownAssignedXidsValid[compress_index] = true;
+ compress_index++;
+ }
+ }
+ Assert(compress_index == pArray->numKnownAssignedXids);
+
+ pArray->tailKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = compress_index;
+
+ if (!haveLock)
+ LWLockRelease(ProcArrayLock);
+
+ /* Update timestamp for maintenance. No need to hold lock for this. */
+ lastCompressTs = GetCurrentTimestamp();
+}
+
+/*
+ * Add xids into KnownAssignedXids at the head of the array.
+ *
+ * xids from from_xid to to_xid, inclusive, are added to the array.
+ *
+ * If exclusive_lock is true then caller already holds ProcArrayLock in
+ * exclusive mode, so we need no extra locking here. Else caller holds no
+ * lock, so we need to be sure we maintain sufficient interlocks against
+ * concurrent readers. (Only the startup process ever calls this, so no need
+ * to worry about concurrent writers.)
+ */
+static void
+KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock)
+{
+ ProcArrayStruct *pArray = procArray;
+ TransactionId next_xid;
+ int head,
+ tail;
+ int nxids;
+ int i;
+
+ Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
+
+ /*
+ * Calculate how many array slots we'll need. Normally this is cheap; in
+ * the unusual case where the XIDs cross the wrap point, we do it the hard
+ * way.
+ */
+ if (to_xid >= from_xid)
+ nxids = to_xid - from_xid + 1;
+ else
+ {
+ nxids = 1;
+ next_xid = from_xid;
+ while (TransactionIdPrecedes(next_xid, to_xid))
+ {
+ nxids++;
+ TransactionIdAdvance(next_xid);
+ }
+ }
+
+ /*
+ * Since only the startup process modifies the head/tail pointers, we
+ * don't need a lock to read them here.
+ */
+ head = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+
+ Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
+ Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
+
+ /*
+ * Verify that insertions occur in TransactionId sequence. Note that even
+ * if the last existing element is marked invalid, it must still have a
+ * correctly sequenced XID value.
+ */
+ if (head > tail &&
+ TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
+ {
+ KnownAssignedXidsDisplay(LOG);
+ elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
+ }
+
+ /*
+ * If our xids won't fit in the remaining space, compress out free space
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ {
+ KnownAssignedXidsCompress(KAX_NO_SPACE, exclusive_lock);
+
+ head = pArray->headKnownAssignedXids;
+ /* note: we no longer care about the tail pointer */
+
+ /*
+ * If it still won't fit then we're out of memory
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ elog(ERROR, "too many KnownAssignedXids");
+ }
+
+ /* Now we can insert the xids into the space starting at head */
+ next_xid = from_xid;
+ for (i = 0; i < nxids; i++)
+ {
+ KnownAssignedXids[head] = next_xid;
+ KnownAssignedXidsValid[head] = true;
+ TransactionIdAdvance(next_xid);
+ head++;
+ }
+
+ /* Adjust count of number of valid entries */
+ pArray->numKnownAssignedXids += nxids;
+
+ /*
+ * Now update the head pointer. We use a spinlock to protect this
+ * pointer, not because the update is likely to be non-atomic, but to
+ * ensure that other processors see the above array updates before they
+ * see the head pointer change.
+ *
+ * If we're holding ProcArrayLock exclusively, there's no need to take the
+ * spinlock.
+ */
+ if (exclusive_lock)
+ pArray->headKnownAssignedXids = head;
+ else
+ {
+ SpinLockAcquire(&pArray->known_assigned_xids_lck);
+ pArray->headKnownAssignedXids = head;
+ SpinLockRelease(&pArray->known_assigned_xids_lck);
+ }
+}
+
+/*
+ * KnownAssignedXidsSearch
+ *
+ * Searches KnownAssignedXids for a specific xid and optionally removes it.
+ * Returns true if it was found, false if not.
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ * Exclusive lock must be held for remove = true.
+ */
+static bool
+KnownAssignedXidsSearch(TransactionId xid, bool remove)
+{
+ ProcArrayStruct *pArray = procArray;
+ int first,
+ last;
+ int head;
+ int tail;
+ int result_index = -1;
+
+ if (remove)
+ {
+ /* we hold ProcArrayLock exclusively, so no need for spinlock */
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+ }
+ else
+ {
+ /* take spinlock to ensure we see up-to-date array contents */
+ SpinLockAcquire(&pArray->known_assigned_xids_lck);
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+ SpinLockRelease(&pArray->known_assigned_xids_lck);
+ }
+
+ /*
+ * Standard binary search. Note we can ignore the KnownAssignedXidsValid
+ * array here, since even invalid entries will contain sorted XIDs.
+ */
+ first = tail;
+ last = head - 1;
+ while (first <= last)
+ {
+ int mid_index;
+ TransactionId mid_xid;
+
+ mid_index = (first + last) / 2;
+ mid_xid = KnownAssignedXids[mid_index];
+
+ if (xid == mid_xid)
+ {
+ result_index = mid_index;
+ break;
+ }
+ else if (TransactionIdPrecedes(xid, mid_xid))
+ last = mid_index - 1;
+ else
+ first = mid_index + 1;
+ }
+
+ if (result_index < 0)
+ return false; /* not in array */
+
+ if (!KnownAssignedXidsValid[result_index])
+ return false; /* in array, but invalid */
+
+ if (remove)
+ {
+ KnownAssignedXidsValid[result_index] = false;
+
+ pArray->numKnownAssignedXids--;
+ Assert(pArray->numKnownAssignedXids >= 0);
+
+ /*
+ * If we're removing the tail element then advance tail pointer over
+ * any invalid elements. This will speed future searches.
+ */
+ if (result_index == tail)
+ {
+ tail++;
+ while (tail < head && !KnownAssignedXidsValid[tail])
+ tail++;
+ if (tail >= head)
+ {
+ /* Array is empty, so we can reset both pointers */
+ pArray->headKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ }
+ else
+ {
+ pArray->tailKnownAssignedXids = tail;
+ }
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Is the specified XID present in KnownAssignedXids[]?
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ */
+static bool
+KnownAssignedXidExists(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ return KnownAssignedXidsSearch(xid, false);
+}
+
+/*
+ * Remove the specified XID from KnownAssignedXids[].
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemove(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
+
+ /*
+ * Note: we cannot consider it an error to remove an XID that's not
+ * present. We intentionally remove subxact IDs while processing
+ * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be
+ * removed again when the top-level xact commits or aborts.
+ *
+ * It might be possible to track such XIDs to distinguish this case from
+ * actual errors, but it would be complicated and probably not worth it.
+ * So, just ignore the search result.
+ */
+ (void) KnownAssignedXidsSearch(xid, true);
+}
+
+/*
+ * KnownAssignedXidsRemoveTree
+ * Remove xid (if it's not InvalidTransactionId) and all the subxids.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+ TransactionId *subxids)
+{
+ int i;
+
+ if (TransactionIdIsValid(xid))
+ KnownAssignedXidsRemove(xid);
+
+ for (i = 0; i < nsubxids; i++)
+ KnownAssignedXidsRemove(subxids[i]);
+
+ /* Opportunistically compress the array */
+ KnownAssignedXidsCompress(KAX_TRANSACTION_END, true);
+}
+
+/*
+ * Prune KnownAssignedXids up to, but *not* including, xid. If xid is invalid
+ * then clear the whole table.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemovePreceding(TransactionId removeXid)
+{
+ ProcArrayStruct *pArray = procArray;
+ int count = 0;
+ int head,
+ tail,
+ i;
+
+ if (!TransactionIdIsValid(removeXid))
+ {
+ elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
+ pArray->numKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0;
+ return;
+ }
+
+ elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
+
+ /*
+ * Mark entries invalid starting at the tail. Since array is sorted, we
+ * can stop as soon as we reach an entry >= removeXid.
+ */
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ TransactionId knownXid = KnownAssignedXids[i];
+
+ if (TransactionIdFollowsOrEquals(knownXid, removeXid))
+ break;
+
+ if (!StandbyTransactionIdIsPrepared(knownXid))
+ {
+ KnownAssignedXidsValid[i] = false;
+ count++;
+ }
+ }
+ }
+
+ pArray->numKnownAssignedXids -= count;
+ Assert(pArray->numKnownAssignedXids >= 0);
+
+ /*
+ * Advance the tail pointer if we've marked the tail item invalid.
+ */
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ break;
+ }
+ if (i >= head)
+ {
+ /* Array is empty, so we can reset both pointers */
+ pArray->headKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ }
+ else
+ {
+ pArray->tailKnownAssignedXids = i;
+ }
+
+ /* Opportunistically compress the array */
+ KnownAssignedXidsCompress(KAX_PRUNE, true);
+}
+
+/*
+ * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
+ * We filter out anything >= xmax.
+ *
+ * Returns the number of XIDs stored into xarray[]. The caller is
+ * responsible for ensuring that the array is large enough.
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
+{
+ TransactionId xtmp = InvalidTransactionId;
+
+ return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
+}
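+
+/*
+ * Illustrative usage sketch (caller code assumed, not part of this file):
+ * with ProcArrayLock already held in shared mode, a caller sizes the output
+ * array from maxKnownAssignedXids and then collects the xids:
+ *
+ *     TransactionId *xids;
+ *     int           nxids;
+ *
+ *     xids = (TransactionId *)
+ *         palloc(procArray->maxKnownAssignedXids * sizeof(TransactionId));
+ *     nxids = KnownAssignedXidsGet(xids, xmax);
+ *
+ * Every one of the nxids entries returned precedes xmax (when xmax is
+ * valid) and is still considered running on the standby.
+ */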
+
+/*
+ * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus
+ * we reduce *xmin to the lowest xid value seen if not already lower.
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
+ TransactionId xmax)
+{
+ int count = 0;
+ int head,
+ tail;
+ int i;
+
+ /*
+ * Fetch head just once, since it may change while we loop. We can stop
+ * once we reach the initially seen head, since we are certain that an xid
+ * cannot enter and then leave the array while we hold ProcArrayLock. We
+ * might miss newly-added xids, but they should be >= xmax so irrelevant
+ * anyway.
+ *
+ * Must take spinlock to ensure we see up-to-date array contents.
+ */
+ SpinLockAcquire(&procArray->known_assigned_xids_lck);
+ tail = procArray->tailKnownAssignedXids;
+ head = procArray->headKnownAssignedXids;
+ SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+ for (i = tail; i < head; i++)
+ {
+ /* Skip any gaps in the array */
+ if (KnownAssignedXidsValid[i])
+ {
+ TransactionId knownXid = KnownAssignedXids[i];
+
+ /*
+ * Update xmin if required. Only the first XID need be checked,
+ * since the array is sorted.
+ */
+ if (count == 0 &&
+ TransactionIdPrecedes(knownXid, *xmin))
+ *xmin = knownXid;
+
+ /*
+ * Filter out anything >= xmax, again relying on sorted property
+ * of array.
+ */
+ if (TransactionIdIsValid(xmax) &&
+ TransactionIdFollowsOrEquals(knownXid, xmax))
+ break;
+
+ /* Add knownXid into output array */
+ xarray[count++] = knownXid;
+ }
+ }
+
+ return count;
+}
+
+/*
+ * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId
+ * if nothing there.
+ */
+static TransactionId
+KnownAssignedXidsGetOldestXmin(void)
+{
+ int head,
+ tail;
+ int i;
+
+ /*
+ * Fetch head just once, since it may change while we loop.
+ */
+ SpinLockAcquire(&procArray->known_assigned_xids_lck);
+ tail = procArray->tailKnownAssignedXids;
+ head = procArray->headKnownAssignedXids;
+ SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+ for (i = tail; i < head; i++)
+ {
+ /* Skip any gaps in the array */
+ if (KnownAssignedXidsValid[i])
+ return KnownAssignedXids[i];
+ }
+
+ return InvalidTransactionId;
+}
+
+/*
+ * Display KnownAssignedXids to provide a debug trail
+ *
+ * Currently this is only called within the startup process, so we need no
+ * special locking.
+ *
+ * Note this is pretty expensive, and much of the expense will be incurred
+ * even if the elog message ends up being discarded. It's not currently
+ * called in any performance-critical places, however, so there's no need to
+ * worry about its cost.
+ */
+static void
+KnownAssignedXidsDisplay(int trace_level)
+{
+ ProcArrayStruct *pArray = procArray;
+ StringInfoData buf;
+ int head,
+ tail,
+ i;
+ int nxids = 0;
+
+ tail = pArray->tailKnownAssignedXids;
+ head = pArray->headKnownAssignedXids;
+
+ initStringInfo(&buf);
+
+ for (i = tail; i < head; i++)
+ {
+ if (KnownAssignedXidsValid[i])
+ {
+ nxids++;
+ appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]);
+ }
+ }
+
+ elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
+ nxids,
+ pArray->numKnownAssignedXids,
+ pArray->tailKnownAssignedXids,
+ pArray->headKnownAssignedXids,
+ buf.data);
+
+ pfree(buf.data);
+}
+
+/*
+ * KnownAssignedXidsReset
+ * Resets KnownAssignedXids to be empty
+ */
+static void
+KnownAssignedXidsReset(void)
+{
+ ProcArrayStruct *pArray = procArray;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ pArray->numKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = 0;
+
+ LWLockRelease(ProcArrayLock);
+}
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
new file mode 100644
index 0000000..c85cb5c
--- /dev/null
+++ b/src/backend/storage/ipc/procsignal.c
@@ -0,0 +1,688 @@
+/*-------------------------------------------------------------------------
+ *
+ * procsignal.c
+ * Routines for interprocess signaling
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/parallel.h"
+#include "port/pg_bitutils.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/logicalworker.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "storage/smgr.h"
+#include "storage/sinval.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+
+/*
+ * The SIGUSR1 signal is multiplexed to support signaling multiple event
+ * types. The specific reason is communicated via flags in shared memory.
+ * We keep a boolean flag for each possible "reason", so that different
+ * reasons can be signaled to a process concurrently. (However, if the same
+ * reason is signaled more than once nearly simultaneously, the process may
+ * observe it only once.)
+ *
+ * Each process that wants to receive signals registers its process ID
+ * in the ProcSignalSlots array. The array is indexed by backend ID to make
+ * slot allocation simple, and to avoid having to search the array when you
+ * know the backend ID of the process you're signaling. (We do support
+ * signaling without backend ID, but it's a bit less efficient.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * pss_signalFlags are intended to be set in cases where we don't need to
+ * keep track of whether or not the target process has handled the signal,
+ * but sometimes we need confirmation, as when making a global state change
+ * that cannot be considered complete until all backends have taken notice
+ * of it. For such use cases, we set a bit in pss_barrierCheckMask and then
+ * increment the current "barrier generation"; when the new barrier generation
+ * (or greater) appears in the pss_barrierGeneration flag of every process,
+ * we know that the message has been received everywhere.
+ */
+typedef struct
+{
+ volatile pid_t pss_pid;
+ volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS];
+ pg_atomic_uint64 pss_barrierGeneration;
+ pg_atomic_uint32 pss_barrierCheckMask;
+ ConditionVariable pss_barrierCV;
+} ProcSignalSlot;
+
+/*
+ * Information that is global to the entire ProcSignal system can be stored
+ * here.
+ *
+ * psh_barrierGeneration is the highest barrier generation in existence.
+ */
+typedef struct
+{
+ pg_atomic_uint64 psh_barrierGeneration;
+ ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER];
+} ProcSignalHeader;
+
+/*
+ * We reserve a slot for each possible BackendId, plus one for each
+ * possible auxiliary process type. (This scheme assumes there is not
+ * more than one of any auxiliary process type at a time.)
+ */
+#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES)
+
+/* Check whether the relevant type bit is set in the flags. */
+#define BARRIER_SHOULD_CHECK(flags, type) \
+ (((flags) & (((uint32) 1) << (uint32) (type))) != 0)
+
+/* Clear the relevant type bit from the flags. */
+#define BARRIER_CLEAR_BIT(flags, type) \
+ ((flags) &= ~(((uint32) 1) << (uint32) (type)))
+
+static ProcSignalHeader *ProcSignal = NULL;
+static ProcSignalSlot *MyProcSignalSlot = NULL;
+
+static bool CheckProcSignal(ProcSignalReason reason);
+static void CleanupProcSignalState(int status, Datum arg);
+static void ResetProcSignalBarrierBits(uint32 flags);
+
+/*
+ * ProcSignalShmemSize
+ * Compute space needed for ProcSignal's shared memory
+ */
+Size
+ProcSignalShmemSize(void)
+{
+ Size size;
+
+ size = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot));
+ size = add_size(size, offsetof(ProcSignalHeader, psh_slot));
+ return size;
+}
+
+/*
+ * ProcSignalShmemInit
+ * Allocate and initialize ProcSignal's shared memory
+ */
+void
+ProcSignalShmemInit(void)
+{
+ Size size = ProcSignalShmemSize();
+ bool found;
+
+ ProcSignal = (ProcSignalHeader *)
+ ShmemInitStruct("ProcSignal", size, &found);
+
+ /* If we're first, initialize. */
+ if (!found)
+ {
+ int i;
+
+ pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0);
+
+ for (i = 0; i < NumProcSignalSlots; ++i)
+ {
+ ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+
+ slot->pss_pid = 0;
+ MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags));
+ pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+ pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0);
+ ConditionVariableInit(&slot->pss_barrierCV);
+ }
+ }
+}
+
+/*
+ * ProcSignalInit
+ * Register the current process in the ProcSignal array
+ *
+ * The passed index should be my BackendId if the process has one,
+ * or MaxBackends + aux process type if not.
+ */
+void
+ProcSignalInit(int pss_idx)
+{
+ ProcSignalSlot *slot;
+ uint64 barrier_generation;
+
+ Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots);
+
+ slot = &ProcSignal->psh_slot[pss_idx - 1];
+
+ /* sanity check */
+ if (slot->pss_pid != 0)
+ elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty",
+ MyProcPid, pss_idx);
+
+ /* Clear out any leftover signal reasons */
+ MemSet(slot->pss_signalFlags, 0, NUM_PROCSIGNALS * sizeof(sig_atomic_t));
+
+ /*
+ * Initialize barrier state. Since we're a brand-new process, there
+ * shouldn't be any leftover backend-private state that needs to be
+ * updated. Therefore, we can broadcast the latest barrier generation and
+ * disregard any previously-set check bits.
+ *
+ * NB: This only works if this initialization happens early enough in the
+ * startup sequence that we haven't yet cached any state that might need
+ * to be invalidated. That's also why we have a memory barrier here, to be
+ * sure that any later reads of memory happen strictly after this.
+ */
+ pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0);
+ barrier_generation =
+ pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+ pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation);
+ pg_memory_barrier();
+
+ /* Mark slot with my PID */
+ slot->pss_pid = MyProcPid;
+
+ /* Remember slot location for CheckProcSignal */
+ MyProcSignalSlot = slot;
+
+ /* Set up to release the slot on process exit */
+ on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx));
+}
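+
+/*
+ * For illustration only (call sites assumed, not part of this file): a
+ * regular backend registers itself with
+ *
+ *     ProcSignalInit(MyBackendId);
+ *
+ * while an auxiliary process, which has no BackendId, would use an index
+ * beyond MaxBackends, e.g. ProcSignalInit(MaxBackends + MyAuxProcType + 1).
+ */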
+
+/*
+ * CleanupProcSignalState
+ * Remove current process from ProcSignal mechanism
+ *
+ * This function is called via on_shmem_exit() during backend shutdown.
+ */
+static void
+CleanupProcSignalState(int status, Datum arg)
+{
+ int pss_idx = DatumGetInt32(arg);
+ ProcSignalSlot *slot;
+
+ slot = &ProcSignal->psh_slot[pss_idx - 1];
+ Assert(slot == MyProcSignalSlot);
+
+ /*
+ * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point
+ * won't try to access it after it's no longer ours (and perhaps even
+ * after we've unmapped the shared memory segment).
+ */
+ MyProcSignalSlot = NULL;
+
+ /* sanity check */
+ if (slot->pss_pid != MyProcPid)
+ {
+ /*
+ * Don't ERROR here. We're exiting anyway, and don't want to get into an
+ * infinite loop trying to exit.
+ */
+ elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d",
+ MyProcPid, pss_idx, (int) slot->pss_pid);
+ return; /* XXX better to zero the slot anyway? */
+ }
+
+ /*
+ * Make this slot look like it's absorbed all possible barriers, so that
+ * no barrier waits block on it.
+ */
+ pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+ ConditionVariableBroadcast(&slot->pss_barrierCV);
+
+ slot->pss_pid = 0;
+}
+
+/*
+ * SendProcSignal
+ * Send a signal to a Postgres process
+ *
+ * Providing backendId is optional, but it will speed up the operation.
+ *
+ * On success (a signal was sent), zero is returned.
+ * On error, -1 is returned, and errno is set (typically to ESRCH or EPERM).
+ *
+ * Not to be confused with ProcSendSignal
+ */
+int
+SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId)
+{
+ volatile ProcSignalSlot *slot;
+
+ if (backendId != InvalidBackendId)
+ {
+ slot = &ProcSignal->psh_slot[backendId - 1];
+
+ /*
+ * Note: Since there's no locking, it's possible that the target
+ * process detaches from shared memory and exits right after this
+ * test, before we set the flag and send the signal. And the signal
+ * slot might even be recycled by a new process, so it's remotely
+ * possible that we set a flag for the wrong process. That's OK; all
+ * the signals are such that no harm is done if they're mistakenly
+ * fired.
+ */
+ if (slot->pss_pid == pid)
+ {
+ /* Atomically set the proper flag */
+ slot->pss_signalFlags[reason] = true;
+ /* Send signal */
+ return kill(pid, SIGUSR1);
+ }
+ }
+ else
+ {
+ /*
+ * BackendId not provided, so search the array using pid. We search
+ * the array back to front so as to reduce search overhead. Passing
+ * InvalidBackendId means that the target is most likely an auxiliary
+ * process, which will have a slot near the end of the array.
+ */
+ int i;
+
+ for (i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ slot = &ProcSignal->psh_slot[i];
+
+ if (slot->pss_pid == pid)
+ {
+ /* the above note about race conditions applies here too */
+
+ /* Atomically set the proper flag */
+ slot->pss_signalFlags[reason] = true;
+ /* Send signal */
+ return kill(pid, SIGUSR1);
+ }
+ }
+ }
+
+ errno = ESRCH;
+ return -1;
+}
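+
+/*
+ * Illustrative sketch (assumed caller, not part of this file): to ask the
+ * backend with process ID "pid" to dump its memory contexts, one could do
+ *
+ *     if (SendProcSignal(pid, PROCSIG_LOG_MEMORY_CONTEXT, backendId) < 0)
+ *         ereport(WARNING,
+ *                 (errmsg("could not send signal to process %d: %m",
+ *                         (int) pid)));
+ *
+ * passing InvalidBackendId when the backend ID is not known.
+ */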
+
+/*
+ * EmitProcSignalBarrier
+ * Send a signal to every Postgres process
+ *
+ * The return value of this function is the barrier "generation" created
+ * by this operation. This value can be passed to WaitForProcSignalBarrier
+ * to wait until it is known that every participant in the ProcSignal
+ * mechanism has absorbed the signal (or started afterwards).
+ *
+ * Note that it would be a bad idea to use this for anything that happens
+ * frequently, as interrupting every backend could cause a noticeable
+ * performance hit.
+ *
+ * Callers are entitled to assume that this function will not throw ERROR
+ * or FATAL.
+ */
+uint64
+EmitProcSignalBarrier(ProcSignalBarrierType type)
+{
+ uint32 flagbit = 1 << (uint32) type;
+ uint64 generation;
+
+ /*
+ * Set all the flags.
+ *
+ * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is
+ * totally ordered with respect to anything the caller did before, and
+ * anything that we do afterwards. (This is also true of the later call to
+ * pg_atomic_add_fetch_u64.)
+ */
+ for (int i = 0; i < NumProcSignalSlots; i++)
+ {
+ volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+
+ pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit);
+ }
+
+ /*
+ * Increment the generation counter.
+ */
+ generation =
+ pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1);
+
+ /*
+ * Signal all the processes, so that they update their advertised barrier
+ * generation.
+ *
+ * Concurrency is not a problem here. Backends that have exited don't
+ * matter, and new backends that have joined since we entered this
+ * function must already have current state, since the caller is
+ * responsible for making sure that the relevant state is entirely visible
+ * before calling this function in the first place. We still have to wake
+ * them up - because we can't distinguish between such backends and older
+ * backends that need to update state - but they won't actually need to
+ * change any state.
+ */
+ for (int i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+ pid_t pid = slot->pss_pid;
+
+ if (pid != 0)
+ {
+ /* see SendProcSignal for details */
+ slot->pss_signalFlags[PROCSIG_BARRIER] = true;
+ kill(pid, SIGUSR1);
+ }
+ }
+
+ return generation;
+}
+
+/*
+ * WaitForProcSignalBarrier - wait until it is guaranteed that all changes
+ * requested by a specific call to EmitProcSignalBarrier() have taken effect.
+ */
+void
+WaitForProcSignalBarrier(uint64 generation)
+{
+ Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));
+
+ elog(DEBUG1,
+ "waiting for all backends to process ProcSignalBarrier generation "
+ UINT64_FORMAT,
+ generation);
+
+ for (int i = NumProcSignalSlots - 1; i >= 0; i--)
+ {
+ ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+ uint64 oldval;
+
+ /*
+ * It's important that we check only pss_barrierGeneration here and
+ * not pss_barrierCheckMask. Bits in pss_barrierCheckMask get cleared
+ * before the barrier is actually absorbed, but pss_barrierGeneration
+ * is updated only afterward.
+ */
+ oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+ while (oldval < generation)
+ {
+ if (ConditionVariableTimedSleep(&slot->pss_barrierCV,
+ 5000,
+ WAIT_EVENT_PROC_SIGNAL_BARRIER))
+ ereport(LOG,
+ (errmsg("still waiting for backend with PID %d to accept ProcSignalBarrier",
+ (int) slot->pss_pid)));
+ oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration);
+ }
+ ConditionVariableCancelSleep();
+ }
+
+ elog(DEBUG1,
+ "finished waiting for all backends to process ProcSignalBarrier generation "
+ UINT64_FORMAT,
+ generation);
+
+ /*
+ * The caller is probably calling this function because it wants to read
+ * the shared state or perform further writes to shared state once all
+ * backends are known to have absorbed the barrier. However, the read of
+ * pss_barrierGeneration was performed unlocked; insert a memory barrier
+ * to separate it from whatever follows.
+ */
+ pg_memory_barrier();
+}
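+
+/*
+ * A minimal usage sketch of the barrier machinery (assumed caller, not part
+ * of this file): after making a global state change visible in shared
+ * memory, a caller would typically do
+ *
+ *     uint64 gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_SMGRRELEASE);
+ *     WaitForProcSignalBarrier(gen);
+ *
+ * and may then assume that every live backend has absorbed the barrier.
+ */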
+
+/*
+ * Handle receipt of an interrupt indicating a global barrier event.
+ *
+ * All the actual work is deferred to ProcessProcSignalBarrier(), because we
+ * cannot safely access the barrier generation inside the signal handler as
+ * 64bit atomics might use spinlock based emulation, even for reads. As this
+ * routine only gets called when PROCSIG_BARRIER is sent that won't cause a
+ * lot of unnecessary work.
+ */
+static void
+HandleProcSignalBarrierInterrupt(void)
+{
+ InterruptPending = true;
+ ProcSignalBarrierPending = true;
+ /* latch will be set by procsignal_sigusr1_handler */
+}
+
+/*
+ * Perform global barrier related interrupt checking.
+ *
+ * Any backend that participates in ProcSignal signaling must arrange to
+ * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(),
+ * which is enough for normal backends, but not necessarily for all types of
+ * background processes.
+ */
+void
+ProcessProcSignalBarrier(void)
+{
+ uint64 local_gen;
+ uint64 shared_gen;
+ volatile uint32 flags;
+
+ Assert(MyProcSignalSlot);
+
+ /* Exit quickly if there's no work to do. */
+ if (!ProcSignalBarrierPending)
+ return;
+ ProcSignalBarrierPending = false;
+
+ /*
+ * It's not unlikely to process multiple barriers at once, before the
+ * signals for all the barriers have arrived. To avoid unnecessary work in
+ * response to subsequent signals, exit early if we already have processed
+ * all of them.
+ */
+ local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration);
+ shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+
+ Assert(local_gen <= shared_gen);
+
+ if (local_gen == shared_gen)
+ return;
+
+ /*
+ * Get and clear the flags that are set for this backend. Note that
+ * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the
+ * read of the barrier generation above happens before we atomically
+ * extract the flags, and that any subsequent state changes happen
+ * afterward.
+ *
+ * NB: In order to avoid race conditions, we must zero
+ * pss_barrierCheckMask first and only afterwards try to do barrier
+ * processing. If we did it in the other order, someone could send us
+ * another barrier of some type right after we called the
+ * barrier-processing function but before we cleared the bit. We would
+ * have no way of knowing that the bit needs to stay set in that case, so
+ * the need to call the barrier-processing function again would just get
+ * forgotten. So instead, we tentatively clear all the bits and then put
+ * back any for which we don't manage to successfully absorb the barrier.
+ */
+ flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0);
+
+ /*
+ * If there are no flags set, then we can skip doing any real work.
+ * Otherwise, establish a PG_TRY block, so that we don't lose track of
+ * which types of barrier processing are needed if an ERROR occurs.
+ */
+ if (flags != 0)
+ {
+ bool success = true;
+
+ PG_TRY();
+ {
+ /*
+ * Process each type of barrier. The barrier-processing functions
+ * should normally return true, but may return false if the
+ * barrier can't be absorbed at the current time. This should be
+ * rare, because it's pretty expensive. Every single
+ * CHECK_FOR_INTERRUPTS() will return here until we manage to
+ * absorb the barrier, and that cost will add up in a hurry.
+ *
+ * NB: It ought to be OK to call the barrier-processing functions
+ * unconditionally, but it's more efficient to call only the ones
+ * that might need us to do something based on the flags.
+ */
+ while (flags != 0)
+ {
+ ProcSignalBarrierType type;
+ bool processed = true;
+
+ type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags);
+ switch (type)
+ {
+ case PROCSIGNAL_BARRIER_SMGRRELEASE:
+ processed = ProcessBarrierSmgrRelease();
+ break;
+ }
+
+ /*
+ * To avoid an infinite loop, we must always unset the bit in
+ * flags.
+ */
+ BARRIER_CLEAR_BIT(flags, type);
+
+ /*
+ * If we failed to process the barrier, reset the shared bit
+ * so we try again later, and set a flag so that we don't bump
+ * our generation.
+ */
+ if (!processed)
+ {
+ ResetProcSignalBarrierBits(((uint32) 1) << type);
+ success = false;
+ }
+ }
+ }
+ PG_CATCH();
+ {
+ /*
+ * If an ERROR occurred, we'll need to try again later to handle
+ * that barrier type and any others that haven't been handled yet
+ * or weren't successfully absorbed.
+ */
+ ResetProcSignalBarrierBits(flags);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * If some barrier types were not successfully absorbed, we will have
+ * to try again later.
+ */
+ if (!success)
+ return;
+ }
+
+ /*
+ * State changes related to all types of barriers that might have been
+ * emitted have now been handled, so we can update our notion of the
+ * generation to the one we observed before beginning the updates. If
+ * things have changed further, it'll get fixed up when this function is
+ * next called.
+ */
+ pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen);
+ ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV);
+}
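+
+/*
+ * Illustrative sketch (assumed caller, not part of this file): a background
+ * process whose main loop does not reach CHECK_FOR_INTERRUPTS() can absorb
+ * barriers itself with
+ *
+ *     if (ProcSignalBarrierPending)
+ *         ProcessProcSignalBarrier();
+ */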
+
+/*
+ * If it turns out that we couldn't absorb one or more barrier types, either
+ * because the barrier-processing functions returned false or due to an error,
+ * arrange for processing to be retried later.
+ */
+static void
+ResetProcSignalBarrierBits(uint32 flags)
+{
+ pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags);
+ ProcSignalBarrierPending = true;
+ InterruptPending = true;
+}
+
+/*
+ * CheckProcSignal - check to see if a particular reason has been
+ * signaled, and clear the signal flag. Should be called after receiving
+ * SIGUSR1.
+ */
+static bool
+CheckProcSignal(ProcSignalReason reason)
+{
+ volatile ProcSignalSlot *slot = MyProcSignalSlot;
+
+ if (slot != NULL)
+ {
+ /* Careful here --- don't clear flag if we haven't seen it set */
+ if (slot->pss_signalFlags[reason])
+ {
+ slot->pss_signalFlags[reason] = false;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/*
+ * procsignal_sigusr1_handler - handle SIGUSR1 signal.
+ */
+void
+procsignal_sigusr1_handler(SIGNAL_ARGS)
+{
+ int save_errno = errno;
+
+ if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT))
+ HandleCatchupInterrupt();
+
+ if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT))
+ HandleNotifyInterrupt();
+
+ if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE))
+ HandleParallelMessageInterrupt();
+
+ if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING))
+ HandleWalSndInitStopping();
+
+ if (CheckProcSignal(PROCSIG_BARRIER))
+ HandleProcSignalBarrierInterrupt();
+
+ if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT))
+ HandleLogMemoryContextInterrupt();
+
+ if (CheckProcSignal(PROCSIG_PARALLEL_APPLY_MESSAGE))
+ HandleParallelApplyMessageInterrupt();
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+
+ if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN))
+ RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+
+ SetLatch(MyLatch);
+
+ errno = save_errno;
+}
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
new file mode 100644
index 0000000..d134a09
--- /dev/null
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -0,0 +1,1329 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_mq.c
+ * single-reader, single-writer shared memory message queue
+ *
+ * Both the sender and the receiver must have a PGPROC; their respective
+ * process latches are used for synchronization. Only the sender may send,
+ * and only the receiver may receive. This is intended to allow a user
+ * backend to communicate with worker backends that it has registered.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_mq.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/pg_bitutils.h"
+#include "postmaster/bgworker.h"
+#include "storage/procsignal.h"
+#include "storage/shm_mq.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/*
+ * This structure represents the actual queue, stored in shared memory.
+ *
+ * Some notes on synchronization:
+ *
+ * mq_receiver and mq_bytes_read can only be changed by the receiver; and
+ * mq_sender and mq_bytes_written can only be changed by the sender.
+ * mq_receiver and mq_sender are protected by mq_mutex, although, importantly,
+ * they cannot change once set, and thus may be read without a lock once this
+ * is known to be the case.
+ *
+ * mq_bytes_read and mq_bytes_written are not protected by the mutex. Instead,
+ * they are written atomically using 8 byte loads and stores. Memory barriers
+ * must be carefully used to synchronize reads and writes of these values with
+ * reads and writes of the actual data in mq_ring.
+ *
+ * mq_detached needs no locking. It can be set by either the sender or the
+ * receiver, but only ever from false to true, so redundant writes don't
+ * matter. It is important that if we set mq_detached and then set the
+ * counterparty's latch, the counterparty must be certain to see the change
+ * after waking up. Since SetLatch begins with a memory barrier and ResetLatch
+ * ends with one, this should be OK.
+ *
+ * mq_ring_size and mq_ring_offset never change after initialization, and
+ * can therefore be read without the lock.
+ *
+ * Importantly, mq_ring can be safely read and written without a lock.
+ * At any given time, the difference between mq_bytes_read and
+ * mq_bytes_written defines the number of bytes within mq_ring that contain
+ * unread data, and mq_bytes_read defines the position where those bytes
+ * begin. The sender can increase the number of unread bytes at any time,
+ * but only the receiver can give license to overwrite those bytes, by
+ * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read
+ * the unread bytes it knows to be present without the lock. Conversely,
+ * the sender can write to the unused portion of the ring buffer without
+ * the lock, because nobody else can be reading or writing those bytes. The
+ * receiver could be making more bytes unused by incrementing mq_bytes_read,
+ * but that's OK. Note that it would be unsafe for the receiver to read any
+ * data it's already marked as read, or to write any data; and it would be
+ * unsafe for the sender to reread any data after incrementing
+ * mq_bytes_written, but fortunately there's no need for any of that.
+ */
+struct shm_mq
+{
+ slock_t mq_mutex;
+ PGPROC *mq_receiver;
+ PGPROC *mq_sender;
+ pg_atomic_uint64 mq_bytes_read;
+ pg_atomic_uint64 mq_bytes_written;
+ Size mq_ring_size;
+ bool mq_detached;
+ uint8 mq_ring_offset;
+ char mq_ring[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/*
+ * This structure is a backend-private handle for access to a queue.
+ *
+ * mqh_queue is a pointer to the queue we've attached, and mqh_segment is
+ * an optional pointer to the dynamic shared memory segment that contains it.
+ * (If mqh_segment is provided, we register an on_dsm_detach callback to
+ * make sure we detach from the queue before detaching from DSM.)
+ *
+ * If this queue is intended to connect the current process with a background
+ * worker that started it, the user can pass a pointer to the worker handle
+ * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this
+ * is to allow us to begin sending to or receiving from that queue before the
+ * process we'll be communicating with has even been started. If it fails
+ * to start, the handle will allow us to notice that and fail cleanly, rather
+ * than waiting forever; see shm_mq_wait_internal. This is mostly useful in
+ * simple cases - e.g. where there are just 2 processes communicating; in
+ * more complex scenarios, every process may not have a BackgroundWorkerHandle
+ * available, or may need to watch for the failure of more than one other
+ * process at a time.
+ *
+ * When a message exists as a contiguous chunk of bytes in the queue - that is,
+ * it is smaller than the size of the ring buffer and does not wrap around
+ * the end - we return the message to the caller as a pointer into the buffer.
+ * For messages that are larger or happen to wrap, we reassemble the message
+ * locally by copying the chunks into a backend-local buffer. mqh_buffer is
+ * the buffer, and mqh_buflen is the number of bytes allocated for it.
+ *
+ * mqh_send_pending is the number of bytes that have been written to the
+ * queue but not yet reflected in shared memory. We do not update the shared
+ * counter until the pending data reaches 1/4th of the ring size or the
+ * queue is full; this prevents frequent CPU cache misses and avoids
+ * frequent SetLatch() calls, which are quite expensive.
+ *
+ * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete
+ * are used to track the state of non-blocking operations. When the caller
+ * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they
+ * are expected to retry the call at a later time with the same argument;
+ * we need to retain enough state to pick up where we left off.
+ * mqh_length_word_complete tracks whether we are done sending or receiving
+ * (whichever we're doing) the entire length word. mqh_partial_bytes tracks
+ * the number of bytes read or written for either the length word or the
+ * message itself, and mqh_expected_bytes - which is used only for reads -
+ * tracks the expected total size of the payload.
+ *
+ * mqh_counterparty_attached tracks whether we know the counterparty to have
+ * attached to the queue at some previous point. This lets us avoid some
+ * mutex acquisitions.
+ *
+ * mqh_context is the memory context in effect at the time we attached to
+ * the shm_mq. The shm_mq_handle itself is allocated in this context, and
+ * we make sure any other allocations we do happen in this context as well,
+ * to avoid nasty surprises.
+ */
+struct shm_mq_handle
+{
+ shm_mq *mqh_queue;
+ dsm_segment *mqh_segment;
+ BackgroundWorkerHandle *mqh_handle;
+ char *mqh_buffer;
+ Size mqh_buflen;
+ Size mqh_consume_pending;
+ Size mqh_send_pending;
+ Size mqh_partial_bytes;
+ Size mqh_expected_bytes;
+ bool mqh_length_word_complete;
+ bool mqh_counterparty_attached;
+ MemoryContext mqh_context;
+};
+
+static void shm_mq_detach_internal(shm_mq *mq);
+static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes,
+ const void *data, bool nowait, Size *bytes_written);
+static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh,
+ Size bytes_needed, bool nowait, Size *nbytesp,
+ void **datap);
+static bool shm_mq_counterparty_gone(shm_mq *mq,
+ BackgroundWorkerHandle *handle);
+static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr,
+ BackgroundWorkerHandle *handle);
+static void shm_mq_inc_bytes_read(shm_mq *mq, Size n);
+static void shm_mq_inc_bytes_written(shm_mq *mq, Size n);
+static void shm_mq_detach_callback(dsm_segment *seg, Datum arg);
+
+/* Minimum queue size is enough for header and at least one chunk of data. */
+const Size shm_mq_minimum_size =
+MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF;
+
+#define MQH_INITIAL_BUFSIZE 8192
+
+/*
+ * Initialize a new shared message queue.
+ */
+shm_mq *
+shm_mq_create(void *address, Size size)
+{
+ shm_mq *mq = address;
+ Size data_offset = MAXALIGN(offsetof(shm_mq, mq_ring));
+
+ /* If the size isn't MAXALIGN'd, just discard the odd bytes. */
+ size = MAXALIGN_DOWN(size);
+
+ /* Queue size must be large enough to hold some data. */
+ Assert(size > data_offset);
+
+ /* Initialize queue header. */
+ SpinLockInit(&mq->mq_mutex);
+ mq->mq_receiver = NULL;
+ mq->mq_sender = NULL;
+ pg_atomic_init_u64(&mq->mq_bytes_read, 0);
+ pg_atomic_init_u64(&mq->mq_bytes_written, 0);
+ mq->mq_ring_size = size - data_offset;
+ mq->mq_detached = false;
+ mq->mq_ring_offset = data_offset - offsetof(shm_mq, mq_ring);
+
+ return mq;
+}
+
+/*
+ * Set the identity of the process that will receive from a shared message
+ * queue.
+ */
+void
+shm_mq_set_receiver(shm_mq *mq, PGPROC *proc)
+{
+ PGPROC *sender;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ Assert(mq->mq_receiver == NULL);
+ mq->mq_receiver = proc;
+ sender = mq->mq_sender;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (sender != NULL)
+ SetLatch(&sender->procLatch);
+}
+
+/*
+ * Set the identity of the process that will send to a shared message queue.
+ */
+void
+shm_mq_set_sender(shm_mq *mq, PGPROC *proc)
+{
+ PGPROC *receiver;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ Assert(mq->mq_sender == NULL);
+ mq->mq_sender = proc;
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (receiver != NULL)
+ SetLatch(&receiver->procLatch);
+}
+
+/*
+ * Get the configured receiver.
+ */
+PGPROC *
+shm_mq_get_receiver(shm_mq *mq)
+{
+ PGPROC *receiver;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+
+ return receiver;
+}
+
+/*
+ * Get the configured sender.
+ */
+PGPROC *
+shm_mq_get_sender(shm_mq *mq)
+{
+ PGPROC *sender;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ sender = mq->mq_sender;
+ SpinLockRelease(&mq->mq_mutex);
+
+ return sender;
+}
+
+/*
+ * Attach to a shared message queue so we can send or receive messages.
+ *
+ * The memory context in effect at the time this function is called should
+ * be one which will last for at least as long as the message queue itself.
+ * We'll allocate the handle in that context, and future allocations that
+ * are needed to buffer incoming data will happen in that context as well.
+ *
+ * If seg != NULL, the queue will be automatically detached when that dynamic
+ * shared memory segment is detached.
+ *
+ * If handle != NULL, the queue can be read or written even before the
+ * other process has attached. We'll wait for it to do so if needed. The
+ * handle must be for a background worker initialized with bgw_notify_pid
+ * equal to our PID.
+ *
+ * shm_mq_detach() should be called when done. This will free the
+ * shm_mq_handle and mark the queue itself as detached, so that our
+ * counterpart won't get stuck waiting for us to fill or drain the queue
+ * after we've already lost interest.
+ */
+shm_mq_handle *
+shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
+{
+ shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle));
+
+ Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc);
+ mqh->mqh_queue = mq;
+ mqh->mqh_segment = seg;
+ mqh->mqh_handle = handle;
+ mqh->mqh_buffer = NULL;
+ mqh->mqh_buflen = 0;
+ mqh->mqh_consume_pending = 0;
+ mqh->mqh_send_pending = 0;
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_expected_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_counterparty_attached = false;
+ mqh->mqh_context = CurrentMemoryContext;
+
+ if (seg != NULL)
+ on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq));
+
+ return mqh;
+}
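+
+/*
+ * A minimal end-to-end sketch (caller code assumed, not part of this file):
+ * the process that sets up a DSM segment might do
+ *
+ *     shm_mq        *mq = shm_mq_create(addr, (Size) 65536);
+ *     shm_mq_handle *mqh;
+ *
+ *     shm_mq_set_sender(mq, MyProc);
+ *     mqh = shm_mq_attach(mq, seg, NULL);
+ *     (void) shm_mq_send(mqh, len, data, false, true);
+ *
+ * while the counterparty calls shm_mq_set_receiver(mq, MyProc), attaches,
+ * and then calls shm_mq_receive(). Here "addr", "seg", "len" and "data" are
+ * placeholders for memory carved out of the segment and the message to send.
+ */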
+
+/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
+ * Write a message into a shared message queue.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait,
+ bool force_flush)
+{
+ shm_mq_iovec iov;
+
+ iov.data = data;
+ iov.len = nbytes;
+
+ return shm_mq_sendv(mqh, &iov, 1, nowait, force_flush);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * fills up, and then continue writing once the receiver has drained some data.
+ * The process latch is reset after each wait.
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In
+ * this case, the caller should call this function again, with the same
+ * arguments, each time the process latch is set. (Once begun, the sending
+ * of a message cannot be aborted except by detaching from the queue; changing
+ * the length or payload will corrupt the queue.)
+ *
+ * When force_flush = true, we immediately update the shm_mq's mq_bytes_written
+ * and notify the receiver (if it is already attached). Otherwise, we don't
+ * update it until we have written an amount of data greater than 1/4th of the
+ * ring size.
+ */
+shm_mq_result
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait,
+ bool force_flush)
+{
+ shm_mq_result res;
+ shm_mq *mq = mqh->mqh_queue;
+ PGPROC *receiver;
+ Size nbytes = 0;
+ Size bytes_written;
+ int i;
+ int which_iov = 0;
+ Size offset;
+
+ Assert(mq->mq_sender == MyProc);
+
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
+ /* Prevent writing messages that would overwhelm the receiver. */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot send a message of size %zu via shared memory queue",
+ nbytes)));
+
+ /* Try to write, or finish writing, the length word into the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ ((char *) &nbytes) + mqh->mqh_partial_bytes,
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+ mqh->mqh_partial_bytes += bytes_written;
+
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = true;
+ }
+
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /* Length word can't be split unless bigger than required alignment. */
+ Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF);
+ }
+
+ /* Write the actual data bytes into the buffer. */
+ Assert(mqh->mqh_partial_bytes <= nbytes);
+ offset = mqh->mqh_partial_bytes;
+ do
+ {
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd, except
+ * the last. Thus, if a chunk other than the last one ends on a
+ * non-MAXALIGN'd boundary, we have to combine the tail end of its
+ * data with data from one or more following chunks until we either
+ * reach the last chunk or accumulate a number of bytes which is
+ * MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0;
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return res;
+ }
+
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+
+ /* If queue has been detached, let caller know. */
+ if (mq->mq_detached)
+ return SHM_MQ_DETACHED;
+
+ /*
+ * If the counterparty is known to have attached, we can read mq_receiver
+ * without acquiring the spinlock. Otherwise, more caution is needed.
+ */
+ if (mqh->mqh_counterparty_attached)
+ receiver = mq->mq_receiver;
+ else
+ {
+ SpinLockAcquire(&mq->mq_mutex);
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+ if (receiver != NULL)
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /*
+ * If the caller has requested force flush or we have written more than
+ * 1/4 of the ring size, mark it as written in shared memory and notify
+ * the receiver.
+ */
+ if (force_flush || mqh->mqh_send_pending > (mq->mq_ring_size >> 2))
+ {
+ shm_mq_inc_bytes_written(mq, mqh->mqh_send_pending);
+ if (receiver != NULL)
+ SetLatch(&receiver->procLatch);
+ mqh->mqh_send_pending = 0;
+ }
+
+ return SHM_MQ_SUCCESS;
+}
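+
+/*
+ * Illustrative nowait retry sketch (assumed caller, not part of this file):
+ * when shm_mq_sendv() returns SHM_MQ_WOULD_BLOCK, the caller is expected to
+ * wait on its latch and retry with the same arguments, e.g.
+ *
+ *     for (;;)
+ *     {
+ *         res = shm_mq_sendv(mqh, iov, iovcnt, true, false);
+ *         if (res != SHM_MQ_WOULD_BLOCK)
+ *             break;
+ *         (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH,
+ *                          0, WAIT_EVENT_MQ_SEND);
+ *         ResetLatch(MyLatch);
+ *         CHECK_FOR_INTERRUPTS();
+ *     }
+ */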
+
+/*
+ * Receive a message from a shared message queue.
+ *
+ * We set *nbytes to the message length and *data to point to the message
+ * payload. If the entire message exists in the queue as a single,
+ * contiguous chunk, *data will point directly into shared memory; otherwise,
+ * it will point to a temporary buffer. This mostly avoids data copying in
+ * the hoped-for case where messages are short compared to the buffer size,
+ * while still allowing longer messages. In either case, the return value
+ * remains valid until the next receive operation is performed on the queue.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * is empty and we have not yet received a full message. The sender will
+ * set our process latch after more data has been written, and we'll resume
+ * processing. Each call will therefore return a complete message
+ * (unless the sender detaches the queue).
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, whenever the buffer is empty and we need to read from it, we
+ * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this
+ * function again after the process latch has been set.
+ */
+shm_mq_result
+shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ shm_mq_result res;
+ Size rb = 0;
+ Size nbytes;
+ void *rawdata;
+
+ Assert(mq->mq_receiver == MyProc);
+
+ /* We can't receive data until the sender has attached. */
+ if (!mqh->mqh_counterparty_attached)
+ {
+ if (nowait)
+ {
+ int counterparty_gone;
+
+ /*
+ * We shouldn't return at this point at all unless the sender
+ * hasn't attached yet. However, the correct return value depends
+ * on whether the sender is still attached. If we first test
+ * whether the sender has ever attached and then test whether the
+ * sender has detached, there's a race condition: a sender that
+ * attaches and detaches very quickly might fool us into thinking
+ * the sender never attached at all. So, test whether our
+ * counterparty is definitively gone first, and only afterwards
+ * check whether the sender ever attached in the first place.
+ */
+ counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle);
+ if (shm_mq_get_sender(mq) == NULL)
+ {
+ if (counterparty_gone)
+ return SHM_MQ_DETACHED;
+ else
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle)
+ && shm_mq_get_sender(mq) == NULL)
+ {
+ mq->mq_detached = true;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /*
+ * If we've consumed an amount of data greater than 1/4th of the ring
+ * size, mark it consumed in shared memory. We try to avoid doing this
+ * unnecessarily when only a small amount of data has been consumed,
+ * because SetLatch() is fairly expensive and we don't want to do it too
+ * often.
+ */
+ if (mqh->mqh_consume_pending > mq->mq_ring_size / 4)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Try to read, or finish reading, the length word from the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ /* Try to receive the message length word. */
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /*
+ * Hopefully, we'll receive the entire message length word at once.
+ * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over
+ * multiple reads.
+ */
+ if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size))
+ {
+ Size needed;
+
+ nbytes = *(Size *) rawdata;
+
+ /* If we've already got the whole message, we're done. */
+ needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes);
+ if (rb >= needed)
+ {
+ mqh->mqh_consume_pending += needed;
+ *nbytesp = nbytes;
+ *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size));
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * We don't have the whole message, but we at least have the whole
+ * length word.
+ */
+ mqh->mqh_expected_bytes = nbytes;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_consume_pending += MAXALIGN(sizeof(Size));
+ rb -= MAXALIGN(sizeof(Size));
+ }
+ else
+ {
+ Size lengthbytes;
+
+ /* Can't be split unless bigger than required alignment. */
+ Assert(sizeof(Size) > MAXIMUM_ALIGNOF);
+
+ /* Message word is split; need buffer to reassemble. */
+ if (mqh->mqh_buffer == NULL)
+ {
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context,
+ MQH_INITIAL_BUFSIZE);
+ mqh->mqh_buflen = MQH_INITIAL_BUFSIZE;
+ }
+ Assert(mqh->mqh_buflen >= sizeof(Size));
+
+ /* Copy partial length word; remember to consume it. */
+ if (mqh->mqh_partial_bytes + rb > sizeof(Size))
+ lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes;
+ else
+ lengthbytes = rb;
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata,
+ lengthbytes);
+ mqh->mqh_partial_bytes += lengthbytes;
+ mqh->mqh_consume_pending += MAXALIGN(lengthbytes);
+ rb -= lengthbytes;
+
+ /* If we now have the whole word, we're ready to read payload. */
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+ mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_partial_bytes = 0;
+ }
+ }
+ }
+ nbytes = mqh->mqh_expected_bytes;
+
+ /*
+ * Should be disallowed on the sending side already, but better check and
+ * error out on the receiver side as well rather than trying to read a
+ * prohibitively large message.
+ */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("invalid message size %zu in shared memory queue",
+ nbytes)));
+
+ if (mqh->mqh_partial_bytes == 0)
+ {
+ /*
+ * Try to obtain the whole message in a single chunk. If this works,
+ * we need not copy the data and can return a pointer directly into
+ * shared memory.
+ */
+ res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb >= nbytes)
+ {
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_consume_pending += MAXALIGN(nbytes);
+ *nbytesp = nbytes;
+ *datap = rawdata;
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * The message has wrapped the buffer. We'll need to copy it in order
+ * to return it to the client in one chunk. First, make sure we have
+ * a large enough buffer available.
+ */
+ if (mqh->mqh_buflen < nbytes)
+ {
+ Size newbuflen;
+
+ /*
+ * Increase size to the next power of 2 that's >= nbytes, but
+ * limit to MaxAllocSize.
+ */
+ newbuflen = pg_nextpower2_size_t(nbytes);
+ newbuflen = Min(newbuflen, MaxAllocSize);
+
+ if (mqh->mqh_buffer != NULL)
+ {
+ pfree(mqh->mqh_buffer);
+ mqh->mqh_buffer = NULL;
+ mqh->mqh_buflen = 0;
+ }
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen);
+ mqh->mqh_buflen = newbuflen;
+ }
+ }
+
+ /* Loop until we've copied the entire message. */
+ for (;;)
+ {
+ Size still_needed;
+
+ /* Copy as much as we can. */
+ Assert(mqh->mqh_partial_bytes + rb <= nbytes);
+ if (rb > 0)
+ {
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb);
+ mqh->mqh_partial_bytes += rb;
+ }
+
+ /*
+ * Update count of bytes that can be consumed, accounting for
+ * alignment padding. Note that this will never actually insert any
+ * padding except at the end of a message, because the buffer size is
+ * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well.
+ */
+ Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb));
+ mqh->mqh_consume_pending += MAXALIGN(rb);
+
+ /* If we got all the data, exit the loop. */
+ if (mqh->mqh_partial_bytes >= nbytes)
+ break;
+
+ /* Wait for some more data. */
+ still_needed = nbytes - mqh->mqh_partial_bytes;
+ res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb > still_needed)
+ rb = still_needed;
+ }
+
+ /* Return the complete message, and reset for next message. */
+ *nbytesp = nbytes;
+ *datap = mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return SHM_MQ_SUCCESS;
+}
+
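+/*
+ * A minimal receive-loop sketch (illustrative only; process_message() and
+ * the variable "mqh" are placeholders supplied by the caller).  With
+ * nowait = false, shm_mq_receive() waits on the caller's latch internally,
+ * so a consumer can simply loop until the sender detaches; the returned
+ * pointer is only valid until the next receive or detach.
+ *
+ *     for (;;)
+ *     {
+ *         Size        nbytes;
+ *         void       *data;
+ *         shm_mq_result res;
+ *
+ *         res = shm_mq_receive(mqh, &nbytes, &data, false);
+ *         if (res != SHM_MQ_SUCCESS)
+ *             break;
+ *         process_message(data, nbytes);
+ *     }
+ */
+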
+/*
+ * Wait for the other process that's supposed to use this queue to attach
+ * to it.
+ *
+ * The return value is SHM_MQ_DETACHED if the worker has already detached or
+ * if it dies; it is SHM_MQ_SUCCESS if we detect that the worker has attached.
+ * Note that we will only be able to detect that the worker has died before
+ * attaching if a background worker handle was passed to shm_mq_attach().
+ */
+shm_mq_result
+shm_mq_wait_for_attach(shm_mq_handle *mqh)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ PGPROC **victim;
+
+ if (shm_mq_get_receiver(mq) == MyProc)
+ victim = &mq->mq_sender;
+ else
+ {
+ Assert(shm_mq_get_sender(mq) == MyProc);
+ victim = &mq->mq_receiver;
+ }
+
+ if (shm_mq_wait_internal(mq, victim, mqh->mqh_handle))
+ return SHM_MQ_SUCCESS;
+ else
+ return SHM_MQ_DETACHED;
+}
+
+/*
+ * Detach from a shared message queue, and destroy the shm_mq_handle.
+ */
+void
+shm_mq_detach(shm_mq_handle *mqh)
+{
+ /* Before detaching, notify the receiver about any already-written data. */
+ if (mqh->mqh_send_pending > 0)
+ {
+ shm_mq_inc_bytes_written(mqh->mqh_queue, mqh->mqh_send_pending);
+ mqh->mqh_send_pending = 0;
+ }
+
+ /* Notify counterparty that we're outta here. */
+ shm_mq_detach_internal(mqh->mqh_queue);
+
+ /* Cancel on_dsm_detach callback, if any. */
+ if (mqh->mqh_segment)
+ cancel_on_dsm_detach(mqh->mqh_segment,
+ shm_mq_detach_callback,
+ PointerGetDatum(mqh->mqh_queue));
+
+ /* Release local memory associated with handle. */
+ if (mqh->mqh_buffer != NULL)
+ pfree(mqh->mqh_buffer);
+ pfree(mqh);
+}
+
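+/*
+ * A sketch of the overall handle lifecycle, with "toc", "seg" and
+ * queue_size standing in for caller-provided state (the queue itself is
+ * usually carved out of a DSM segment):
+ *
+ *     shm_mq     *mq;
+ *     shm_mq_handle *mqh;
+ *
+ *     mq = shm_mq_create(shm_toc_allocate(toc, queue_size), queue_size);
+ *     shm_mq_set_receiver(mq, MyProc);
+ *     mqh = shm_mq_attach(mq, seg, NULL);
+ *
+ *     ... exchange messages ...
+ *
+ *     shm_mq_detach(mqh);
+ */
+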
+/*
+ * Notify counterparty that we're detaching from shared message queue.
+ *
+ * The purpose of this function is to make sure that the process
+ * with which we're communicating doesn't block forever waiting for us to
+ * fill or drain the queue once we've lost interest. When the sender
+ * detaches, the receiver can read any messages remaining in the queue;
+ * further reads will return SHM_MQ_DETACHED. If the receiver detaches,
+ * further attempts to send messages will likewise return SHM_MQ_DETACHED.
+ *
+ * This is separated out from shm_mq_detach() because if the on_dsm_detach
+ * callback fires, we only want to do this much. We do not try to touch
+ * the local shm_mq_handle, as it may have been pfree'd already.
+ */
+static void
+shm_mq_detach_internal(shm_mq *mq)
+{
+ PGPROC *victim;
+
+ SpinLockAcquire(&mq->mq_mutex);
+ if (mq->mq_sender == MyProc)
+ victim = mq->mq_receiver;
+ else
+ {
+ Assert(mq->mq_receiver == MyProc);
+ victim = mq->mq_sender;
+ }
+ mq->mq_detached = true;
+ SpinLockRelease(&mq->mq_mutex);
+
+ if (victim != NULL)
+ SetLatch(&victim->procLatch);
+}
+
+/*
+ * Get the shm_mq from handle.
+ */
+shm_mq *
+shm_mq_get_queue(shm_mq_handle *mqh)
+{
+ return mqh->mqh_queue;
+}
+
+/*
+ * Write bytes into a shared message queue.
+ */
+static shm_mq_result
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size sent = 0;
+ uint64 used;
+ Size ringsize = mq->mq_ring_size;
+ Size available;
+
+ while (sent < nbytes)
+ {
+ uint64 rb;
+ uint64 wb;
+
+ /* Compute number of ring buffer bytes used and available. */
+ rb = pg_atomic_read_u64(&mq->mq_bytes_read);
+ wb = pg_atomic_read_u64(&mq->mq_bytes_written) + mqh->mqh_send_pending;
+ Assert(wb >= rb);
+ used = wb - rb;
+ Assert(used <= ringsize);
+ available = Min(ringsize - used, nbytes - sent);
+
+ /*
+ * Bail out if the queue has been detached. Note that we would be in
+ * trouble if the compiler decided to cache the value of
+ * mq->mq_detached in a register or on the stack across loop
+ * iterations. It probably shouldn't do that anyway since we'll
+ * always return, call an external function that performs a system
+ * call, or reach a memory barrier at some point later in the loop,
+ * but just to be sure, insert a compiler barrier here.
+ */
+ pg_compiler_barrier();
+ if (mq->mq_detached)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+
+ if (available == 0 && !mqh->mqh_counterparty_attached)
+ {
+ /*
+ * The queue is full, so if the receiver isn't yet known to be
+ * attached, we must wait for that to happen.
+ */
+ if (nowait)
+ {
+ if (shm_mq_counterparty_gone(mq, mqh->mqh_handle))
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ if (shm_mq_get_receiver(mq) == NULL)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_receiver,
+ mqh->mqh_handle))
+ {
+ mq->mq_detached = true;
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+
+ /*
+ * The receiver may have read some data after attaching, so we
+ * must not wait without rechecking the queue state.
+ */
+ }
+ else if (available == 0)
+ {
+ /* Update the pending send bytes in the shared memory. */
+ shm_mq_inc_bytes_written(mq, mqh->mqh_send_pending);
+
+ /*
+ * Since mqh->mqh_counterparty_attached is known to be true at this
+ * point, mq_receiver has been set, and it can't change once set.
+ * Therefore, we can read it without acquiring the spinlock.
+ */
+ Assert(mqh->mqh_counterparty_attached);
+ SetLatch(&mq->mq_receiver->procLatch);
+
+ /*
+ * We have just updated the mqh_send_pending bytes in the shared
+ * memory so reset it.
+ */
+ mqh->mqh_send_pending = 0;
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip
+ * through the loop. It's worth it to avoid resetting the latch
+ * at top of loop, because setting an already-set latch is much
+ * cheaper than setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_SEND);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+ else
+ {
+ Size offset;
+ Size sendnow;
+
+ offset = wb % (uint64) ringsize;
+ sendnow = Min(available, ringsize - offset);
+
+ /*
+ * Write as much data as we can via a single memcpy(). Make sure
+ * these writes happen after the read of mq_bytes_read, above.
+ * This barrier pairs with the one in shm_mq_inc_bytes_read.
+ * (Since we're separating the read of mq_bytes_read from a
+ * subsequent write to mq_ring, we need a full barrier here.)
+ */
+ pg_memory_barrier();
+ memcpy(&mq->mq_ring[mq->mq_ring_offset + offset],
+ (char *) data + sent, sendnow);
+ sent += sendnow;
+
+ /*
+ * Update count of bytes written, with alignment padding. Note
+ * that this will never actually insert any padding except at the
+ * end of a run of bytes, because the buffer size is a multiple of
+ * MAXIMUM_ALIGNOF, and each read and write is as well.
+ */
+ Assert(sent == nbytes || sendnow == MAXALIGN(sendnow));
+
+ /*
+ * For efficiency, we don't update the bytes written in the shared
+ * memory and also don't set the reader's latch here. Refer to
+ * the comments atop the shm_mq_handle structure for more
+ * information.
+ */
+ mqh->mqh_send_pending += MAXALIGN(sendnow);
+ }
+ }
+
+ *bytes_written = sent;
+ return SHM_MQ_SUCCESS;
+}
+
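+/*
+ * A worked example of the wrap-around arithmetic above, using assumed
+ * numbers: if mq_ring_size is 1024, mq_bytes_read is 1000, and
+ * mq_bytes_written plus mqh_send_pending is 1016, then used = 16 and up to
+ * 1008 bytes may be written.  The write offset is 1016 % 1024 = 1016, so the
+ * first memcpy() can copy at most 1024 - 1016 = 8 bytes before reaching the
+ * end of the ring; any remainder is written on the next loop iteration,
+ * starting over at offset 0.
+ */
+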
+/*
+ * Wait until at least *nbytesp bytes are available to be read from the
+ * shared message queue, or until the buffer wraps around. If the queue is
+ * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait
+ * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set
+ * to the location at which data bytes can be read, *nbytesp is set to the
+ * number of bytes which can be read at that address, and the return value
+ * is SHM_MQ_SUCCESS.
+ */
+static shm_mq_result
+shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait,
+ Size *nbytesp, void **datap)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size ringsize = mq->mq_ring_size;
+ uint64 used;
+ uint64 written;
+
+ for (;;)
+ {
+ Size offset;
+ uint64 read;
+
+ /* Get bytes written, so we can compute what's available to read. */
+ written = pg_atomic_read_u64(&mq->mq_bytes_written);
+
+ /*
+ * Get bytes read. Include bytes we could consume but have not yet
+ * consumed.
+ */
+ read = pg_atomic_read_u64(&mq->mq_bytes_read) +
+ mqh->mqh_consume_pending;
+ used = written - read;
+ Assert(used <= ringsize);
+ offset = read % (uint64) ringsize;
+
+ /* If we have enough data or buffer has wrapped, we're done. */
+ if (used >= bytes_needed || offset + used >= ringsize)
+ {
+ *nbytesp = Min(used, ringsize - offset);
+ *datap = &mq->mq_ring[mq->mq_ring_offset + offset];
+
+ /*
+ * Separate the read of mq_bytes_written, above, from caller's
+ * attempt to read the data itself. Pairs with the barrier in
+ * shm_mq_inc_bytes_written.
+ */
+ pg_read_barrier();
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * Fall out before waiting if the queue has been detached.
+ *
+ * Note that we don't check for this until *after* considering whether
+ * the data already available is enough, since the receiver can finish
+ * receiving a message stored in the buffer even after the sender has
+ * detached.
+ */
+ if (mq->mq_detached)
+ {
+ /*
+ * If the writer advanced mq_bytes_written and then set
+ * mq_detached, we might not have read the final value of
+ * mq_bytes_written above. Insert a read barrier and then check
+ * again if mq_bytes_written has advanced.
+ */
+ pg_read_barrier();
+ if (written != pg_atomic_read_u64(&mq->mq_bytes_written))
+ continue;
+
+ return SHM_MQ_DETACHED;
+ }
+
+ /*
+ * We didn't get enough data to satisfy the request, so mark any data
+ * previously-consumed as read to make more buffer space.
+ */
+ if (mqh->mqh_consume_pending > 0)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ return SHM_MQ_WOULD_BLOCK;
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip through
+ * the loop. It's worth it to avoid resetting the latch at top of
+ * loop, because setting an already-set latch is much cheaper than
+ * setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_RECEIVE);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
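+/*
+ * A worked example of the wrap case above, using assumed numbers: if
+ * mq_ring_size is 1024, bytes read (including mqh_consume_pending) total
+ * 1000, and bytes written total 1040, then used = 40 and offset = 1000.
+ * Since offset + used >= ringsize, we return the 24 bytes that run up to
+ * the end of the ring; the caller consumes them and calls again for the
+ * rest, which begin at offset 0.
+ */
+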
+/*
+ * Test whether a counterparty who may not even be alive yet is definitely gone.
+ */
+static bool
+shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle)
+{
+ pid_t pid;
+
+ /* If the queue has been detached, counterparty is definitely gone. */
+ if (mq->mq_detached)
+ return true;
+
+ /* If there's a handle, check worker status. */
+ if (handle != NULL)
+ {
+ BgwHandleStatus status;
+
+ /* Check for unexpected worker death. */
+ status = GetBackgroundWorkerPid(handle, &pid);
+ if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+ {
+ /* Mark it detached, just to make it official. */
+ mq->mq_detached = true;
+ return true;
+ }
+ }
+
+ /* Counterparty is not definitively gone. */
+ return false;
+}
+
+/*
+ * This is used when a process is waiting for its counterpart to attach to the
+ * queue. We exit when the other process attaches as expected, or, if
+ * handle != NULL, when the referenced background process or the postmaster
+ * dies. Note that if handle == NULL, and the process fails to attach, we'll
+ * potentially get stuck here forever waiting for a process that may never
+ * start. We do check for interrupts, though.
+ *
+ * ptr is a pointer to the memory address that we're expecting to become
+ * non-NULL when our counterpart attaches to the queue.
+ */
+static bool
+shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle)
+{
+ bool result = false;
+
+ for (;;)
+ {
+ BgwHandleStatus status;
+ pid_t pid;
+
+ /* Acquire the lock just long enough to check the pointer. */
+ SpinLockAcquire(&mq->mq_mutex);
+ result = (*ptr != NULL);
+ SpinLockRelease(&mq->mq_mutex);
+
+ /* Fail if detached; else succeed if initialized. */
+ if (mq->mq_detached)
+ {
+ result = false;
+ break;
+ }
+ if (result)
+ break;
+
+ if (handle != NULL)
+ {
+ /* Check for unexpected worker death. */
+ status = GetBackgroundWorkerPid(handle, &pid);
+ if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+ {
+ result = false;
+ break;
+ }
+ }
+
+ /* Wait to be signaled. */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_INTERNAL);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ return result;
+}
+
+/*
+ * Increment the number of bytes read.
+ */
+static void
+shm_mq_inc_bytes_read(shm_mq *mq, Size n)
+{
+ PGPROC *sender;
+
+ /*
+ * Separate prior reads of mq_ring from the increment of mq_bytes_read
+ * which follows. This pairs with the full barrier in
+ * shm_mq_send_bytes(). We only need a read barrier here because the
+ * increment of mq_bytes_read is actually a read followed by a dependent
+ * write.
+ */
+ pg_read_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method should be cheaper.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_read,
+ pg_atomic_read_u64(&mq->mq_bytes_read) + n);
+
+ /*
+ * We shouldn't have any bytes to read without a sender, so we can read
+ * mq_sender here without a lock. Once it's initialized, it can't change.
+ */
+ sender = mq->mq_sender;
+ Assert(sender != NULL);
+ SetLatch(&sender->procLatch);
+}
+
+/*
+ * Increment the number of bytes written.
+ */
+static void
+shm_mq_inc_bytes_written(shm_mq *mq, Size n)
+{
+ /*
+ * Separate prior reads of mq_ring from the write of mq_bytes_written
+ * which we're about to do. Pairs with the read barrier found in
+ * shm_mq_receive_bytes.
+ */
+ pg_write_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method avoids taking the bus
+ * lock unnecessarily.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_written,
+ pg_atomic_read_u64(&mq->mq_bytes_written) + n);
+}
+
+/* Shim for on_dsm_detach callback. */
+static void
+shm_mq_detach_callback(dsm_segment *seg, Datum arg)
+{
+ shm_mq *mq = (shm_mq *) DatumGetPointer(arg);
+
+ shm_mq_detach_internal(mq);
+}
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
new file mode 100644
index 0000000..0cd8244
--- /dev/null
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -0,0 +1,272 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_toc.c
+ * shared memory segment table of contents
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_toc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/shm_toc.h"
+#include "storage/spin.h"
+
+typedef struct shm_toc_entry
+{
+ uint64 key; /* Arbitrary identifier */
+ Size offset; /* Offset, in bytes, from TOC start */
+} shm_toc_entry;
+
+struct shm_toc
+{
+ uint64 toc_magic; /* Magic number identifying this TOC */
+ slock_t toc_mutex; /* Spinlock for mutual exclusion */
+ Size toc_total_bytes; /* Bytes managed by this TOC */
+ Size toc_allocated_bytes; /* Bytes allocated of those managed */
+ uint32 toc_nentry; /* Number of entries in TOC */
+ shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER];
+};
+
+/*
+ * Initialize a region of shared memory with a table of contents.
+ */
+shm_toc *
+shm_toc_create(uint64 magic, void *address, Size nbytes)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ Assert(nbytes > offsetof(shm_toc, toc_entry));
+ toc->toc_magic = magic;
+ SpinLockInit(&toc->toc_mutex);
+
+ /*
+ * The alignment code in shm_toc_allocate() assumes that the starting
+ * value is buffer-aligned.
+ */
+ toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
+ toc->toc_allocated_bytes = 0;
+ toc->toc_nentry = 0;
+
+ return toc;
+}
+
+/*
+ * Attach to an existing table of contents. If the magic number found at
+ * the target address doesn't match our expectations, return NULL.
+ */
+shm_toc *
+shm_toc_attach(uint64 magic, void *address)
+{
+ shm_toc *toc = (shm_toc *) address;
+
+ if (toc->toc_magic != magic)
+ return NULL;
+
+ Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes);
+ Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry));
+
+ return toc;
+}
+
+/*
+ * Allocate shared memory from a segment managed by a table of contents.
+ *
+ * This is not a full-blown allocator; there's no way to free memory. It's
+ * just a way of dividing a single physical shared memory segment into logical
+ * chunks that may be used for different purposes.
+ *
+ * We allocate backwards from the end of the segment, so that the TOC entries
+ * can grow forward from the start of the segment.
+ */
+void *
+shm_toc_allocate(shm_toc *toc, Size nbytes)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ /*
+ * Make sure request is well-aligned. XXX: MAXALIGN is not enough,
+ * because atomic ops might need a wider alignment. We don't have a
+ * proper definition for the minimum to make atomic ops safe, but
+ * BUFFERALIGN ought to be enough.
+ */
+ nbytes = BUFFERALIGN(nbytes);
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+ vtoc->toc_allocated_bytes += nbytes;
+
+ SpinLockRelease(&toc->toc_mutex);
+
+ return ((char *) toc) + (total_bytes - allocated_bytes - nbytes);
+}
+
+/*
+ * Return the number of bytes that can still be allocated.
+ */
+Size
+shm_toc_freespace(shm_toc *toc)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+
+ SpinLockAcquire(&toc->toc_mutex);
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ SpinLockRelease(&toc->toc_mutex);
+
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry);
+ Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
+ return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
+}
+
+/*
+ * Insert a TOC entry.
+ *
+ * The idea here is that the process setting up the shared memory segment will
+ * register the addresses of data structures within the segment using this
+ * function. Each data structure will be identified using a 64-bit key, which
+ * is assumed to be a well-known or discoverable integer. Other processes
+ * accessing the shared memory segment can pass the same key to
+ * shm_toc_lookup() to discover the addresses of those data structures.
+ *
+ * Since the shared memory segment may be mapped at different addresses within
+ * different backends, we store relative rather than absolute pointers.
+ *
+ * This won't scale well to a large number of keys. Hopefully, that isn't
+ * necessary; if it proves to be, we might need to provide a more sophisticated
+ * data structure here. But the real idea here is just to give someone mapping
+ * a dynamic shared memory segment the ability to find the bare minimum number of
+ * pointers that they need to bootstrap. If you're storing a lot of stuff in
+ * the TOC, you're doing it wrong.
+ */
+void
+shm_toc_insert(shm_toc *toc, uint64 key, void *address)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+ Size offset;
+
+ /* Relativize pointer. */
+ Assert(address > (void *) toc);
+ offset = ((char *) address) - (char *) toc;
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
+ toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
+ nentry >= PG_UINT32_MAX)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+
+ Assert(offset < total_bytes);
+ vtoc->toc_entry[nentry].key = key;
+ vtoc->toc_entry[nentry].offset = offset;
+
+ /*
+ * By placing a write barrier after filling in the entry and before
+ * updating the number of entries, we make it safe to read the TOC
+ * unlocked.
+ */
+ pg_write_barrier();
+
+ vtoc->toc_nentry++;
+
+ SpinLockRelease(&toc->toc_mutex);
+}
+
+/*
+ * Look up a TOC entry.
+ *
+ * If the key is not found, returns NULL if noError is true, otherwise
+ * throws elog(ERROR).
+ *
+ * Unlike the other functions in this file, this operation acquires no lock;
+ * it uses only barriers. It probably wouldn't hurt concurrency very much even
+ * if it did get a lock, but since it's reasonably likely that a group of
+ * worker processes could each read a series of entries from the same TOC
+ * right around the same time, there seems to be some value in avoiding it.
+ */
+void *
+shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
+{
+ uint32 nentry;
+ uint32 i;
+
+ /*
+ * Read the number of entries before we examine any entry. We assume that
+ * reading a uint32 is atomic.
+ */
+ nentry = toc->toc_nentry;
+ pg_read_barrier();
+
+ /* Now search for a matching entry. */
+ for (i = 0; i < nentry; ++i)
+ {
+ if (toc->toc_entry[i].key == key)
+ return ((char *) toc) + toc->toc_entry[i].offset;
+ }
+
+ /* No matching entry was found. */
+ if (!noError)
+ elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
+ key, toc);
+ return NULL;
+}
+
+/*
+ * Estimate how much shared memory will be required to store a TOC and its
+ * dependent data structures.
+ */
+Size
+shm_toc_estimate(shm_toc_estimator *e)
+{
+ Size sz;
+
+ sz = offsetof(shm_toc, toc_entry);
+ sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry)));
+ sz = add_size(sz, e->space_for_chunks);
+
+ return BUFFERALIGN(sz);
+}
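+
+/*
+ * A typical create/attach sequence, sketched with placeholder names
+ * (EXAMPLE_MAGIC, EXAMPLE_KEY_STATE and ExampleSharedState are assumptions;
+ * the estimator macros live in shm_toc.h):
+ *
+ *     shm_toc_estimator e;
+ *     dsm_segment *seg;
+ *     shm_toc    *toc;
+ *     Size        segsize;
+ *     void       *state;
+ *
+ *     shm_toc_initialize_estimator(&e);
+ *     shm_toc_estimate_chunk(&e, sizeof(ExampleSharedState));
+ *     shm_toc_estimate_keys(&e, 1);
+ *     segsize = shm_toc_estimate(&e);
+ *
+ *     seg = dsm_create(segsize, 0);
+ *     toc = shm_toc_create(EXAMPLE_MAGIC, dsm_segment_address(seg), segsize);
+ *
+ *     state = shm_toc_allocate(toc, sizeof(ExampleSharedState));
+ *     shm_toc_insert(toc, EXAMPLE_KEY_STATE, state);
+ *
+ * A cooperating process that has attached to the same segment finds the
+ * structure again with:
+ *
+ *     toc = shm_toc_attach(EXAMPLE_MAGIC, dsm_segment_address(seg));
+ *     state = shm_toc_lookup(toc, EXAMPLE_KEY_STATE, false);
+ */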
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
new file mode 100644
index 0000000..5465fa1
--- /dev/null
+++ b/src/backend/storage/ipc/shmem.c
@@ -0,0 +1,584 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.c
+ * create shared memory and initialize shared memory data structures.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * POSTGRES processes share one or more regions of shared memory.
+ * The shared memory is created by a postmaster and is inherited
+ * by each backend via fork() (or, in some ports, via other OS-specific
+ * methods). The routines in this file are used for allocating and
+ * binding to shared memory data structures.
+ *
+ * NOTES:
+ * (a) There are three kinds of shared memory data structures
+ * available to POSTGRES: fixed-size structures, queues and hash
+ * tables. Fixed-size structures contain things like global variables
+ * for a module and should never be allocated after the shared memory
+ * initialization phase. Hash tables have a fixed maximum size, but
+ * their actual size can vary dynamically. When entries are added
+ * to the table, more space is allocated. Queues link data structures
+ * that have been allocated either within fixed-size structures or as hash
+ * buckets. Each shared data structure has a string name to identify
+ * it (assigned in the module that declares it).
+ *
+ * (b) During initialization, each module looks for its
+ * shared data structures in a hash table called the "Shmem Index".
+ * If the data structure is not present, the caller can allocate
+ * a new one and initialize it. If the data structure is present,
+ * the caller "attaches" to the structure by initializing a pointer
+ * in the local address space.
+ * The shmem index has two purposes: first, it gives us
+ * a simple model of how the world looks when a backend process
+ * initializes. If something is present in the shmem index,
+ * it is initialized. If it is not, it is uninitialized. Second,
+ * the shmem index allows us to allocate shared memory on demand
+ * instead of trying to preallocate structures and hard-wire the
+ * sizes and locations in header files. If you are using a lot
+ * of shared memory in a lot of different places (and changing
+ * things during development), this is important.
+ *
+ * (c) In standard Unix-ish environments, individual backends do not
+ * need to re-establish their local pointers into shared memory, because
+ * they inherit correct values of those variables via fork() from the
+ * postmaster. However, this does not work in the EXEC_BACKEND case.
+ * In ports using EXEC_BACKEND, new backends have to set up their local
+ * pointers using the method described in (b) above.
+ *
+ * (d) memory allocation model: shared memory can never be
+ * freed, once allocated. Each hash table has its own free list,
+ * so hash buckets can be reused when an item is deleted. However,
+ * if one hash table grows very large and then shrinks, its space
+ * cannot be redistributed to other tables. We could build a simple
+ * hash bucket garbage collector if need be. Right now, it seems
+ * unnecessary.
+ */
+
+#include "postgres.h"
+
+#include "access/transam.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+
+static void *ShmemAllocRaw(Size size, Size *allocated_size);
+
+/* shared memory global variables */
+
+static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+
+static void *ShmemBase; /* start address of shared memory */
+
+static void *ShmemEnd; /* end+1 address of shared memory */
+
+slock_t *ShmemLock; /* spinlock for shared memory and LWLock
+ * allocation */
+
+static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+
+
+/*
+ * InitShmemAccess() --- set up basic pointers to shared memory.
+ *
+ * Note: the argument should be declared "PGShmemHeader *seghdr",
+ * but we use void to avoid having to include ipc.h in shmem.h.
+ */
+void
+InitShmemAccess(void *seghdr)
+{
+ PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr;
+
+ ShmemSegHdr = shmhdr;
+ ShmemBase = (void *) shmhdr;
+ ShmemEnd = (char *) ShmemBase + shmhdr->totalsize;
+}
+
+/*
+ * InitShmemAllocation() --- set up shared-memory space allocation.
+ *
+ * This should be called only in the postmaster or a standalone backend.
+ */
+void
+InitShmemAllocation(void)
+{
+ PGShmemHeader *shmhdr = ShmemSegHdr;
+ char *aligned;
+
+ Assert(shmhdr != NULL);
+
+ /*
+ * Initialize the spinlock used by ShmemAlloc. We must use
+ * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
+ */
+ ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
+
+ SpinLockInit(ShmemLock);
+
+ /*
+ * Allocations after this point should go through ShmemAlloc, which
+ * expects to allocate everything on cache line boundaries. Make sure the
+ * first allocation begins on a cache line boundary.
+ */
+ aligned = (char *)
+ (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
+ shmhdr->freeoffset = aligned - (char *) shmhdr;
+
+ /* ShmemIndex can't be set up yet (need LWLocks first) */
+ shmhdr->index = NULL;
+ ShmemIndex = (HTAB *) NULL;
+
+ /*
+ * Initialize ShmemVariableCache for transaction manager. (This doesn't
+ * really belong here, but not worth moving.)
+ */
+ ShmemVariableCache = (VariableCache)
+ ShmemAlloc(sizeof(*ShmemVariableCache));
+ memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache));
+}
+
+/*
+ * ShmemAlloc -- allocate max-aligned chunk from shared memory
+ *
+ * Throws error if request cannot be satisfied.
+ *
+ * Assumes ShmemLock and ShmemSegHdr are initialized.
+ */
+void *
+ShmemAlloc(Size size)
+{
+ void *newSpace;
+ Size allocated_size;
+
+ newSpace = ShmemAllocRaw(size, &allocated_size);
+ if (!newSpace)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory (%zu bytes requested)",
+ size)));
+ return newSpace;
+}
+
+/*
+ * ShmemAllocNoError -- allocate max-aligned chunk from shared memory
+ *
+ * As ShmemAlloc, but returns NULL if out of space, rather than erroring.
+ */
+void *
+ShmemAllocNoError(Size size)
+{
+ Size allocated_size;
+
+ return ShmemAllocRaw(size, &allocated_size);
+}
+
+/*
+ * ShmemAllocRaw -- allocate aligned chunk and return allocated size
+ *
+ * Also sets *allocated_size to the number of bytes allocated, which will
+ * be equal to the number requested plus any padding we choose to add.
+ */
+static void *
+ShmemAllocRaw(Size size, Size *allocated_size)
+{
+ Size newStart;
+ Size newFree;
+ void *newSpace;
+
+ /*
+ * Ensure all space is adequately aligned. We used to only MAXALIGN this
+ * space but experience has proved that on modern systems that is not good
+ * enough. Many parts of the system are very sensitive to critical data
+ * structures getting split across cache line boundaries. To avoid that,
+ * attempt to align the beginning of the allocation to a cache line
+ * boundary. The calling code will still need to be careful about how it
+ * uses the allocated space - e.g. by padding each element in an array of
+ * structures out to a power-of-two size - but without this, even that
+ * won't be sufficient.
+ */
+ size = CACHELINEALIGN(size);
+ *allocated_size = size;
+
+ Assert(ShmemSegHdr != NULL);
+
+ SpinLockAcquire(ShmemLock);
+
+ newStart = ShmemSegHdr->freeoffset;
+
+ newFree = newStart + size;
+ if (newFree <= ShmemSegHdr->totalsize)
+ {
+ newSpace = (void *) ((char *) ShmemBase + newStart);
+ ShmemSegHdr->freeoffset = newFree;
+ }
+ else
+ newSpace = NULL;
+
+ SpinLockRelease(ShmemLock);
+
+ /* note this assert is okay with newSpace == NULL */
+ Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
+
+ return newSpace;
+}
+
+/*
+ * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory
+ *
+ * Allocate space without locking ShmemLock. This should be used for,
+ * and only for, allocations that must happen before ShmemLock is ready.
+ *
+ * We consider maxalign, rather than cachealign, sufficient here.
+ */
+void *
+ShmemAllocUnlocked(Size size)
+{
+ Size newStart;
+ Size newFree;
+ void *newSpace;
+
+ /*
+ * Ensure allocated space is adequately aligned.
+ */
+ size = MAXALIGN(size);
+
+ Assert(ShmemSegHdr != NULL);
+
+ newStart = ShmemSegHdr->freeoffset;
+
+ newFree = newStart + size;
+ if (newFree > ShmemSegHdr->totalsize)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory (%zu bytes requested)",
+ size)));
+ ShmemSegHdr->freeoffset = newFree;
+
+ newSpace = (void *) ((char *) ShmemBase + newStart);
+
+ Assert(newSpace == (void *) MAXALIGN(newSpace));
+
+ return newSpace;
+}
+
+/*
+ * ShmemAddrIsValid -- test if an address refers to shared memory
+ *
+ * Returns true if the pointer points within the shared memory segment.
+ */
+bool
+ShmemAddrIsValid(const void *addr)
+{
+ return (addr >= ShmemBase) && (addr < ShmemEnd);
+}
+
+/*
+ * InitShmemIndex() --- set up or attach to shmem index table.
+ */
+void
+InitShmemIndex(void)
+{
+ HASHCTL info;
+
+ /*
+ * Create the shared memory shmem index.
+ *
+ * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex
+ * hashtable to exist already, we have a bit of a circularity problem in
+ * initializing the ShmemIndex itself. The special "ShmemIndex" hash
+ * table name will tell ShmemInitStruct to fake it.
+ */
+ info.keysize = SHMEM_INDEX_KEYSIZE;
+ info.entrysize = sizeof(ShmemIndexEnt);
+
+ ShmemIndex = ShmemInitHash("ShmemIndex",
+ SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+ &info,
+ HASH_ELEM | HASH_STRINGS);
+}
+
+/*
+ * ShmemInitHash -- Create and initialize, or attach to, a
+ * shared memory hash table.
+ *
+ * We assume caller is doing some kind of synchronization
+ * so that two processes don't try to create/initialize the same
+ * table at once. (In practice, all creations are done in the postmaster
+ * process; child processes should always be attaching to existing tables.)
+ *
+ * max_size is the estimated maximum number of hashtable entries. This is
+ * not a hard limit, but the access efficiency will degrade if it is
+ * exceeded substantially (since it's used to compute directory size and
+ * the hash table buckets will get overfull).
+ *
+ * init_size is the number of hashtable entries to preallocate. For a table
+ * whose maximum size is certain, this should be equal to max_size; that
+ * ensures that no run-time out-of-shared-memory failures can occur.
+ *
+ * *infoP and hash_flags must specify at least the entry sizes and key
+ * comparison semantics (see hash_create()). Flag bits and values specific
+ * to shared-memory hash tables are added here, except that callers may
+ * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE.
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+HTAB *
+ShmemInitHash(const char *name, /* table string name for shmem index */
+ long init_size, /* initial table size */
+ long max_size, /* max size of the table */
+ HASHCTL *infoP, /* info about key and bucket size */
+ int hash_flags) /* info about infoP */
+{
+ bool found;
+ void *location;
+
+ /*
+ * Hash tables allocated in shared memory have a fixed directory; it can't
+ * grow or other backends wouldn't be able to find it. So, make sure we
+ * make it big enough to start with.
+ *
+ * The shared memory allocator must be specified too.
+ */
+ infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size);
+ infoP->alloc = ShmemAllocNoError;
+ hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
+
+ /* look it up in the shmem index */
+ location = ShmemInitStruct(name,
+ hash_get_shared_size(infoP, hash_flags),
+ &found);
+
+ /*
+ * if it already exists, attach to it rather than allocate and initialize
+ * new space
+ */
+ if (found)
+ hash_flags |= HASH_ATTACH;
+
+ /* Pass location of hashtable header to hash_create */
+ infoP->hctl = (HASHHDR *) location;
+
+ return hash_create(name, init_size, infoP, hash_flags);
+}
+
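+/*
+ * A minimal caller sketch, with hypothetical key/entry types and table name
+ * (ExampleKey, ExampleEntry, ExampleHash):
+ *
+ *     HASHCTL     info;
+ *
+ *     info.keysize = sizeof(ExampleKey);
+ *     info.entrysize = sizeof(ExampleEntry);
+ *     ExampleHash = ShmemInitHash("Example Hash", 128, 128, &info,
+ *                                 HASH_ELEM | HASH_BLOBS);
+ */
+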
+/*
+ * ShmemInitStruct -- Create/attach to a structure in shared memory.
+ *
+ * This is called during initialization to find or allocate
+ * a data structure in shared memory. If no other process
+ * has created the structure, this routine allocates space
+ * for it. If it exists already, a pointer to the existing
+ * structure is returned.
+ *
+ * Returns: pointer to the object. *foundPtr is set true if the object was
+ * already in the shmem index (hence, already initialized).
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+void *
+ShmemInitStruct(const char *name, Size size, bool *foundPtr)
+{
+ ShmemIndexEnt *result;
+ void *structPtr;
+
+ LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
+
+ if (!ShmemIndex)
+ {
+ PGShmemHeader *shmemseghdr = ShmemSegHdr;
+
+ /* Must be trying to create/attach to ShmemIndex itself */
+ Assert(strcmp(name, "ShmemIndex") == 0);
+
+ if (IsUnderPostmaster)
+ {
+ /* Must be initializing a (non-standalone) backend */
+ Assert(shmemseghdr->index != NULL);
+ structPtr = shmemseghdr->index;
+ *foundPtr = true;
+ }
+ else
+ {
+ /*
+ * If the shmem index doesn't exist, we are bootstrapping: we must
+ * be trying to init the shmem index itself.
+ *
+ * Notice that the ShmemIndexLock is released before the shmem
+ * index has been initialized. This should be OK because no other
+ * process can be accessing shared memory yet.
+ */
+ Assert(shmemseghdr->index == NULL);
+ structPtr = ShmemAlloc(size);
+ shmemseghdr->index = structPtr;
+ *foundPtr = false;
+ }
+ LWLockRelease(ShmemIndexLock);
+ return structPtr;
+ }
+
+ /* look it up in the shmem index */
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
+
+ if (!result)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("could not create ShmemIndex entry for data structure \"%s\"",
+ name)));
+ }
+
+ if (*foundPtr)
+ {
+ /*
+ * Structure is in the shmem index so someone else has allocated it
+ * already. The size better be the same as the size we are trying to
+ * initialize to, or there is a name conflict (or worse).
+ */
+ if (result->size != size)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errmsg("ShmemIndex entry size is wrong for data structure"
+ " \"%s\": expected %zu, actual %zu",
+ name, size, result->size)));
+ }
+ structPtr = result->location;
+ }
+ else
+ {
+ Size allocated_size;
+
+ /* It isn't in the table yet. Allocate and initialize it. */
+ structPtr = ShmemAllocRaw(size, &allocated_size);
+ if (structPtr == NULL)
+ {
+ /* out of memory; remove the failed ShmemIndex entry */
+ hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough shared memory for data structure"
+ " \"%s\" (%zu bytes requested)",
+ name, size)));
+ }
+ result->size = size;
+ result->allocated_size = allocated_size;
+ result->location = structPtr;
+ }
+
+ LWLockRelease(ShmemIndexLock);
+
+ Assert(ShmemAddrIsValid(structPtr));
+
+ Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
+ return structPtr;
+}
+
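+/*
+ * The usual calling pattern, sketched with a hypothetical module structure
+ * (ExampleStateData): allocate-or-attach, then initialize only if we were
+ * the process that created it.
+ *
+ *     bool        found;
+ *
+ *     ExampleState = (ExampleStateData *)
+ *         ShmemInitStruct("Example State", sizeof(ExampleStateData), &found);
+ *     if (!found)
+ *     {
+ *         memset(ExampleState, 0, sizeof(ExampleStateData));
+ *         SpinLockInit(&ExampleState->mutex);
+ *     }
+ */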
+
+/*
+ * Add two Size values, checking for overflow
+ */
+Size
+add_size(Size s1, Size s2)
+{
+ Size result;
+
+ result = s1 + s2;
+ /* We are assuming Size is an unsigned type here... */
+ if (result < s1 || result < s2)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested shared memory size overflows size_t")));
+ return result;
+}
+
+/*
+ * Multiply two Size values, checking for overflow
+ */
+Size
+mul_size(Size s1, Size s2)
+{
+ Size result;
+
+ if (s1 == 0 || s2 == 0)
+ return 0;
+ result = s1 * s2;
+ /* We are assuming Size is an unsigned type here... */
+ if (result / s2 != s1)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("requested shared memory size overflows size_t")));
+ return result;
+}
+
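+/*
+ * These helpers exist for shared-memory size estimation, where a silent
+ * overflow would be disastrous.  A sketch of a typical *ShmemSize() routine,
+ * with ExampleStateData and ExamplePerBackendData as hypothetical types:
+ *
+ *     Size
+ *     ExampleShmemSize(void)
+ *     {
+ *         Size        size = 0;
+ *
+ *         size = add_size(size, sizeof(ExampleStateData));
+ *         size = add_size(size, mul_size(MaxBackends,
+ *                                        sizeof(ExamplePerBackendData)));
+ *         return size;
+ *     }
+ */
+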
+/* SQL SRF showing allocated shared memory */
+Datum
+pg_get_shmem_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_SIZES_COLS 4
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ HASH_SEQ_STATUS hstat;
+ ShmemIndexEnt *ent;
+ Size named_allocated = 0;
+ Datum values[PG_GET_SHMEM_SIZES_COLS];
+ bool nulls[PG_GET_SHMEM_SIZES_COLS];
+
+ InitMaterializedSRF(fcinfo, 0);
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ hash_seq_init(&hstat, ShmemIndex);
+
+ /* output all allocated entries */
+ memset(nulls, 0, sizeof(nulls));
+ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ values[0] = CStringGetTextDatum(ent->key);
+ values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
+ values[2] = Int64GetDatum(ent->size);
+ values[3] = Int64GetDatum(ent->allocated_size);
+ named_allocated += ent->allocated_size;
+
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc,
+ values, nulls);
+ }
+
+ /* output shared memory allocated but not counted via the shmem index */
+ values[0] = CStringGetTextDatum("<anonymous>");
+ nulls[1] = true;
+ values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
+ values[3] = values[2];
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+
+ /* output as-of-yet unused shared memory */
+ nulls[0] = true;
+ values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
+ nulls[1] = false;
+ values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
+ values[3] = values[2];
+ tuplestore_putvalues(rsinfo->setResult, rsinfo->setDesc, values, nulls);
+
+ LWLockRelease(ShmemIndexLock);
+
+ return (Datum) 0;
+}
diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c
new file mode 100644
index 0000000..b595c2d
--- /dev/null
+++ b/src/backend/storage/ipc/signalfuncs.c
@@ -0,0 +1,318 @@
+/*-------------------------------------------------------------------------
+ *
+ * signalfuncs.c
+ * Functions for signaling backends
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/signalfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "catalog/pg_authid.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/syslogger.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Send a signal to another backend.
+ *
+ * The signal is delivered if the user is either a superuser or the same
+ * role as the backend being signaled. For "dangerous" signals, an explicit
+ * check for superuser needs to be done prior to calling this function.
+ *
+ * Returns 0 on success, 1 on general failure, 2 on normal permission error
+ * and 3 if the caller needs to be a superuser.
+ *
+ * In the event of a general failure (return code 1), a warning message will
+ * be emitted. For permission errors, doing that is the responsibility of
+ * the caller.
+ */
+#define SIGNAL_BACKEND_SUCCESS 0
+#define SIGNAL_BACKEND_ERROR 1
+#define SIGNAL_BACKEND_NOPERMISSION 2
+#define SIGNAL_BACKEND_NOSUPERUSER 3
+static int
+pg_signal_backend(int pid, int sig)
+{
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ /*
+ * BackendPidGetProc returns NULL if the pid isn't valid; but by the time
+ * we reach kill(), a process for which we get a valid proc here might
+ * have terminated on its own. There's no way to acquire a lock on an
+ * arbitrary process to prevent that. But since so far all the callers of
+ * this mechanism involve some request for ending the process anyway, that
+ * it might end on its own first is not a problem.
+ *
+ * Note that proc will also be NULL if the pid refers to an auxiliary
+ * process or the postmaster (neither of which can be signaled via
+ * pg_signal_backend()).
+ */
+ if (proc == NULL)
+ {
+ /*
+ * This is just a warning so a loop-through-resultset will not abort
+ * if one backend terminated on its own during the run.
+ */
+ ereport(WARNING,
+ (errmsg("PID %d is not a PostgreSQL backend process", pid)));
+
+ return SIGNAL_BACKEND_ERROR;
+ }
+
+ /*
+ * Only allow superusers to signal superuser-owned backends. Any process
+ * not advertising a role might have the importance of a superuser-owned
+ * backend, so treat it that way.
+ */
+ if ((!OidIsValid(proc->roleId) || superuser_arg(proc->roleId)) &&
+ !superuser())
+ return SIGNAL_BACKEND_NOSUPERUSER;
+
+ /* Users can signal backends they have role membership in. */
+ if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+ !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ return SIGNAL_BACKEND_NOPERMISSION;
+
+ /*
+ * Can the process we just validated above end, followed by the pid being
+ * recycled for a new process, before reaching here? Then we'd be trying
+ * to kill the wrong thing. Seems near impossible when sequential pid
+ * assignment and wraparound are used. Perhaps it could happen on a system
+ * where pid re-use is randomized. That race condition possibility seems
+ * too unlikely to worry about.
+ */
+
+ /* If we have setsid(), signal the backend's whole process group */
+#ifdef HAVE_SETSID
+ if (kill(-pid, sig))
+#else
+ if (kill(pid, sig))
+#endif
+ {
+ /* Again, just a warning to allow loops */
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m", pid)));
+ return SIGNAL_BACKEND_ERROR;
+ }
+ return SIGNAL_BACKEND_SUCCESS;
+}
+
+/*
+ * Signal to cancel a backend process. This is allowed if you are a member of
+ * the role whose process is being canceled.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_cancel_backend(PG_FUNCTION_ARGS)
+{
+ int r = pg_signal_backend(PG_GETARG_INT32(0), SIGINT);
+
+ if (r == SIGNAL_BACKEND_NOSUPERUSER)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to cancel query"),
+ errdetail("Only roles with the %s attribute may cancel queries of roles with the %s attribute.",
+ "SUPERUSER", "SUPERUSER")));
+
+ if (r == SIGNAL_BACKEND_NOPERMISSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to cancel query"),
+ errdetail("Only roles with privileges of the role whose query is being canceled or with privileges of the \"%s\" role may cancel this query.",
+ "pg_signal_backend")));
+
+ PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Wait until there is no backend process with the given PID and return true.
+ * On timeout, a warning is emitted and false is returned.
+ */
+static bool
+pg_wait_until_termination(int pid, int64 timeout)
+{
+ /*
+ * Wait in steps of waittime milliseconds until the backend exits or the
+ * timeout expires.
+ */
+ int64 waittime = 100;
+
+ /*
+ * Initially remaining time is the entire timeout specified by the user.
+ */
+ int64 remainingtime = timeout;
+
+ /*
+ * Check whether the backend still exists. If it does, wait for waittime
+ * milliseconds and check again. Repeat until the timeout expires, an error
+ * occurs, or a pending interrupt such as a query cancel gets processed.
+ */
+ do
+ {
+ if (remainingtime < waittime)
+ waittime = remainingtime;
+
+ if (kill(pid, 0) == -1)
+ {
+ if (errno == ESRCH)
+ return true;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not check the existence of the backend with PID %d: %m",
+ pid)));
+ }
+
+ /* Process interrupts, if any, before waiting */
+ CHECK_FOR_INTERRUPTS();
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ waittime,
+ WAIT_EVENT_BACKEND_TERMINATION);
+
+ ResetLatch(MyLatch);
+
+ remainingtime -= waittime;
+ } while (remainingtime > 0);
+
+ ereport(WARNING,
+ (errmsg_plural("backend with PID %d did not terminate within %lld millisecond",
+ "backend with PID %d did not terminate within %lld milliseconds",
+ timeout,
+ pid, (long long int) timeout)));
+
+ return false;
+}
+
+/*
+ * Send a signal to terminate a backend process. This is allowed if you are a
+ * member of the role whose process is being terminated. If the timeout input
+ * argument is 0, then this function just signals the backend and returns
+ * true. If timeout is nonzero, then it waits until no process has the given
+ * PID; if the process ends within the timeout, true is returned, and if the
+ * timeout is exceeded, a warning is emitted and false is returned.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_terminate_backend(PG_FUNCTION_ARGS)
+{
+ int pid;
+ int r;
+ int timeout; /* milliseconds */
+
+ pid = PG_GETARG_INT32(0);
+ timeout = PG_GETARG_INT64(1);
+
+ if (timeout < 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+ errmsg("\"timeout\" must not be negative")));
+
+ r = pg_signal_backend(pid, SIGTERM);
+
+ if (r == SIGNAL_BACKEND_NOSUPERUSER)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to terminate process"),
+ errdetail("Only roles with the %s attribute may terminate processes of roles with the %s attribute.",
+ "SUPERUSER", "SUPERUSER")));
+
+ if (r == SIGNAL_BACKEND_NOPERMISSION)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied to terminate process"),
+ errdetail("Only roles with privileges of the role whose process is being terminated or with privileges of the \"%s\" role may terminate this process.",
+ "pg_signal_backend")));
+
+ /* Wait only on success and if actually requested */
+ if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0)
+ PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout));
+ else
+ PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Signal to reload the database configuration
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_reload_conf(PG_FUNCTION_ARGS)
+{
+ if (kill(PostmasterPid, SIGHUP))
+ {
+ ereport(WARNING,
+ (errmsg("failed to send signal to postmaster: %m")));
+ PG_RETURN_BOOL(false);
+ }
+
+ PG_RETURN_BOOL(true);
+}
+
+
+/*
+ * Rotate log file
+ *
+ * This function is kept to support adminpack 1.0.
+ */
+Datum
+pg_rotate_logfile(PG_FUNCTION_ARGS)
+{
+ if (!superuser())
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("must be superuser to rotate log files with adminpack 1.0"),
+ /* translator: %s is a SQL function name */
+ errhint("Consider using %s, which is part of core, instead.",
+ "pg_logfile_rotate()")));
+
+ if (!Logging_collector)
+ {
+ ereport(WARNING,
+ (errmsg("rotation not possible because log collection not active")));
+ PG_RETURN_BOOL(false);
+ }
+
+ SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+ PG_RETURN_BOOL(true);
+}
+
+/*
+ * Rotate log file
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_rotate_logfile_v2(PG_FUNCTION_ARGS)
+{
+ if (!Logging_collector)
+ {
+ ereport(WARNING,
+ (errmsg("rotation not possible because log collection not active")));
+ PG_RETURN_BOOL(false);
+ }
+
+ SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+ PG_RETURN_BOOL(true);
+}
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
new file mode 100644
index 0000000..b405f08
--- /dev/null
+++ b/src/backend/storage/ipc/sinval.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.c
+ * POSTGRES shared cache invalidation communication code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinval.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+uint64 SharedInvalidMessageCounter;
+
+
+/*
+ * Because backends sitting idle will not be reading sinval events, we
+ * need a way to give an idle backend a swift kick in the rear and make
+ * it catch up before the sinval queue overflows and forces it to go
+ * through a cache reset exercise. This is done by sending
+ * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind.
+ *
+ * The signal handler will set an interrupt pending flag and will set the
+ * process's latch. Whenever starting to read from the client, or when
+ * interrupted while doing so, ProcessClientReadInterrupt() will call
+ * ProcessCatchupEvent().
+ */
+volatile sig_atomic_t catchupInterruptPending = false;
+
+
+/*
+ * SendSharedInvalidMessages
+ * Add shared-cache-invalidation message(s) to the global SI message queue.
+ */
+void
+SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n)
+{
+ SIInsertDataEntries(msgs, n);
+}
+
+/*
+ * ReceiveSharedInvalidMessages
+ * Process shared-cache-invalidation messages waiting for this backend
+ *
+ * We guarantee to process all messages that had been queued before the
+ * routine was entered. It is of course possible for more messages to get
+ * queued right after our last SIGetDataEntries call.
+ *
+ * NOTE: it is entirely possible for this routine to be invoked recursively
+ * as a consequence of processing inside the invalFunction or resetFunction.
+ * Furthermore, such a recursive call must guarantee that all outstanding
+ * inval messages have been processed before it exits. This is the reason
+ * for the strange-looking choice to use a statically allocated buffer array
+ * and counters; it's so that a recursive call can process messages already
+ * sucked out of sinvaladt.c.
+ */
+void
+ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg),
+ void (*resetFunction) (void))
+{
+#define MAXINVALMSGS 32
+ static SharedInvalidationMessage messages[MAXINVALMSGS];
+
+ /*
+ * We use volatile here to prevent bugs if a compiler doesn't realize that
+ * recursion is a possibility ...
+ */
+ static volatile int nextmsg = 0;
+ static volatile int nummsgs = 0;
+
+ /* Deal with any messages still pending from an outer recursion */
+ while (nextmsg < nummsgs)
+ {
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ do
+ {
+ int getResult;
+
+ nextmsg = nummsgs = 0;
+
+ /* Try to get some more messages */
+ getResult = SIGetDataEntries(messages, MAXINVALMSGS);
+
+ if (getResult < 0)
+ {
+ /* got a reset message */
+ elog(DEBUG4, "cache state reset");
+ SharedInvalidMessageCounter++;
+ resetFunction();
+ break; /* nothing more to do */
+ }
+
+ /* Process them, being wary that a recursive call might eat some */
+ nextmsg = 0;
+ nummsgs = getResult;
+
+ while (nextmsg < nummsgs)
+ {
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ /*
+ * We only need to loop if the last SIGetDataEntries call (which might
+ * have been within a recursive call) returned a full buffer.
+ */
+ } while (nummsgs == MAXINVALMSGS);
+
+ /*
+ * We are now caught up. If we received a catchup signal, reset that
+ * flag, and call SICleanupQueue(). This is not so much because we need
+ * to flush dead messages right now, as that we want to pass on the
+ * catchup signal to the next slowest backend. "Daisy chaining" the
+ * catchup signal this way avoids creating spikes in system load for what
+ * should be just a background maintenance activity.
+ */
+ if (catchupInterruptPending)
+ {
+ catchupInterruptPending = false;
+ elog(DEBUG4, "sinval catchup complete, cleaning queue");
+ SICleanupQueue(false, 0);
+ }
+}
+
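+/*
+ * Callers supply the two callbacks; for example, the invalidation code in
+ * inval.c drains the queue roughly like this (shown here only as a sketch):
+ *
+ *     ReceiveSharedInvalidMessages(LocalExecuteInvalidationMessage,
+ *                                  InvalidateSystemCaches);
+ */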
+
+/*
+ * HandleCatchupInterrupt
+ *
+ * This is called when PROCSIG_CATCHUP_INTERRUPT is received.
+ *
+ * We used to call ProcessCatchupEvent directly when idle. These days
+ * we just set a flag to do it later and notify the process of that fact by
+ * setting the process's latch.
+ */
+void
+HandleCatchupInterrupt(void)
+{
+ /*
+ * Note: this is called by a SIGNAL HANDLER. You must be very wary what
+ * you do here.
+ */
+
+ catchupInterruptPending = true;
+
+ /* make sure the event is processed in due course */
+ SetLatch(MyLatch);
+}
+
+/*
+ * ProcessCatchupInterrupt
+ *
+ * The portion of catchup interrupt handling that runs outside of the signal
+ * handler, which allows it to actually process pending invalidations.
+ */
+void
+ProcessCatchupInterrupt(void)
+{
+ while (catchupInterruptPending)
+ {
+ /*
+ * What we need to do here is cause ReceiveSharedInvalidMessages() to
+ * run, which will do the necessary work and also reset the
+ * catchupInterruptPending flag. If we are inside a transaction we
+ * can just call AcceptInvalidationMessages() to do this. If we
+ * aren't, we start and immediately end a transaction; the call to
+ * AcceptInvalidationMessages() happens down inside transaction start.
+ *
+ * It is awfully tempting to just call AcceptInvalidationMessages()
+ * without the rest of the xact start/stop overhead, and I think that
+ * would actually work in the normal case; but I am not sure that
+ * things would clean up nicely if we got an error partway through.
+ */
+ if (IsTransactionOrTransactionBlock())
+ {
+ elog(DEBUG4, "ProcessCatchupEvent inside transaction");
+ AcceptInvalidationMessages();
+ }
+ else
+ {
+ elog(DEBUG4, "ProcessCatchupEvent outside transaction");
+ StartTransactionCommand();
+ CommitTransactionCommand();
+ }
+ }
+}
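+
+/*
+ * Summary sketch of the intended flow (assuming the usual procsignal
+ * dispatch): SICleanupQueue() sends PROCSIG_CATCHUP_INTERRUPT to the
+ * furthest-behind backend; the recipient's signal handling calls
+ * HandleCatchupInterrupt(), which merely sets catchupInterruptPending and
+ * the latch; when that backend next waits for client input,
+ * ProcessCatchupInterrupt() runs and drains the queue via
+ * AcceptInvalidationMessages(); finally, ReceiveSharedInvalidMessages()
+ * calls SICleanupQueue() again, daisy-chaining the catchup signal to the
+ * next-slowest backend if needed.
+ */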
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
new file mode 100644
index 0000000..3d97c75
--- /dev/null
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -0,0 +1,791 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.c
+ * POSTGRES shared cache invalidation data manager.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinvaladt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "storage/backendid.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+
+/*
+ * Conceptually, the shared cache invalidation messages are stored in an
+ * infinite array, where maxMsgNum is the next array subscript to store a
+ * submitted message in, minMsgNum is the smallest array subscript containing
+ * a message not yet read by all backends, and we always have maxMsgNum >=
+ * minMsgNum. (They are equal when there are no messages pending.) For each
+ * active backend, there is a nextMsgNum pointer indicating the next message it
+ * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every
+ * backend.
+ *
+ * (In the current implementation, minMsgNum is a lower bound for the
+ * per-process nextMsgNum values, but it isn't rigorously kept equal to the
+ * smallest nextMsgNum --- it may lag behind. We only update it when
+ * SICleanupQueue is called, and we try not to do that often.)
+ *
+ * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES
+ * entries. We translate MsgNum values into circular-buffer indexes by
+ * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as
+ * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum
+ * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space
+ * in the buffer. If the buffer does overflow, we recover by setting the
+ * "reset" flag for each backend that has fallen too far behind. A backend
+ * that is in "reset" state is ignored while determining minMsgNum. When
+ * it does finally attempt to receive inval messages, it must discard all
+ * its invalidatable state, since it won't know what it missed.
+ *
+ * To reduce the probability of needing resets, we send a "catchup" interrupt
+ * to any backend that seems to be falling unreasonably far behind. The
+ * normal behavior is that at most one such interrupt is in flight at a time;
+ * when a backend completes processing a catchup interrupt, it executes
+ * SICleanupQueue, which will signal the next-furthest-behind backend if
+ * needed. This avoids undue contention from multiple backends all trying
+ * to catch up at once. However, the furthest-back backend might be stuck
+ * in a state where it can't catch up. Eventually it will get reset, so it
+ * won't cause any more problems for anyone but itself. But we don't want
+ * to find that a bunch of other backends are now too close to the reset
+ * threshold to be saved. So SICleanupQueue is designed to occasionally
+ * send extra catchup interrupts as the queue gets fuller, to backends that
+ * are far behind and haven't gotten one yet. As long as there aren't a lot
+ * of "stuck" backends, we won't need a lot of extra interrupts, since ones
+ * that aren't stuck will propagate their interrupts to the next guy.
+ *
+ * We would have problems if the MsgNum values overflow an integer, so
+ * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND
+ * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be
+ * large so that we don't need to do this often. It must be a multiple of
+ * MAXNUMMESSAGES so that the existing circular-buffer entries don't need
+ * to be moved when we do it.
+ *
+ * Access to the shared sinval array is protected by two locks, SInvalReadLock
+ * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this
+ * authorizes them to modify their own ProcState but not to modify or even
+ * look at anyone else's. When we need to perform array-wide updates,
+ * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to
+ * lock out all readers. Writers take SInvalWriteLock (always in exclusive
+ * mode) to serialize adding messages to the queue. Note that a writer
+ * can operate in parallel with one or more readers, because the writer
+ * has no need to touch anyone's ProcState, except in the infrequent cases
+ * when SICleanupQueue is needed. The only point of overlap is that
+ * the writer wants to change maxMsgNum while readers need to read it.
+ * We deal with that by having a spinlock that readers must take for just
+ * long enough to read maxMsgNum, while writers take it for just long enough
+ * to write maxMsgNum. (The exact rule is that you need the spinlock to
+ * read maxMsgNum if you are not holding SInvalWriteLock, and you need the
+ * spinlock to write maxMsgNum unless you are holding both locks.)
+ *
+ * Note: since maxMsgNum is an int and hence presumably atomically readable/
+ * writable, the spinlock might seem unnecessary. The reason it is needed
+ * is to provide a memory barrier: we need to be sure that messages written
+ * to the array are actually there before maxMsgNum is increased, and that
+ * readers will see that data after fetching maxMsgNum. Multiprocessors
+ * that have weak memory-ordering guarantees can fail without the memory
+ * barrier instructions that are included in the spinlock sequences.
+ */
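+
+/*
+ * Illustrative sketch only, not part of this patch: the scheme described
+ * above boils down to arithmetic of this shape.  Slot lookup is cheap
+ * because MAXNUMMESSAGES is a power-of-2 constant, and the wraparound
+ * adjustment leaves slot numbers unchanged because MSGNUMWRAPAROUND is a
+ * multiple of MAXNUMMESSAGES:
+ *
+ *	slot = msgNum % MAXNUMMESSAGES;
+ *
+ *	if (minMsgNum >= MSGNUMWRAPAROUND)
+ *	{
+ *		minMsgNum -= MSGNUMWRAPAROUND;
+ *		maxMsgNum -= MSGNUMWRAPAROUND;
+ *		// each backend's nextMsgNum is decremented likewise; since
+ *		// (x - MSGNUMWRAPAROUND) % MAXNUMMESSAGES == x % MAXNUMMESSAGES,
+ *		// no circular-buffer entries have to move
+ *	}
+ */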
+
+
+/*
+ * Configurable parameters.
+ *
+ * MAXNUMMESSAGES: max number of shared-inval messages we can buffer.
+ * Must be a power of 2 for speed.
+ *
+ * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow.
+ * Must be a multiple of MAXNUMMESSAGES. Should be large.
+ *
+ * CLEANUP_MIN: the minimum number of messages that must be in the buffer
+ * before we bother to call SICleanupQueue.
+ *
+ * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once
+ * we exceed CLEANUP_MIN. Should be a power of 2 for speed.
+ *
+ * SIG_THRESHOLD: the minimum number of messages a backend must have fallen
+ * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT.
+ *
+ * WRITE_QUANTUM: the max number of messages to push into the buffer per
+ * iteration of SIInsertDataEntries. Noncritical but should be less than
+ * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once
+ * per iteration.
+ */
+
+#define MAXNUMMESSAGES 4096
+#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144)
+#define CLEANUP_MIN (MAXNUMMESSAGES / 2)
+#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16)
+#define SIG_THRESHOLD (MAXNUMMESSAGES / 2)
+#define WRITE_QUANTUM 64
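+
+/*
+ * With the settings above these work out to MSGNUMWRAPAROUND = 2^30,
+ * CLEANUP_MIN = 2048, CLEANUP_QUANTUM = 256, and SIG_THRESHOLD = 2048
+ * messages, so wraparound handling is needed only rarely.
+ */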
+
+/* Per-backend state in shared invalidation structure */
+typedef struct ProcState
+{
+ /* procPid is zero in an inactive ProcState array entry. */
+ pid_t procPid; /* PID of backend, for signaling */
+ PGPROC *proc; /* PGPROC of backend */
+ /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */
+ int nextMsgNum; /* next message number to read */
+ bool resetState; /* backend needs to reset its state */
+ bool signaled; /* backend has been sent catchup signal */
+ bool hasMessages; /* backend has unread messages */
+
+ /*
+ * Backend only sends invalidations, never receives them. This only makes
+ * sense for Startup process during recovery because it doesn't maintain a
+ * relcache, yet it fires inval messages to allow query backends to see
+ * schema changes.
+ */
+ bool sendOnly; /* backend only sends, never receives */
+
+ /*
+ * Next LocalTransactionId to use for each idle backend slot. We keep
+ * this here because it is indexed by BackendId and it is convenient to
+ * copy the value to and from local memory when MyBackendId is set. It's
+ * meaningless in an active ProcState entry.
+ */
+ LocalTransactionId nextLXID;
+} ProcState;
+
+/* Shared cache invalidation memory segment */
+typedef struct SISeg
+{
+ /*
+ * General state information
+ */
+ int minMsgNum; /* oldest message still needed */
+ int maxMsgNum; /* next message number to be assigned */
+ int nextThreshold; /* # of messages to call SICleanupQueue */
+ int lastBackend; /* index of last active procState entry, +1 */
+ int maxBackends; /* size of procState array */
+
+ slock_t msgnumLock; /* spinlock protecting maxMsgNum */
+
+ /*
+ * Circular buffer holding shared-inval messages
+ */
+ SharedInvalidationMessage buffer[MAXNUMMESSAGES];
+
+ /*
+ * Per-backend invalidation state info (has MaxBackends entries).
+ */
+ ProcState procState[FLEXIBLE_ARRAY_MEMBER];
+} SISeg;
+
+static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */
+
+
+static LocalTransactionId nextLocalTransactionId;
+
+static void CleanupInvalidationState(int status, Datum arg);
+
+
+/*
+ * SInvalShmemSize --- return shared-memory space needed
+ */
+Size
+SInvalShmemSize(void)
+{
+ Size size;
+
+ size = offsetof(SISeg, procState);
+
+ /*
+ * In Hot Standby mode, the startup process requests a procState array
+ * slot using InitRecoveryTransactionEnvironment(). Even though
+ * MaxBackends doesn't account for the startup process, it is guaranteed
+ * to get a free slot. This is because the autovacuum launcher and worker
+ * processes, which are included in MaxBackends, are not started in Hot
+ * Standby mode.
+ */
+ size = add_size(size, mul_size(sizeof(ProcState), MaxBackends));
+
+ return size;
+}
+
+/*
+ * CreateSharedInvalidationState
+ * Create and initialize the SI message buffer
+ */
+void
+CreateSharedInvalidationState(void)
+{
+ int i;
+ bool found;
+
+ /* Allocate space in shared memory */
+ shmInvalBuffer = (SISeg *)
+ ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found);
+ if (found)
+ return;
+
+ /* Clear message counters, save size of procState array, init spinlock */
+ shmInvalBuffer->minMsgNum = 0;
+ shmInvalBuffer->maxMsgNum = 0;
+ shmInvalBuffer->nextThreshold = CLEANUP_MIN;
+ shmInvalBuffer->lastBackend = 0;
+ shmInvalBuffer->maxBackends = MaxBackends;
+ SpinLockInit(&shmInvalBuffer->msgnumLock);
+
+ /* The buffer[] array is initially all unused, so we need not fill it */
+
+ /* Mark all backends inactive, and initialize nextLXID */
+ for (i = 0; i < shmInvalBuffer->maxBackends; i++)
+ {
+ shmInvalBuffer->procState[i].procPid = 0; /* inactive */
+ shmInvalBuffer->procState[i].proc = NULL;
+ shmInvalBuffer->procState[i].nextMsgNum = 0; /* meaningless */
+ shmInvalBuffer->procState[i].resetState = false;
+ shmInvalBuffer->procState[i].signaled = false;
+ shmInvalBuffer->procState[i].hasMessages = false;
+ shmInvalBuffer->procState[i].nextLXID = InvalidLocalTransactionId;
+ }
+}
+
+/*
+ * SharedInvalBackendInit
+ * Initialize a new backend to operate on the sinval buffer
+ */
+void
+SharedInvalBackendInit(bool sendOnly)
+{
+ int index;
+ ProcState *stateP = NULL;
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * This can run in parallel with read operations, but not with write
+ * operations, since SIInsertDataEntries relies on lastBackend to set
+ * hasMessages appropriately.
+ */
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /* Look for a free entry in the procState array */
+ for (index = 0; index < segP->lastBackend; index++)
+ {
+ if (segP->procState[index].procPid == 0) /* inactive slot? */
+ {
+ stateP = &segP->procState[index];
+ break;
+ }
+ }
+
+ if (stateP == NULL)
+ {
+ if (segP->lastBackend < segP->maxBackends)
+ {
+ stateP = &segP->procState[segP->lastBackend];
+ Assert(stateP->procPid == 0);
+ segP->lastBackend++;
+ }
+ else
+ {
+ /*
+ * out of procState slots: MaxBackends exceeded -- report normally
+ */
+ MyBackendId = InvalidBackendId;
+ LWLockRelease(SInvalWriteLock);
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+ }
+
+ MyBackendId = (stateP - &segP->procState[0]) + 1;
+
+ /* Advertise assigned backend ID in MyProc */
+ MyProc->backendId = MyBackendId;
+
+ /* Fetch next local transaction ID into local memory */
+ nextLocalTransactionId = stateP->nextLXID;
+
+ /* mark myself active, with all extant messages already read */
+ stateP->procPid = MyProcPid;
+ stateP->proc = MyProc;
+ stateP->nextMsgNum = segP->maxMsgNum;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ stateP->hasMessages = false;
+ stateP->sendOnly = sendOnly;
+
+ LWLockRelease(SInvalWriteLock);
+
+ /* register exit routine to mark my entry inactive at exit */
+ on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP));
+
+ elog(DEBUG4, "my backend ID is %d", MyBackendId);
+}
+
+/*
+ * CleanupInvalidationState
+ * Mark the current backend as no longer active.
+ *
+ * This function is called via on_shmem_exit() during backend shutdown.
+ *
+ * arg is really of type "SISeg*".
+ */
+static void
+CleanupInvalidationState(int status, Datum arg)
+{
+ SISeg *segP = (SISeg *) DatumGetPointer(arg);
+ ProcState *stateP;
+ int i;
+
+ Assert(PointerIsValid(segP));
+
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ stateP = &segP->procState[MyBackendId - 1];
+
+ /* Update next local transaction ID for next holder of this backendID */
+ stateP->nextLXID = nextLocalTransactionId;
+
+ /* Mark myself inactive */
+ stateP->procPid = 0;
+ stateP->proc = NULL;
+ stateP->nextMsgNum = 0;
+ stateP->resetState = false;
+ stateP->signaled = false;
+
+ /* Recompute index of last active backend */
+ for (i = segP->lastBackend; i > 0; i--)
+ {
+ if (segP->procState[i - 1].procPid != 0)
+ break;
+ }
+ segP->lastBackend = i;
+
+ LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * BackendIdGetProc
+ * Get the PGPROC structure for a backend, given the backend ID.
+ * The result may be out of date arbitrarily quickly, so the caller
+ * must be careful about how this information is used. NULL is
+ * returned if the backend is not active.
+ */
+PGPROC *
+BackendIdGetProc(int backendID)
+{
+ PGPROC *result = NULL;
+ SISeg *segP = shmInvalBuffer;
+
+ /* Need to lock out additions/removals of backends */
+ LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+ if (backendID > 0 && backendID <= segP->lastBackend)
+ {
+ ProcState *stateP = &segP->procState[backendID - 1];
+
+ result = stateP->proc;
+ }
+
+ LWLockRelease(SInvalWriteLock);
+
+ return result;
+}
+
+/*
+ * BackendIdGetTransactionIds
+ * Get the xid, xmin, nsubxid and overflow status of the backend. The
+ * result may be out of date arbitrarily quickly, so the caller must be
+ * careful about how this information is used.
+ */
+void
+BackendIdGetTransactionIds(int backendID, TransactionId *xid,
+ TransactionId *xmin, int *nsubxid, bool *overflowed)
+{
+ SISeg *segP = shmInvalBuffer;
+
+ *xid = InvalidTransactionId;
+ *xmin = InvalidTransactionId;
+ *nsubxid = 0;
+ *overflowed = false;
+
+ /* Need to lock out additions/removals of backends */
+ LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+ if (backendID > 0 && backendID <= segP->lastBackend)
+ {
+ ProcState *stateP = &segP->procState[backendID - 1];
+ PGPROC *proc = stateP->proc;
+
+ if (proc != NULL)
+ {
+ *xid = proc->xid;
+ *xmin = proc->xmin;
+ *nsubxid = proc->subxidStatus.count;
+ *overflowed = proc->subxidStatus.overflowed;
+ }
+ }
+
+ LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * SIInsertDataEntries
+ * Add new invalidation message(s) to the buffer.
+ */
+void
+SIInsertDataEntries(const SharedInvalidationMessage *data, int n)
+{
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * N can be arbitrarily large. We divide the work into groups of no more
+ * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for
+ * an unreasonably long time. (This is not so much because we care about
+ * letting in other writers, as that some just-caught-up backend might be
+ * trying to do SICleanupQueue to pass on its signal, and we don't want it
+ * to have to wait a long time.) Also, we need to consider calling
+ * SICleanupQueue every so often.
+ */
+ while (n > 0)
+ {
+ int nthistime = Min(n, WRITE_QUANTUM);
+ int numMsgs;
+ int max;
+ int i;
+
+ n -= nthistime;
+
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /*
+ * If the buffer is full, we *must* acquire some space. Clean the
+ * queue and reset anyone who is preventing space from being freed.
+ * Otherwise, clean the queue only when it's exceeded the next
+ * fullness threshold. We have to loop and recheck the buffer state
+ * after any call of SICleanupQueue.
+ */
+ for (;;)
+ {
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs + nthistime > MAXNUMMESSAGES ||
+ numMsgs >= segP->nextThreshold)
+ SICleanupQueue(true, nthistime);
+ else
+ break;
+ }
+
+ /*
+ * Insert new message(s) into proper slot of circular buffer
+ */
+ max = segP->maxMsgNum;
+ while (nthistime-- > 0)
+ {
+ segP->buffer[max % MAXNUMMESSAGES] = *data++;
+ max++;
+ }
+
+ /* Update current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ segP->maxMsgNum = max;
+ SpinLockRelease(&segP->msgnumLock);
+
+ /*
+ * Now that the maxMsgNum change is globally visible, we give everyone
+ * a swift kick to make sure they read the newly added messages.
+ * Releasing SInvalWriteLock will enforce a full memory barrier, so
+ * these (unlocked) changes will be committed to memory before we exit
+ * the function.
+ */
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+
+ stateP->hasMessages = true;
+ }
+
+ LWLockRelease(SInvalWriteLock);
+ }
+}
+
+/*
+ * SIGetDataEntries
+ * get next SI message(s) for current backend, if there are any
+ *
+ * Possible return values:
+ * 0: no SI message available
+ * n>0: next n SI messages have been extracted into data[]
+ * -1: SI reset message extracted
+ *
+ * If the return value is less than the array size "datasize", the caller
+ * can assume that there are no more SI messages after the one(s) returned.
+ * Otherwise, another call is needed to collect more messages.
+ *
+ * NB: this can run in parallel with other instances of SIGetDataEntries
+ * executing on behalf of other backends, since each instance will modify only
+ * fields of its own backend's ProcState, and no instance will look at fields
+ * of other backends' ProcStates. We express this by grabbing SInvalReadLock
+ * in shared mode. Note that this is not exactly the normal (read-only)
+ * interpretation of a shared lock! Look closely at the interactions before
+ * allowing SInvalReadLock to be grabbed in shared mode for any other reason!
+ *
+ * NB: this can also run in parallel with SIInsertDataEntries. It is not
+ * guaranteed that we will return any messages added after the routine is
+ * entered.
+ *
+ * Note: we assume that "datasize" is not so large that it might be important
+ * to break our hold on SInvalReadLock into segments.
+ */
+int
+SIGetDataEntries(SharedInvalidationMessage *data, int datasize)
+{
+ SISeg *segP;
+ ProcState *stateP;
+ int max;
+ int n;
+
+ segP = shmInvalBuffer;
+ stateP = &segP->procState[MyBackendId - 1];
+
+ /*
+ * Before starting to take locks, do a quick, unlocked test to see whether
+ * there can possibly be anything to read. On a multiprocessor system,
+ * it's possible that this load could migrate backwards and occur before
+ * we actually enter this function, so we might miss a sinval message that
+ * was just added by some other processor. But they can't migrate
+	 * backwards over a preceding lock acquisition, so it should be OK. If we
+	 * haven't acquired a lock that prevents further relevant invalidations,
+	 * any such occurrence is not much different than if the invalidation had
+	 * arrived slightly later in the first place.
+ */
+ if (!stateP->hasMessages)
+ return 0;
+
+ LWLockAcquire(SInvalReadLock, LW_SHARED);
+
+ /*
+ * We must reset hasMessages before determining how many messages we're
+ * going to read. That way, if new messages arrive after we have
+	 * determined how many we're reading, the writer will set the flag again
+	 * and we'll notice those messages on a later call.
+ *
+ * Note that, if we don't end up reading all of the messages, we had
+ * better be certain to reset this flag before exiting!
+ */
+ stateP->hasMessages = false;
+
+ /* Fetch current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ max = segP->maxMsgNum;
+ SpinLockRelease(&segP->msgnumLock);
+
+ if (stateP->resetState)
+ {
+ /*
+ * Force reset. We can say we have dealt with any messages added
+ * since the reset, as well; and that means we should clear the
+ * signaled flag, too.
+ */
+ stateP->nextMsgNum = max;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ LWLockRelease(SInvalReadLock);
+ return -1;
+ }
+
+ /*
+ * Retrieve messages and advance backend's counter, until data array is
+ * full or there are no more messages.
+ *
+ * There may be other backends that haven't read the message(s), so we
+ * cannot delete them here. SICleanupQueue() will eventually remove them
+ * from the queue.
+ */
+ n = 0;
+ while (n < datasize && stateP->nextMsgNum < max)
+ {
+ data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES];
+ stateP->nextMsgNum++;
+ }
+
+ /*
+ * If we have caught up completely, reset our "signaled" flag so that
+ * we'll get another signal if we fall behind again.
+ *
+ * If we haven't caught up completely, reset the hasMessages flag so that
+ * we see the remaining messages next time.
+ */
+ if (stateP->nextMsgNum >= max)
+ stateP->signaled = false;
+ else
+ stateP->hasMessages = true;
+
+ LWLockRelease(SInvalReadLock);
+ return n;
+}
+
+/*
+ * SICleanupQueue
+ * Remove messages that have been consumed by all active backends
+ *
+ * callerHasWriteLock is true if caller is holding SInvalWriteLock.
+ * minFree is the minimum number of message slots to make free.
+ *
+ * Possible side effects of this routine include marking one or more
+ * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT
+ * to some backend that seems to be getting too far behind. We signal at
+ * most one backend at a time, for reasons explained at the top of the file.
+ *
+ * Caution: because we transiently release write lock when we have to signal
+ * some other backend, it is NOT guaranteed that there are still minFree
+ * free message slots at exit. Caller must recheck and perhaps retry.
+ */
+void
+SICleanupQueue(bool callerHasWriteLock, int minFree)
+{
+ SISeg *segP = shmInvalBuffer;
+ int min,
+ minsig,
+ lowbound,
+ numMsgs,
+ i;
+ ProcState *needSig = NULL;
+
+ /* Lock out all writers and readers */
+ if (!callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE);
+
+ /*
+ * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the
+ * furthest-back backend that needs signaling (if any), and reset any
+ * backends that are too far back. Note that because we ignore sendOnly
+ * backends here it is possible for them to keep sending messages without
+ * a problem even when they are the only active backend.
+ */
+ min = segP->maxMsgNum;
+ minsig = min - SIG_THRESHOLD;
+ lowbound = min - MAXNUMMESSAGES + minFree;
+
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+ int n = stateP->nextMsgNum;
+
+ /* Ignore if inactive or already in reset state */
+ if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
+ continue;
+
+ /*
+ * If we must free some space and this backend is preventing it, force
+ * him into reset state and then ignore until he catches up.
+ */
+ if (n < lowbound)
+ {
+ stateP->resetState = true;
+ /* no point in signaling him ... */
+ continue;
+ }
+
+ /* Track the global minimum nextMsgNum */
+ if (n < min)
+ min = n;
+
+ /* Also see who's furthest back of the unsignaled backends */
+ if (n < minsig && !stateP->signaled)
+ {
+ minsig = n;
+ needSig = stateP;
+ }
+ }
+ segP->minMsgNum = min;
+
+ /*
+ * When minMsgNum gets really large, decrement all message counters so as
+ * to forestall overflow of the counters. This happens seldom enough that
+ * folding it into the previous loop would be a loser.
+ */
+ if (min >= MSGNUMWRAPAROUND)
+ {
+ segP->minMsgNum -= MSGNUMWRAPAROUND;
+ segP->maxMsgNum -= MSGNUMWRAPAROUND;
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ /* we don't bother skipping inactive entries here */
+ segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND;
+ }
+ }
+
+ /*
+ * Determine how many messages are still in the queue, and set the
+ * threshold at which we should repeat SICleanupQueue().
+ */
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs < CLEANUP_MIN)
+ segP->nextThreshold = CLEANUP_MIN;
+ else
+ segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM;
+
+ /*
+ * Lastly, signal anyone who needs a catchup interrupt. Since
+ * SendProcSignal() might not be fast, we don't want to hold locks while
+ * executing it.
+ */
+ if (needSig)
+ {
+ pid_t his_pid = needSig->procPid;
+ BackendId his_backendId = (needSig - &segP->procState[0]) + 1;
+
+ needSig->signaled = true;
+ LWLockRelease(SInvalReadLock);
+ LWLockRelease(SInvalWriteLock);
+ elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid);
+ SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId);
+ if (callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ LWLockRelease(SInvalReadLock);
+ if (!callerHasWriteLock)
+ LWLockRelease(SInvalWriteLock);
+ }
+}
+
+
+/*
+ * GetNextLocalTransactionId --- allocate a new LocalTransactionId
+ *
+ * We split VirtualTransactionIds into two parts so that it is possible
+ * to allocate a new one without any contention for shared memory, except
+ * for a bit of additional overhead during backend startup/shutdown.
+ * The high-order part of a VirtualTransactionId is a BackendId, and the
+ * low-order part is a LocalTransactionId, which we assign from a local
+ * counter. To avoid the risk of a VirtualTransactionId being reused
+ * within a short interval, successive procs occupying the same backend ID
+ * slot should use a consecutive sequence of local IDs, which is implemented
+ * by copying nextLocalTransactionId as seen above.
+ */
+LocalTransactionId
+GetNextLocalTransactionId(void)
+{
+ LocalTransactionId result;
+
+ /* loop to avoid returning InvalidLocalTransactionId at wraparound */
+ do
+ {
+ result = nextLocalTransactionId++;
+ } while (!LocalTransactionIdIsValid(result));
+
+ return result;
+}
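+
+/*
+ * Illustrative sketch only, not part of this patch: a backend builds its
+ * VirtualTransactionId from the two halves described above, roughly
+ *
+ *	VirtualTransactionId vxid;
+ *
+ *	vxid.backendId = MyBackendId;
+ *	vxid.localTransactionId = GetNextLocalTransactionId();
+ *
+ * (compare InitRecoveryTransactionEnvironment() in standby.c).
+ */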
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
new file mode 100644
index 0000000..4c06741
--- /dev/null
+++ b/src/backend/storage/ipc/standby.c
@@ -0,0 +1,1518 @@
+/*-------------------------------------------------------------------------
+ *
+ * standby.c
+ * Misc functions used in Hot Standby mode.
+ *
+ * All functions for handling RM_STANDBY_ID, which relate to
+ * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
+ * Plus conflict recovery processing.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/standby.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "access/xlogrecovery.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/slot.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/standby.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* User-settable GUC parameters */
+int max_standby_archive_delay = 30 * 1000;
+int max_standby_streaming_delay = 30 * 1000;
+bool log_recovery_conflict_waits = false;
+
+/*
+ * Keep track of all the exclusive locks owned by original transactions.
+ * For each known exclusive lock, there is a RecoveryLockEntry in the
+ * RecoveryLockHash hash table. All RecoveryLockEntrys belonging to a
+ * given XID are chained together so that we can find them easily.
+ * For each original transaction that is known to have any such locks,
+ * there is a RecoveryLockXidEntry in the RecoveryLockXidHash hash table,
+ * which stores the head of the chain of its locks.
+ */
+typedef struct RecoveryLockEntry
+{
+ xl_standby_lock key; /* hash key: xid, dbOid, relOid */
+ struct RecoveryLockEntry *next; /* chain link */
+} RecoveryLockEntry;
+
+typedef struct RecoveryLockXidEntry
+{
+ TransactionId xid; /* hash key -- must be first */
+ struct RecoveryLockEntry *head; /* chain head */
+} RecoveryLockXidEntry;
+
+static HTAB *RecoveryLockHash = NULL;
+static HTAB *RecoveryLockXidHash = NULL;
+
+/* Flags set by timeout handlers */
+static volatile sig_atomic_t got_standby_deadlock_timeout = false;
+static volatile sig_atomic_t got_standby_delay_timeout = false;
+static volatile sig_atomic_t got_standby_lock_timeout = false;
+
+static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason,
+ uint32 wait_event_info,
+ bool report_waiting);
+static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
+static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
+static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
+static const char *get_recovery_conflict_desc(ProcSignalReason reason);
+
+/*
+ * InitRecoveryTransactionEnvironment
+ * Initialize tracking of our primary's in-progress transactions.
+ *
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need to make a lock table
+ * vxact entry like a real transaction. We could create and delete
+ * lock table entries for each transaction, but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+ VirtualTransactionId vxid;
+ HASHCTL hash_ctl;
+
+ Assert(RecoveryLockHash == NULL); /* don't run this twice */
+
+ /*
+ * Initialize the hash tables for tracking the locks held by each
+ * transaction.
+ */
+ hash_ctl.keysize = sizeof(xl_standby_lock);
+ hash_ctl.entrysize = sizeof(RecoveryLockEntry);
+ RecoveryLockHash = hash_create("RecoveryLockHash",
+ 64,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+ hash_ctl.keysize = sizeof(TransactionId);
+ hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
+ RecoveryLockXidHash = hash_create("RecoveryLockXidHash",
+ 64,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+
+ /*
+ * Initialize shared invalidation management for Startup process, being
+ * careful to register ourselves as a sendOnly process so we don't need to
+ * read messages, nor will we get signaled when the queue starts filling
+ * up.
+ */
+ SharedInvalBackendInit(true);
+
+ /*
+ * Lock a virtual transaction id for Startup process.
+ *
+ * We need to do GetNextLocalTransactionId() because
+ * SharedInvalBackendInit() leaves localTransactionId invalid and the lock
+ * manager doesn't like that at all.
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ standbyState = STANDBY_INITIALIZED;
+}
+
+/*
+ * ShutdownRecoveryTransactionEnvironment
+ * Shut down transaction tracking
+ *
+ * Prepare to switch from hot standby mode to normal operation. Shut down
+ * recovery-time transaction tracking.
+ *
+ * This must be called even during shutdown of the startup process if
+ * transaction tracking has been initialized. Otherwise some locks the
+ * tracked transactions were holding will not be released, and may interfere
+ * with processes that are still running (though they will exit soon after
+ * the startup process does).
+ */
+void
+ShutdownRecoveryTransactionEnvironment(void)
+{
+ /*
+ * Do nothing if RecoveryLockHash is NULL because that means that
+ * transaction tracking has not yet been initialized or has already been
+ * shut down. This makes it safe to have possibly-redundant calls of this
+ * function during process exit.
+ */
+ if (RecoveryLockHash == NULL)
+ return;
+
+ /* Mark all tracked in-progress transactions as finished. */
+ ExpireAllKnownAssignedTransactionIds();
+
+ /* Release all locks the tracked transactions were holding */
+ StandbyReleaseAllLocks();
+
+ /* Destroy the lock hash tables. */
+ hash_destroy(RecoveryLockHash);
+ hash_destroy(RecoveryLockXidHash);
+ RecoveryLockHash = NULL;
+ RecoveryLockXidHash = NULL;
+
+ /* Cleanup our VirtualTransaction */
+ VirtualXactLockTableCleanup();
+}
+
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+ TimestampTz rtime;
+ bool fromStream;
+
+ /*
+ * The cutoff time is the last WAL data receipt time plus the appropriate
+ * delay variable. Delay of -1 means wait forever.
+ */
+ GetXLogReceiptTime(&rtime, &fromStream);
+ if (fromStream)
+ {
+ if (max_standby_streaming_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
+ }
+ else
+ {
+ if (max_standby_archive_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
+ }
+}
+
+#define STANDBY_INITIAL_WAIT_US 1000
+static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while then return. If we decide we can't wait any
+ * more, we return true; if we can wait some more, we return false.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
+{
+ TimestampTz ltime;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Are we past the limit time? */
+ ltime = GetStandbyLimitTime();
+ if (ltime && GetCurrentTimestamp() >= ltime)
+ return true;
+
+ /*
+ * Sleep a bit (this is essential to avoid busy-waiting).
+ */
+ pgstat_report_wait_start(wait_event_info);
+ pg_usleep(standbyWait_us);
+ pgstat_report_wait_end();
+
+ /*
+ * Progressively increase the sleep times, but not to more than 1s, since
+ * pg_usleep isn't interruptible on some platforms.
+ */
+ standbyWait_us *= 2;
+ if (standbyWait_us > 1000000)
+ standbyWait_us = 1000000;
+
+ return false;
+}
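+
+/*
+ * With STANDBY_INITIAL_WAIT_US = 1000, successive calls for the same
+ * victim transaction sleep 1 ms, 2 ms, 4 ms, ... and reach the 1 s cap
+ * after roughly ten calls, i.e. about one second of cumulative sleeping.
+ */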
+
+/*
+ * Log the recovery conflict.
+ *
+ * wait_start is the timestamp when the caller started to wait.
+ * now is the timestamp when this function has been called.
+ * wait_list is the list of virtual transaction ids assigned to
+ * conflicting processes. still_waiting indicates whether
+ * the startup process is still waiting for the recovery conflict
+ * to be resolved or not.
+ */
+void
+LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+ TimestampTz now, VirtualTransactionId *wait_list,
+ bool still_waiting)
+{
+ long secs;
+ int usecs;
+ long msecs;
+ StringInfoData buf;
+ int nprocs = 0;
+
+ /*
+ * There must be no conflicting processes when the recovery conflict has
+ * already been resolved.
+ */
+ Assert(still_waiting || wait_list == NULL);
+
+ TimestampDifference(wait_start, now, &secs, &usecs);
+ msecs = secs * 1000 + usecs / 1000;
+ usecs = usecs % 1000;
+
+ if (wait_list)
+ {
+ VirtualTransactionId *vxids;
+
+ /* Construct a string of list of the conflicting processes */
+ vxids = wait_list;
+ while (VirtualTransactionIdIsValid(*vxids))
+ {
+ PGPROC *proc = BackendIdGetProc(vxids->backendId);
+
+ /* proc can be NULL if the target backend is not active */
+ if (proc)
+ {
+ if (nprocs == 0)
+ {
+ initStringInfo(&buf);
+ appendStringInfo(&buf, "%d", proc->pid);
+ }
+ else
+ appendStringInfo(&buf, ", %d", proc->pid);
+
+ nprocs++;
+ }
+
+ vxids++;
+ }
+ }
+
+ /*
+ * If wait_list is specified, report the list of PIDs of active
+	 * conflicting backends in a detail message. Note that if none of the
+	 * backends in the list are still active, no detail message is logged.
+ */
+ if (still_waiting)
+ {
+ ereport(LOG,
+ errmsg("recovery still waiting after %ld.%03d ms: %s",
+ msecs, usecs, get_recovery_conflict_desc(reason)),
+ nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
+ "Conflicting processes: %s.",
+ nprocs, buf.data) : 0);
+ }
+ else
+ {
+ ereport(LOG,
+ errmsg("recovery finished waiting after %ld.%03d ms: %s",
+ msecs, usecs, get_recovery_conflict_desc(reason)));
+ }
+
+ if (nprocs > 0)
+ pfree(buf.data);
+}
+
+/*
+ * This is the main executioner for any query backend that conflicts with
+ * recovery processing. Judgement has already been passed on it within
+ * a specific rmgr. Here we just issue the orders to the procs. The procs
+ * then throw the required error as instructed.
+ *
+ * If report_waiting is true, "waiting" is reported in PS display and the
+ * wait for recovery conflict is reported in the log, if necessary. If
+ * the caller is responsible for reporting them, report_waiting should be
+ * false. Otherwise, both the caller and this function report the same
+ * thing unexpectedly.
+ */
+static void
+ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason, uint32 wait_event_info,
+ bool report_waiting)
+{
+ TimestampTz waitStart = 0;
+ bool waiting = false;
+ bool logged_recovery_conflict = false;
+
+ /* Fast exit, to avoid a kernel call if there's no work to be done. */
+ if (!VirtualTransactionIdIsValid(*waitlist))
+ return;
+
+ /* Set the wait start timestamp for reporting */
+ if (report_waiting && (log_recovery_conflict_waits || update_process_title))
+ waitStart = GetCurrentTimestamp();
+
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /* reset standbyWait_us for each xact we wait for */
+ standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+ /* wait until the virtual xid is gone */
+ while (!VirtualXactLock(*waitlist, false))
+ {
+ /* Is it time to kill it? */
+ if (WaitExceedsMaxStandbyDelay(wait_event_info))
+ {
+ pid_t pid;
+
+ /*
+ * Now find out who to throw out of the balloon.
+ */
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ pid = CancelVirtualTransaction(*waitlist, reason);
+
+ /*
+ * Wait a little bit for it to die so that we avoid flooding
+ * an unresponsive backend when system is heavily loaded.
+ */
+ if (pid != 0)
+ pg_usleep(5000L);
+ }
+
+ if (waitStart != 0 && (!logged_recovery_conflict || !waiting))
+ {
+ TimestampTz now = 0;
+ bool maybe_log_conflict;
+ bool maybe_update_title;
+
+ maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
+ maybe_update_title = (update_process_title && !waiting);
+
+				/* Get the current timestamp if we haven't reported yet */
+ if (maybe_log_conflict || maybe_update_title)
+ now = GetCurrentTimestamp();
+
+ /*
+ * Report via ps if we have been waiting for more than 500
+ * msec (should that be configurable?)
+ */
+ if (maybe_update_title &&
+ TimestampDifferenceExceeds(waitStart, now, 500))
+ {
+ set_ps_display_suffix("waiting");
+ waiting = true;
+ }
+
+ /*
+ * Emit the log message if the startup process is waiting
+ * longer than deadlock_timeout for recovery conflict.
+ */
+ if (maybe_log_conflict &&
+ TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
+ {
+ LogRecoveryConflict(reason, waitStart, now, waitlist, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+
+ /* The virtual transaction is gone now, wait for the next one */
+ waitlist++;
+ }
+
+ /*
+ * Emit the log message if recovery conflict was resolved but the startup
+ * process waited longer than deadlock_timeout for it.
+ */
+ if (logged_recovery_conflict)
+ LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /* reset ps display to remove the suffix if we added one */
+ if (waiting)
+ set_ps_display_remove_suffix();
+
+}
+
+/*
+ * Generate whatever recovery conflicts are needed to eliminate snapshots that
+ * might see XIDs <= snapshotConflictHorizon as still running.
+ *
+ * snapshotConflictHorizon cutoffs are our standard approach to generating
+ * granular recovery conflicts. Note that InvalidTransactionId values are
+ * interpreted as "definitely don't need any conflicts" here, which is a
+ * general convention that WAL records can (and often do) depend on.
+ */
+void
+ResolveRecoveryConflictWithSnapshot(TransactionId snapshotConflictHorizon,
+ bool isCatalogRel,
+ RelFileLocator locator)
+{
+ VirtualTransactionId *backends;
+
+ /*
+ * If we get passed InvalidTransactionId then we do nothing (no conflict).
+ *
+ * This can happen when replaying already-applied WAL records after a
+ * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
+ * record that marks as frozen a page which was already all-visible. It's
+ * also quite common with records generated during index deletion
+ * (original execution of the deletion can reason that a recovery conflict
+ * which is sufficient for the deletion operation must take place before
+ * replay of the deletion record itself).
+ */
+ if (!TransactionIdIsValid(snapshotConflictHorizon))
+ return;
+
+ Assert(TransactionIdIsNormal(snapshotConflictHorizon));
+ backends = GetConflictingVirtualXIDs(snapshotConflictHorizon,
+ locator.dbOid);
+ ResolveRecoveryConflictWithVirtualXIDs(backends,
+ PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
+ true);
+
+ /*
+ * Note that WaitExceedsMaxStandbyDelay() is not taken into account here
+ * (as opposed to ResolveRecoveryConflictWithVirtualXIDs() above). That
+ * seems OK, given that this kind of conflict should not normally be
+ * reached, e.g. due to using a physical replication slot.
+ */
+ if (wal_level >= WAL_LEVEL_LOGICAL && isCatalogRel)
+ InvalidateObsoleteReplicationSlots(RS_INVAL_HORIZON, 0, locator.dbOid,
+ snapshotConflictHorizon);
+}
+
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId snapshotConflictHorizon,
+ bool isCatalogRel,
+ RelFileLocator locator)
+{
+ /*
+ * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+ * so truncate the logged FullTransactionId. If the logged value is very
+ * old, so that XID wrap-around already happened on it, there can't be any
+ * snapshots that still see it.
+ */
+ FullTransactionId nextXid = ReadNextFullTransactionId();
+ uint64 diff;
+
+ diff = U64FromFullTransactionId(nextXid) -
+ U64FromFullTransactionId(snapshotConflictHorizon);
+ if (diff < MaxTransactionId / 2)
+ {
+ TransactionId truncated;
+
+ truncated = XidFromFullTransactionId(snapshotConflictHorizon);
+ ResolveRecoveryConflictWithSnapshot(truncated,
+ isCatalogRel,
+ locator);
+ }
+}
+
+void
+ResolveRecoveryConflictWithTablespace(Oid tsid)
+{
+ VirtualTransactionId *temp_file_users;
+
+ /*
+ * Standby users may be currently using this tablespace for their
+ * temporary files. We only care about current users because
+	 * temporary files. We only care about current users because the
+	 * temp_tablespaces parameter will just ignore tablespaces that no
+	 * longer exist.
+ * Ask everybody to cancel their queries immediately so we can ensure no
+ * temp files remain and we can remove the tablespace. Nuke the entire
+ * site from orbit, it's the only way to be sure.
+ *
+ * XXX: We could work out the pids of active backends using this
+ * tablespace by examining the temp filenames in the directory. We would
+ * then convert the pids into VirtualXIDs before attempting to cancel
+ * them.
+ *
+ * We don't wait for commit because drop tablespace is non-transactional.
+ */
+ temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId,
+ InvalidOid);
+ ResolveRecoveryConflictWithVirtualXIDs(temp_file_users,
+ PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+ WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
+ true);
+}
+
+void
+ResolveRecoveryConflictWithDatabase(Oid dbid)
+{
+ /*
+ * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that
+ * only waits for transactions and completely idle sessions would block
+ * us. This is rare enough that we do this as simply as possible: no wait,
+ * just force them off immediately.
+ *
+ * No locking is required here because we already acquired
+ * AccessExclusiveLock. Anybody trying to connect while we do this will
+ * block during InitPostgres() and then disconnect when they see the
+ * database has been removed.
+ */
+ while (CountDBBackends(dbid) > 0)
+ {
+ CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
+
+ /*
+ * Wait awhile for them to die so that we avoid flooding an
+ * unresponsive backend when system is heavily loaded.
+ */
+ pg_usleep(10000);
+ }
+}
+
+/*
+ * ResolveRecoveryConflictWithLock is called from ProcSleep()
+ * to resolve conflicts with other backends holding relation locks.
+ *
+ * The WaitLatch sleep normally done in ProcSleep()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by canceling all backends holding a conflicting
+ * lock. As we are already queued to be granted the lock, no new lock
+ * requests conflicting with ours will be granted in the meantime.
+ *
+ * We also must check for deadlocks involving the Startup process and
+ * hot-standby backend processes. If deadlock_timeout is reached in
+ * this function, all the backends holding the conflicting locks are
+ * requested to check themselves for deadlocks.
+ *
+ * logging_conflict should be true if the recovery conflict has not been
+ * logged yet even though logging is enabled. After deadlock_timeout is
+ * reached and the request for deadlock check is sent, we wait again to
+ * be signaled by the release of the lock if logging_conflict is false.
+ * Otherwise we return without waiting again so that the caller can report
+ * the recovery conflict. In this case, then, this function is called again
+ * with logging_conflict=false (because the recovery conflict has already
+ * been logged) and we will wait again for the lock to be released.
+ */
+void
+ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
+{
+ TimestampTz ltime;
+ TimestampTz now;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+ now = GetCurrentTimestamp();
+
+ /*
+ * Update waitStart if first time through after the startup process
+ * started waiting for the lock. It should not be updated every time
+ * ResolveRecoveryConflictWithLock() is called during the wait.
+ *
+ * Use the current time obtained for comparison with ltime as waitStart
+ * (i.e., the time when this process started waiting for the lock). Since
+ * getting the current time newly can cause overhead, we reuse the
+ * already-obtained time to avoid that overhead.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead by additional lock acquisition.
+ * This can cause "waitstart" in pg_locks to become NULL for a very short
+ * period of time after the wait started even though "granted" is false.
+ * This is OK in practice because we can assume that users are likely to
+ * look at "waitstart" when waiting for the lock for a long time.
+ */
+ if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
+ pg_atomic_write_u64(&MyProc->waitStart, now);
+
+ if (now >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /*
+ * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
+ * "waiting" in PS display by disabling its argument report_waiting
+ * because the caller, WaitOnLock(), has already reported that.
+ */
+ ResolveRecoveryConflictWithVirtualXIDs(backends,
+ PROCSIG_RECOVERY_CONFLICT_LOCK,
+ PG_WAIT_LOCK | locktag.locktag_type,
+ false);
+ }
+ else
+ {
+ /*
+ * Wait (or wait again) until ltime, and check for deadlocks as well
+ * if we will be waiting longer than deadlock_timeout
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ got_standby_lock_timeout = false;
+ timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /* Wait to be signaled by the release of the Relation Lock */
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+
+ /*
+ * Exit if ltime is reached. Then all the backends holding conflicting
+ * locks will be canceled in the next ResolveRecoveryConflictWithLock()
+ * call.
+ */
+ if (got_standby_lock_timeout)
+ goto cleanup;
+
+ if (got_standby_deadlock_timeout)
+ {
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /* Quick exit if there's no work to be done */
+ if (!VirtualTransactionIdIsValid(*backends))
+ goto cleanup;
+
+ /*
+ * Send signals to all the backends holding the conflicting locks, to
+ * ask them to check themselves for deadlocks.
+ */
+ while (VirtualTransactionIdIsValid(*backends))
+ {
+ SignalVirtualTransaction(*backends,
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+ false);
+ backends++;
+ }
+
+ /*
+ * Exit if the recovery conflict has not been logged yet even though
+ * logging is enabled, so that the caller can log that. Then
+		 * logging is enabled, so that the caller can log that. Then
+		 * ResolveRecoveryConflictWithLock() is called again and we will
+		 * wait again for the lock to be released.
+ if (logging_conflict)
+ goto cleanup;
+
+ /*
+ * Wait again here to be signaled by the release of the Relation Lock,
+		 * to prevent the subsequent ResolveRecoveryConflictWithLock() from
+		 * causing deadlock_timeout and sending a deadlock-check request
+		 * again.
+ * Otherwise the request continues to be sent every deadlock_timeout
+ * until the relation locks are released or ltime is reached.
+ */
+ got_standby_deadlock_timeout = false;
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+ }
+
+cleanup:
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other outstanding timeouts than those
+ * used by this function. If that stops being true, we could cancel the
+ * timeouts individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ got_standby_lock_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+/*
+ * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
+ * to resolve conflicts with other backends holding buffer pins.
+ *
+ * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by sending a PROCSIG signal to all backends to check if
+ * they hold one of the buffer pins that is blocking Startup process. If so,
+ * those backends will take an appropriate error action, ERROR or FATAL.
+ *
+ * We also must check for deadlocks. Deadlocks occur because if queries
+ * wait on a lock, that must be behind an AccessExclusiveLock, which can only
+ * be cleared if the Startup process replays a transaction completion record.
+ * If Startup process is also waiting then that is a deadlock. The deadlock
+ * can occur if the query is waiting and then the Startup sleeps, or if
+ * Startup is sleeping and the query waits on a lock. We protect against
+ * only the former sequence here; the latter sequence is checked prior to
+ * the query sleeping, in CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout.
+ */
+void
+ResolveRecoveryConflictWithBufferPin(void)
+{
+ TimestampTz ltime;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+
+ if (GetCurrentTimestamp() >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
+ else
+ {
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ timeouts[cnt].id = STANDBY_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+ * by one of the timeouts established above.
+ *
+ * We assume that only UnpinBuffer() and the timeout requests established
+ * above can wake us up here. WakeupRecovery() called by walreceiver or
+	 * SIGHUP signal handler, etc. cannot do that because it uses a different
+	 * latch from the one ProcWaitForSignal() waits on.
+ */
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ if (got_standby_delay_timeout)
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ else if (got_standby_deadlock_timeout)
+ {
+ /*
+ * Send out a request for hot-standby backends to check themselves for
+ * deadlocks.
+ *
+ * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+ * to be signaled by UnpinBuffer() again and send a request for
+ * deadlocks check if deadlock_timeout happens. This causes the
+ * request to continue to be sent every deadlock_timeout until the
+ * buffer is unpinned or ltime is reached. This would increase the
+ * workload in the startup process and backends. In practice it may
+		 * not be so harmful because the period that the buffer is kept
+		 * pinned is basically not so long. But perhaps we should fix this?
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ }
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other timeouts than what this function
+ * uses. If that stops being true, we could cancel the timeouts
+ * individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ got_standby_delay_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+static void
+SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
+{
+ Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
+ reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+
+ /*
+ * We send a signal to all backends to ask whether they are holding the
+ * buffer pin that is delaying the Startup process. We must not set the
+ * conflict flag yet, since most backends will be innocent. Let the
+ * SIGUSR1 handling in each backend decide their own fate.
+ */
+ CancelDBBackends(InvalidOid, reason, false);
+}
+
+/*
+ * In Hot Standby perform early deadlock detection. We abort the lock
+ * wait if we are about to sleep while holding the buffer pin that the
+ * Startup process is waiting for.
+ *
+ * Note: this code is pessimistic, because there is no way for it to
+ * determine whether an actual deadlock condition is present: the lock we
+ * need to wait for might be unrelated to any held by the Startup process.
+ * Sooner or later, this mechanism should get ripped out in favor of somehow
+ * accounting for buffer locks in DeadLockCheck(). However, errors here
+ * seem to be very low-probability in practice, so for now it's not worth
+ * the trouble.
+ */
+void
+CheckRecoveryConflictDeadlock(void)
+{
+ Assert(!InRecovery); /* do not call in Startup process */
+
+ if (!HoldingBufferPinThatDelaysRecovery())
+ return;
+
+ /*
+ * Error message should match ProcessInterrupts() but we avoid calling
+ * that because we aren't handling an interrupt at this point. Note that
+ * we only cancel the current transaction here, so if we are in a
+ * subtransaction and the pin is held by a parent, then the Startup
+ * process will continue to wait even though we have avoided deadlock.
+ */
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+ errmsg("canceling statement due to conflict with recovery"),
+ errdetail("User transaction caused buffer deadlock with recovery.")));
+}
+
+
+/* --------------------------------
+ * timeout handler routines
+ * --------------------------------
+ */
+
+/*
+ * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
+ * exceeded.
+ */
+void
+StandbyDeadLockHandler(void)
+{
+ got_standby_deadlock_timeout = true;
+}
+
+/*
+ * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
+ */
+void
+StandbyTimeoutHandler(void)
+{
+ got_standby_delay_timeout = true;
+}
+
+/*
+ * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
+ */
+void
+StandbyLockTimeoutHandler(void)
+{
+ got_standby_lock_timeout = true;
+}
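The three handlers above only set flags; a hedged sketch of how they are presumably registered follows (the actual registration happens outside this file, e.g. during startup-process initialization, and is assumed here).

#include "utils/timeout.h"

/*
 * Sketch only: register the standby timeout handlers once, early in the
 * startup process.  The real registration is not part of this file; this
 * merely illustrates the RegisterTimeout() pattern with the handlers above.
 */
static void
register_standby_timeouts(void)
{
	RegisterTimeout(STANDBY_DEADLOCK_TIMEOUT, StandbyDeadLockHandler);
	RegisterTimeout(STANDBY_TIMEOUT, StandbyTimeoutHandler);
	RegisterTimeout(STANDBY_LOCK_TIMEOUT, StandbyLockTimeoutHandler);
}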
+
+/*
+ * -----------------------------------------------------
+ * Locking in Recovery Mode
+ * -----------------------------------------------------
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and, in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation.
+ *
+ * We keep a table of known locks in the RecoveryLockHash hash table.
+ * The point of that table is to let us efficiently de-duplicate locks,
+ * which is important because checkpoints will re-report the same locks
+ * already held. There is also a RecoveryLockXidHash table with one entry
+ * per xid, which allows us to efficiently find all the locks held by a
+ * given original transaction.
+ *
+ * We use session locks rather than normal locks so we don't need
+ * ResourceOwners.
+ */
+
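The RecoveryLockHash and RecoveryLockXidHash tables used below are created elsewhere in this file (not shown in this hunk); a hedged sketch of how that setup presumably looks, using the dynahash API, is:

/*
 * Sketch only: create the two recovery-lock hash tables referenced by the
 * functions below.  The real initialization lives elsewhere in standby.c
 * (outside this hunk); sizes and exact flags here are assumptions.
 */
static void
create_recovery_lock_hashes(void)
{
	HASHCTL		hash_ctl;

	/* keyed by (xid, dbOid, relOid); de-duplicates re-reported locks */
	hash_ctl.keysize = sizeof(xl_standby_lock);
	hash_ctl.entrysize = sizeof(RecoveryLockEntry);
	RecoveryLockHash = hash_create("RecoveryLockHash", 64,
								   &hash_ctl, HASH_ELEM | HASH_BLOBS);

	/* keyed by xid; each entry heads the list of that xid's locks */
	hash_ctl.keysize = sizeof(TransactionId);
	hash_ctl.entrysize = sizeof(RecoveryLockXidEntry);
	RecoveryLockXidHash = hash_create("RecoveryLockXidHash", 64,
									  &hash_ctl, HASH_ELEM | HASH_BLOBS);
}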
+
+void
+StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
+{
+ RecoveryLockXidEntry *xidentry;
+ RecoveryLockEntry *lockentry;
+ xl_standby_lock key;
+ LOCKTAG locktag;
+ bool found;
+
+ /* Already processed? */
+ if (!TransactionIdIsValid(xid) ||
+ TransactionIdDidCommit(xid) ||
+ TransactionIdDidAbort(xid))
+ return;
+
+ elog(trace_recovery(DEBUG4),
+ "adding recovery lock: db %u rel %u", dbOid, relOid);
+
+ /* dbOid is InvalidOid when we are locking a shared relation. */
+ Assert(OidIsValid(relOid));
+
+ /* Create a hash entry for this xid, if we don't have one already. */
+ xidentry = hash_search(RecoveryLockXidHash, &xid, HASH_ENTER, &found);
+ if (!found)
+ {
+ Assert(xidentry->xid == xid); /* dynahash should have set this */
+ xidentry->head = NULL;
+ }
+
+ /* Create a hash entry for this lock, unless we have one already. */
+ key.xid = xid;
+ key.dbOid = dbOid;
+ key.relOid = relOid;
+ lockentry = hash_search(RecoveryLockHash, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ /* It's new, so link it into the XID's list ... */
+ lockentry->next = xidentry->head;
+ xidentry->head = lockentry;
+
+ /* ... and acquire the lock locally. */
+ SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
+
+ (void) LockAcquire(&locktag, AccessExclusiveLock, true, false);
+ }
+}
+
+/*
+ * Release all the locks associated with this RecoveryLockXidEntry.
+ */
+static void
+StandbyReleaseXidEntryLocks(RecoveryLockXidEntry *xidentry)
+{
+ RecoveryLockEntry *entry;
+ RecoveryLockEntry *next;
+
+ for (entry = xidentry->head; entry != NULL; entry = next)
+ {
+ LOCKTAG locktag;
+
+ elog(trace_recovery(DEBUG4),
+ "releasing recovery lock: xid %u db %u rel %u",
+ entry->key.xid, entry->key.dbOid, entry->key.relOid);
+ /* Release the lock ... */
+ SET_LOCKTAG_RELATION(locktag, entry->key.dbOid, entry->key.relOid);
+ if (!LockRelease(&locktag, AccessExclusiveLock, true))
+ {
+ elog(LOG,
+ "RecoveryLockHash contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
+ entry->key.xid, entry->key.dbOid, entry->key.relOid);
+ Assert(false);
+ }
+ /* ... and remove the per-lock hash entry */
+ next = entry->next;
+ hash_search(RecoveryLockHash, entry, HASH_REMOVE, NULL);
+ }
+
+ xidentry->head = NULL; /* just for paranoia */
+}
+
+/*
+ * Release locks for a specific XID, or all locks if it is InvalidTransactionId.
+ */
+static void
+StandbyReleaseLocks(TransactionId xid)
+{
+ RecoveryLockXidEntry *entry;
+
+ if (TransactionIdIsValid(xid))
+ {
+ if ((entry = hash_search(RecoveryLockXidHash, &xid, HASH_FIND, NULL)))
+ {
+ StandbyReleaseXidEntryLocks(entry);
+ hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
+ }
+ }
+ else
+ StandbyReleaseAllLocks();
+}
+
+/*
+ * Release locks for a transaction tree, starting at xid down, from
+ * RecoveryLockXidHash.
+ *
+ * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
+ * to remove any AccessExclusiveLocks requested by a transaction.
+ */
+void
+StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
+{
+ int i;
+
+ StandbyReleaseLocks(xid);
+
+ for (i = 0; i < nsubxids; i++)
+ StandbyReleaseLocks(subxids[i]);
+}
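For context, a hedged sketch of a typical call site: the xact redo code (outside this file) is expected to call StandbyReleaseLockTree() while replaying a transaction completion record, roughly like this.

/*
 * Sketch only (illustrative wrapper, not actual xact.c code): release the
 * completed transaction's recovery locks during redo of its commit/abort
 * record.
 */
static void
redo_release_recovery_locks(TransactionId xid,
							int nsubxacts, TransactionId *subxids)
{
	/* Nothing to release if we never started tracking recovery locks. */
	if (standbyState == STANDBY_DISABLED)
		return;

	StandbyReleaseLockTree(xid, nsubxacts, subxids);
}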
+
+/*
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ */
+void
+StandbyReleaseAllLocks(void)
+{
+ HASH_SEQ_STATUS status;
+ RecoveryLockXidEntry *entry;
+
+ elog(trace_recovery(DEBUG2), "release all standby locks");
+
+ hash_seq_init(&status, RecoveryLockXidHash);
+ while ((entry = hash_seq_search(&status)))
+ {
+ StandbyReleaseXidEntryLocks(entry);
+ hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
+ }
+}
+
+/*
+ * StandbyReleaseOldLocks
+ * Release standby locks held by top-level XIDs that aren't running,
+ * as long as they're not prepared transactions.
+ */
+void
+StandbyReleaseOldLocks(TransactionId oldxid)
+{
+ HASH_SEQ_STATUS status;
+ RecoveryLockXidEntry *entry;
+
+ hash_seq_init(&status, RecoveryLockXidHash);
+ while ((entry = hash_seq_search(&status)))
+ {
+ Assert(TransactionIdIsValid(entry->xid));
+
+ /* Skip if prepared transaction. */
+ if (StandbyTransactionIdIsPrepared(entry->xid))
+ continue;
+
+ /* Skip if >= oldxid. */
+ if (!TransactionIdPrecedes(entry->xid, oldxid))
+ continue;
+
+ /* Remove all locks and hash table entry. */
+ StandbyReleaseXidEntryLocks(entry);
+ hash_search(RecoveryLockXidHash, entry, HASH_REMOVE, NULL);
+ }
+}
+
+/*
+ * --------------------------------------------------------------------
+ * Recovery handling for Rmgr RM_STANDBY_ID
+ *
+ * These record types will only be created if XLogStandbyInfoActive() is true.
+ * --------------------------------------------------------------------
+ */
+
+void
+standby_redo(XLogReaderState *record)
+{
+ uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+ /* Backup blocks are not used in standby records */
+ Assert(!XLogRecHasAnyBlockRefs(record));
+
+ /* Do nothing if we're not in hot standby mode */
+ if (standbyState == STANDBY_DISABLED)
+ return;
+
+ if (info == XLOG_STANDBY_LOCK)
+ {
+ xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record);
+ int i;
+
+ for (i = 0; i < xlrec->nlocks; i++)
+ StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid,
+ xlrec->locks[i].dbOid,
+ xlrec->locks[i].relOid);
+ }
+ else if (info == XLOG_RUNNING_XACTS)
+ {
+ xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record);
+ RunningTransactionsData running;
+
+ running.xcnt = xlrec->xcnt;
+ running.subxcnt = xlrec->subxcnt;
+ running.subxid_overflow = xlrec->subxid_overflow;
+ running.nextXid = xlrec->nextXid;
+ running.latestCompletedXid = xlrec->latestCompletedXid;
+ running.oldestRunningXid = xlrec->oldestRunningXid;
+ running.xids = xlrec->xids;
+
+ ProcArrayApplyRecoveryInfo(&running);
+
+ /*
+ * The startup process currently has no convenient way to schedule
+ * stats to be reported. XLOG_RUNNING_XACTS records are issued at a
+ * regular cadence, making this a convenient location to report stats.
+ * While these records aren't generated with wal_level=minimal, stats
+ * also cannot be accessed during WAL replay.
+ */
+ pgstat_report_stat(true);
+ }
+ else if (info == XLOG_INVALIDATIONS)
+ {
+ xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record);
+
+ ProcessCommittedInvalidationMessages(xlrec->msgs,
+ xlrec->nmsgs,
+ xlrec->relcacheInitFileInval,
+ xlrec->dbId,
+ xlrec->tsId);
+ }
+ else
+ elog(PANIC, "standby_redo: unknown op code %u", info);
+}
+
+/*
+ * Log details of the current snapshot to WAL. This allows the snapshot state
+ * to be reconstructed on the standby and for logical decoding.
+ *
+ * This is used for Hot Standby as follows:
+ *
+ * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
+ * start from a shutdown checkpoint because we know nothing was running
+ * at that time and our recovery snapshot is known empty. In the more
+ * typical case of an online checkpoint we need to jump through a few
+ * hoops to get a correct recovery snapshot, and this requires a two- or
+ * sometimes three-stage process.
+ *
+ * The initial snapshot must contain all running xids and all current
+ * AccessExclusiveLocks at a point in time on the standby. Assembling
+ * that information while the server is running requires many and
+ * various LWLocks, so we choose to derive that information piece by
+ * piece and then re-assemble that info on the standby. When that
+ * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
+ *
+ * Because we do not lock strictly on the primary while deriving this
+ * information, there is a time window between the derivation and the
+ * writing of the derived information to WAL. That allows race conditions
+ * that we must resolve, since xids and locks may enter or leave the
+ * snapshot during that window. This creates the issue that an xid or
+ * lock may start *after* the snapshot has been derived yet *before* the
+ * snapshot is logged in the running xacts WAL record. We resolve this by
+ * starting to accumulate changes at a point just prior to when we derive
+ * the snapshot on the primary, then ignore duplicates when we later apply
+ * the snapshot from the running xacts record. This is implemented during
+ * CreateCheckPoint() where we use the logical checkpoint location as
+ * our starting point and then write the running xacts record immediately
+ * before writing the main checkpoint WAL record. Since we always start
+ * up from a checkpoint and are immediately at our starting point, we
+ * unconditionally move to STANDBY_INITIALIZED. After this point we
+ * must do 4 things:
+ * * move shared nextXid forwards as we see new xids
+ * * extend the clog and subtrans with each new xid
+ * * keep track of uncommitted known assigned xids
+ * * keep track of uncommitted AccessExclusiveLocks
+ *
+ * When we see a commit/abort we must remove known assigned xids and locks
+ * from the completing transaction. Attempted removals that cannot locate
+ * an entry are expected and must not cause an error when we are in state
+ * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
+ * KnownAssignedXidsRemove().
+ *
+ * Later, when we apply the running xact data we must be careful to ignore
+ * transactions already committed, since those commits raced ahead when
+ * making WAL entries.
+ *
+ * The loose timing also means that locks may be recorded that have a
+ * zero xid, since xids are removed from procs before locks are removed.
+ * So we must prune the lock list down to ensure we hold locks only for
+ * currently running xids, performed by StandbyReleaseOldLocks().
+ * Zero xids should no longer be possible, but we may be replaying WAL
+ * from a time when they were possible.
+ *
+ * For logical decoding only the running xacts information is needed;
+ * there's no need to look at the locking information, but it's logged anyway,
+ * as there's no independent knob to just enable logical decoding. For
+ * details of how this is used, check snapbuild.c's introductory comment.
+ *
+ *
+ * Returns the RecPtr of the last inserted record.
+ */
+XLogRecPtr
+LogStandbySnapshot(void)
+{
+ XLogRecPtr recptr;
+ RunningTransactions running;
+ xl_standby_lock *locks;
+ int nlocks;
+
+ Assert(XLogStandbyInfoActive());
+
+ /*
+ * Get details of any AccessExclusiveLocks being held at the moment.
+ */
+ locks = GetRunningTransactionLocks(&nlocks);
+ if (nlocks > 0)
+ LogAccessExclusiveLocks(nlocks, locks);
+ pfree(locks);
+
+ /*
+ * Log details of all in-progress transactions. This should be the last
+ * record we write, because the standby will open up when it sees this.
+ */
+ running = GetRunningTransactionData();
+
+ /*
+ * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
+ * For Hot Standby this can be done before inserting the WAL record
+ * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
+ * the clog. For logical decoding, though, the lock can't be released
+ * early because the clog might be "in the future" from the POV of the
+ * historic snapshot. This would allow for situations where we're waiting
+ * for the end of a transaction listed in the xl_running_xacts record
+ * which, according to the WAL, has committed before the xl_running_xacts
+ * record. Fortunately this routine isn't executed frequently, and it's
+ * only a shared lock.
+ */
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ recptr = LogCurrentRunningXacts(running);
+
+ /* Release lock if we kept it longer ... */
+ if (wal_level >= WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ /* GetRunningTransactionData() acquired XidGenLock, we must release it */
+ LWLockRelease(XidGenLock);
+
+ return recptr;
+}
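A hedged sketch of a typical periodic caller (the real callers, such as the checkpoint code and background processes, live outside this file):

/*
 * Sketch only: log a standby snapshot periodically, but only when standby
 * information is being generated and we are not ourselves in recovery.
 */
static void
maybe_log_standby_snapshot(void)
{
	if (XLogStandbyInfoActive() && !RecoveryInProgress())
		(void) LogStandbySnapshot();
}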
+
+/*
+ * Record an enhanced snapshot of running transactions into WAL.
+ *
+ * The definitions of RunningTransactionsData and xl_running_xacts are
+ * similar. We keep them separate because xl_running_xacts is a contiguous
+ * chunk of memory and never exists fully until it is assembled in WAL.
+ * The inserted records are marked as not being important for durability,
+ * to avoid triggering superfluous checkpoint / archiving activity.
+ */
+static XLogRecPtr
+LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
+{
+ xl_running_xacts xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.xcnt = CurrRunningXacts->xcnt;
+ xlrec.subxcnt = CurrRunningXacts->subxcnt;
+ xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow;
+ xlrec.nextXid = CurrRunningXacts->nextXid;
+ xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
+ xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+ /* Header */
+ XLogBeginInsert();
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+ XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts);
+
+ /* array of TransactionIds */
+ if (xlrec.xcnt > 0)
+ XLogRegisterData((char *) CurrRunningXacts->xids,
+ (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId));
+
+ recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
+
+ if (CurrRunningXacts->subxid_overflow)
+ elog(trace_recovery(DEBUG2),
+ "snapshot of %d running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ CurrRunningXacts->xcnt,
+ LSN_FORMAT_ARGS(recptr),
+ CurrRunningXacts->oldestRunningXid,
+ CurrRunningXacts->latestCompletedXid,
+ CurrRunningXacts->nextXid);
+ else
+ elog(trace_recovery(DEBUG2),
+ "snapshot of %d+%d running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+ CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
+ LSN_FORMAT_ARGS(recptr),
+ CurrRunningXacts->oldestRunningXid,
+ CurrRunningXacts->latestCompletedXid,
+ CurrRunningXacts->nextXid);
+
+ /*
+ * Ensure running_xacts information is synced to disk not too far in the
+ * future. We don't want to stall anything, though (i.e., we don't want to
+ * call XLogFlush() here), so we let the WAL writer do it during normal
+ * operation.
+ * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced
+ * and nudge the WALWriter into action if sleeping. Check
+ * XLogBackgroundFlush() for details why a record might not be flushed
+ * without it.
+ */
+ XLogSetAsyncXactLSN(recptr);
+
+ return recptr;
+}
+
+/*
+ * Wholesale logging of AccessExclusiveLocks. Other lock types need not be
+ * logged, as described in backend/storage/lmgr/README.
+ */
+static void
+LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
+{
+ xl_standby_locks xlrec;
+
+ xlrec.nlocks = nlocks;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks));
+ XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
+ XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+
+ (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
+}
+
+/*
+ * Individual logging of AccessExclusiveLocks for use during LockAcquire()
+ */
+void
+LogAccessExclusiveLock(Oid dbOid, Oid relOid)
+{
+ xl_standby_lock xlrec;
+
+ xlrec.xid = GetCurrentTransactionId();
+
+ xlrec.dbOid = dbOid;
+ xlrec.relOid = relOid;
+
+ LogAccessExclusiveLocks(1, &xlrec);
+ MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
+}
+
+/*
+ * Prepare to log an AccessExclusiveLock, for use during LockAcquire()
+ */
+void
+LogAccessExclusiveLockPrepare(void)
+{
+ /*
+ * Ensure that a TransactionId has been assigned to this transaction, for
+ * two reasons, both related to lock release on the standby. First, we
+ * must assign an xid so that RecordTransactionCommit() and
+ * RecordTransactionAbort() do not optimise away the transaction
+ * completion record which recovery relies upon to release locks. It's a
+ * hack, but for a corner case not worth adding code for into the main
+ * commit path. Second, we must assign an xid before the lock is recorded
+ * in shared memory, otherwise a concurrently executing
+ * GetRunningTransactionLocks() might see a lock associated with an
+ * InvalidTransactionId which we later assert cannot happen.
+ */
+ (void) GetCurrentTransactionId();
+}
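A hedged sketch of the expected caller ordering (the real logic is inside the lock manager; the wrapper name here is illustrative):

/*
 * Sketch only: assign an xid before the lock becomes visible in shared
 * memory, acquire the lock, then WAL-log it so standbys can mirror it.
 */
static void
acquire_and_log_access_exclusive(Oid dbOid, Oid relOid)
{
	LOCKTAG		locktag;

	/* Step 1: make sure we have an xid (see comment above). */
	LogAccessExclusiveLockPrepare();

	/* Step 2: take the lock locally. */
	SET_LOCKTAG_RELATION(locktag, dbOid, relOid);
	(void) LockAcquire(&locktag, AccessExclusiveLock, false, false);

	/* Step 3: emit the WAL record for standbys. */
	LogAccessExclusiveLock(dbOid, relOid);
}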
+
+/*
+ * Emit WAL for invalidations. This is currently only used for commits that
+ * have no xid but do contain invalidations.
+ */
+void
+LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
+ bool relcacheInitFileInval)
+{
+ xl_invalidations xlrec;
+
+ /* prepare record */
+ memset(&xlrec, 0, sizeof(xlrec));
+ xlrec.dbId = MyDatabaseId;
+ xlrec.tsId = MyDatabaseTableSpace;
+ xlrec.relcacheInitFileInval = relcacheInitFileInval;
+ xlrec.nmsgs = nmsgs;
+
+ /* perform insertion */
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations);
+ XLogRegisterData((char *) msgs,
+ nmsgs * sizeof(SharedInvalidationMessage));
+ XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
+}
+
+/* Return the description of a recovery conflict */
+static const char *
+get_recovery_conflict_desc(ProcSignalReason reason)
+{
+ const char *reasonDesc = _("unknown reason");
+
+ switch (reason)
+ {
+ case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+ reasonDesc = _("recovery conflict on buffer pin");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOCK:
+ reasonDesc = _("recovery conflict on lock");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+ reasonDesc = _("recovery conflict on tablespace");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+ reasonDesc = _("recovery conflict on snapshot");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_LOGICALSLOT:
+ reasonDesc = _("recovery conflict on replication slot");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+ reasonDesc = _("recovery conflict on buffer deadlock");
+ break;
+ case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+ reasonDesc = _("recovery conflict on database");
+ break;
+ default:
+ break;
+ }
+
+ return reasonDesc;
+}
diff --git a/src/backend/storage/large_object/Makefile b/src/backend/storage/large_object/Makefile
new file mode 100644
index 0000000..8a6bc36
--- /dev/null
+++ b/src/backend/storage/large_object/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/large_object
+#
+# IDENTIFICATION
+# src/backend/storage/large_object/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/large_object
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ inv_api.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c
new file mode 100644
index 0000000..cab47f8
--- /dev/null
+++ b/src/backend/storage/large_object/inv_api.c
@@ -0,0 +1,954 @@
+/*-------------------------------------------------------------------------
+ *
+ * inv_api.c
+ * routines for manipulating inversion fs large objects. This file
+ * contains the user-level large object application interface routines.
+ *
+ *
+ * Note: we access pg_largeobject.data using its C struct declaration.
+ * This is safe because it immediately follows pageno which is an int4 field,
+ * and therefore the data field will always be 4-byte aligned, even if it
+ * is in the short 1-byte-header format. We have to detoast it since it's
+ * quite likely to be in compressed or short format. We also need to check
+ * for NULLs, since initdb will mark loid and pageno but not data as NOT NULL.
+ *
+ * Note: many of these routines leak memory in CurrentMemoryContext, as indeed
+ * does most of the backend code. We expect that CurrentMemoryContext will
+ * be a short-lived context. Data that must persist across function calls
+ * is kept either in CacheMemoryContext (the Relation structs) or in the
+ * memory context given to inv_open (for LargeObjectDesc structs).
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/large_object/inv_api.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "access/detoast.h"
+#include "access/genam.h"
+#include "access/htup_details.h"
+#include "access/sysattr.h"
+#include "access/table.h"
+#include "access/xact.h"
+#include "catalog/dependency.h"
+#include "catalog/indexing.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_largeobject.h"
+#include "catalog/pg_largeobject_metadata.h"
+#include "libpq/libpq-fs.h"
+#include "miscadmin.h"
+#include "storage/large_object.h"
+#include "utils/acl.h"
+#include "utils/fmgroids.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+
+/*
+ * GUC: backwards-compatibility flag to suppress LO permission checks
+ */
+bool lo_compat_privileges;
+
+/*
+ * All accesses to pg_largeobject and its index make use of a single Relation
+ * reference, so that we only need to open pg_largeobject once per
+ * transaction. To avoid problems when the first such reference occurs
+ * inside a subtransaction, we execute a slightly kludgy maneuver to assign
+ * ownership of the Relation reference to TopTransactionResourceOwner.
+ */
+static Relation lo_heap_r = NULL;
+static Relation lo_index_r = NULL;
+
+
+/*
+ * Open pg_largeobject and its index, if not already done in current xact
+ */
+static void
+open_lo_relation(void)
+{
+ ResourceOwner currentOwner;
+
+ if (lo_heap_r && lo_index_r)
+ return; /* already open in current xact */
+
+ /* Arrange for the top xact to own these relation references */
+ currentOwner = CurrentResourceOwner;
+ CurrentResourceOwner = TopTransactionResourceOwner;
+
+ /* Use RowExclusiveLock since we might either read or write */
+ if (lo_heap_r == NULL)
+ lo_heap_r = table_open(LargeObjectRelationId, RowExclusiveLock);
+ if (lo_index_r == NULL)
+ lo_index_r = index_open(LargeObjectLOidPNIndexId, RowExclusiveLock);
+
+ CurrentResourceOwner = currentOwner;
+}
+
+/*
+ * Clean up at main transaction end
+ */
+void
+close_lo_relation(bool isCommit)
+{
+ if (lo_heap_r || lo_index_r)
+ {
+ /*
+ * Only bother to close if committing; else abort cleanup will handle
+ * it
+ */
+ if (isCommit)
+ {
+ ResourceOwner currentOwner;
+
+ currentOwner = CurrentResourceOwner;
+ CurrentResourceOwner = TopTransactionResourceOwner;
+
+ if (lo_index_r)
+ index_close(lo_index_r, NoLock);
+ if (lo_heap_r)
+ table_close(lo_heap_r, NoLock);
+
+ CurrentResourceOwner = currentOwner;
+ }
+ lo_heap_r = NULL;
+ lo_index_r = NULL;
+ }
+}
+
+
+/*
+ * Same as pg_largeobject.c's LargeObjectExists(), except that the snapshot
+ * to read with can be specified.
+ */
+static bool
+myLargeObjectExists(Oid loid, Snapshot snapshot)
+{
+ Relation pg_lo_meta;
+ ScanKeyData skey[1];
+ SysScanDesc sd;
+ HeapTuple tuple;
+ bool retval = false;
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_metadata_oid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(loid));
+
+ pg_lo_meta = table_open(LargeObjectMetadataRelationId,
+ AccessShareLock);
+
+ sd = systable_beginscan(pg_lo_meta,
+ LargeObjectMetadataOidIndexId, true,
+ snapshot, 1, skey);
+
+ tuple = systable_getnext(sd);
+ if (HeapTupleIsValid(tuple))
+ retval = true;
+
+ systable_endscan(sd);
+
+ table_close(pg_lo_meta, AccessShareLock);
+
+ return retval;
+}
+
+
+/*
+ * Extract data field from a pg_largeobject tuple, detoasting if needed
+ * and verifying that the length is sane. Returns data pointer (a bytea *),
+ * data length, and an indication of whether to pfree the data pointer.
+ */
+static void
+getdatafield(Form_pg_largeobject tuple,
+ bytea **pdatafield,
+ int *plen,
+ bool *pfreeit)
+{
+ bytea *datafield;
+ int len;
+ bool freeit;
+
+ datafield = &(tuple->data); /* see note at top of file */
+ freeit = false;
+ if (VARATT_IS_EXTENDED(datafield))
+ {
+ datafield = (bytea *)
+ detoast_attr((struct varlena *) datafield);
+ freeit = true;
+ }
+ len = VARSIZE(datafield) - VARHDRSZ;
+ if (len < 0 || len > LOBLKSIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("pg_largeobject entry for OID %u, page %d has invalid data field size %d",
+ tuple->loid, tuple->pageno, len)));
+ *pdatafield = datafield;
+ *plen = len;
+ *pfreeit = freeit;
+}
+
+
+/*
+ * inv_create -- create a new large object
+ *
+ * Arguments:
+ * lobjId - OID to use for new large object, or InvalidOid to pick one
+ *
+ * Returns:
+ * OID of new object
+ *
+ * If lobjId is not InvalidOid, then an error occurs if the OID is already
+ * in use.
+ */
+Oid
+inv_create(Oid lobjId)
+{
+ Oid lobjId_new;
+
+ /*
+ * Create a new large object with empty data pages
+ */
+ lobjId_new = LargeObjectCreate(lobjId);
+
+ /*
+ * dependency on the owner of largeobject
+ *
+ * Note that LO dependencies are recorded using classId
+ * LargeObjectRelationId for backwards-compatibility reasons. Using
+ * LargeObjectMetadataRelationId instead would simplify matters for the
+ * backend, but it'd complicate pg_dump and possibly break other clients.
+ */
+ recordDependencyOnOwner(LargeObjectRelationId,
+ lobjId_new, GetUserId());
+
+ /* Post creation hook for new large object */
+ InvokeObjectPostCreateHook(LargeObjectRelationId, lobjId_new, 0);
+
+ /*
+ * Advance command counter to make new tuple visible to later operations.
+ */
+ CommandCounterIncrement();
+
+ return lobjId_new;
+}
+
+/*
+ * inv_open -- access an existing large object.
+ *
+ * Returns a large object descriptor, appropriately filled in.
+ * The descriptor and subsidiary data are allocated in the specified
+ * memory context, which must be suitably long-lived for the caller's
+ * purposes. If the returned descriptor has a snapshot associated
+ * with it, the caller must ensure that it also lives long enough,
+ * e.g. by calling RegisterSnapshotOnOwner.
+ */
+LargeObjectDesc *
+inv_open(Oid lobjId, int flags, MemoryContext mcxt)
+{
+ LargeObjectDesc *retval;
+ Snapshot snapshot = NULL;
+ int descflags = 0;
+
+ /*
+ * Historically, no difference is made between (INV_WRITE) and (INV_WRITE
+ * | INV_READ), the caller being allowed to read the large object
+ * descriptor in either case.
+ */
+ if (flags & INV_WRITE)
+ descflags |= IFS_WRLOCK | IFS_RDLOCK;
+ if (flags & INV_READ)
+ descflags |= IFS_RDLOCK;
+
+ if (descflags == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid flags for opening a large object: %d",
+ flags)));
+
+ /* Get snapshot. If write is requested, use an instantaneous snapshot. */
+ if (descflags & IFS_WRLOCK)
+ snapshot = NULL;
+ else
+ snapshot = GetActiveSnapshot();
+
+ /* Can't use LargeObjectExists here because we need to specify snapshot */
+ if (!myLargeObjectExists(lobjId, snapshot))
+ ereport(ERROR,
+ (errcode(ERRCODE_UNDEFINED_OBJECT),
+ errmsg("large object %u does not exist", lobjId)));
+
+ /* Apply permission checks, again specifying snapshot */
+ if ((descflags & IFS_RDLOCK) != 0)
+ {
+ if (!lo_compat_privileges &&
+ pg_largeobject_aclcheck_snapshot(lobjId,
+ GetUserId(),
+ ACL_SELECT,
+ snapshot) != ACLCHECK_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ lobjId)));
+ }
+ if ((descflags & IFS_WRLOCK) != 0)
+ {
+ if (!lo_compat_privileges &&
+ pg_largeobject_aclcheck_snapshot(lobjId,
+ GetUserId(),
+ ACL_UPDATE,
+ snapshot) != ACLCHECK_OK)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ lobjId)));
+ }
+
+ /* OK to create a descriptor */
+ retval = (LargeObjectDesc *) MemoryContextAlloc(mcxt,
+ sizeof(LargeObjectDesc));
+ retval->id = lobjId;
+ retval->offset = 0;
+ retval->flags = descflags;
+
+ /* caller sets if needed, not used by the functions in this file */
+ retval->subid = InvalidSubTransactionId;
+
+ /*
+ * The snapshot (if any) is just the currently active snapshot. The
+ * caller will replace it with a longer-lived copy if needed.
+ */
+ retval->snapshot = snapshot;
+
+ return retval;
+}
+
+/*
+ * Closes a large object descriptor previously made by inv_open(), and
+ * releases the long-term memory used by it.
+ */
+void
+inv_close(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+ pfree(obj_desc);
+}
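A hedged usage sketch of the inv_* API (the real callers are the lo_* functions elsewhere in the backend; snapshot registration and error handling are omitted here for brevity):

/*
 * Sketch only: read the first buflen bytes of an existing large object.
 * A real caller must also ensure the descriptor's snapshot lives long
 * enough, e.g. by registering it on a resource owner.
 */
static int
read_lo_prefix(Oid loid, char *buf, int buflen)
{
	LargeObjectDesc *lod;
	int			nread;

	lod = inv_open(loid, INV_READ, CurrentMemoryContext);

	(void) inv_seek(lod, 0, SEEK_SET);
	nread = inv_read(lod, buf, buflen);

	inv_close(lod);

	return nread;
}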
+
+/*
+ * Destroys an existing large object (not to be confused with a descriptor!)
+ *
+ * Note we expect caller to have done any required permissions check.
+ */
+int
+inv_drop(Oid lobjId)
+{
+ ObjectAddress object;
+
+ /*
+ * Delete any comments and dependencies on the large object
+ */
+ object.classId = LargeObjectRelationId;
+ object.objectId = lobjId;
+ object.objectSubId = 0;
+ performDeletion(&object, DROP_CASCADE, 0);
+
+ /*
+ * Advance command counter so that tuple removal will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+
+ /* For historical reasons, we always return 1 on success. */
+ return 1;
+}
+
+/*
+ * Determine size of a large object
+ *
+ * NOTE: LOs can contain gaps, just like Unix files. We actually return
+ * the offset of the last byte + 1.
+ */
+static uint64
+inv_getsize(LargeObjectDesc *obj_desc)
+{
+ uint64 lastbyte = 0;
+ ScanKeyData skey[1];
+ SysScanDesc sd;
+ HeapTuple tuple;
+
+ Assert(PointerIsValid(obj_desc));
+
+ open_lo_relation();
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 1, skey);
+
+ /*
+ * Because the pg_largeobject index is on both loid and pageno, but we
+ * constrain only loid, a backwards scan should visit all pages of the
+ * large object in reverse pageno order. So, it's sufficient to examine
+ * the first valid tuple (== last valid page).
+ */
+ tuple = systable_getnext_ordered(sd, BackwardScanDirection);
+ if (HeapTupleIsValid(tuple))
+ {
+ Form_pg_largeobject data;
+ bytea *datafield;
+ int len;
+ bool pfreeit;
+
+ if (HeapTupleHasNulls(tuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ data = (Form_pg_largeobject) GETSTRUCT(tuple);
+ getdatafield(data, &datafield, &len, &pfreeit);
+ lastbyte = (uint64) data->pageno * LOBLKSIZE + len;
+ if (pfreeit)
+ pfree(datafield);
+ }
+
+ systable_endscan_ordered(sd);
+
+ return lastbyte;
+}
+
+int64
+inv_seek(LargeObjectDesc *obj_desc, int64 offset, int whence)
+{
+ int64 newoffset;
+
+ Assert(PointerIsValid(obj_desc));
+
+ /*
+ * We allow seek/tell if you have either read or write permission, so no
+ * need for a permission check here.
+ */
+
+ /*
+ * Note: overflow in the additions is possible, but since we will reject
+ * negative results, we don't need any extra test for that.
+ */
+ switch (whence)
+ {
+ case SEEK_SET:
+ newoffset = offset;
+ break;
+ case SEEK_CUR:
+ newoffset = obj_desc->offset + offset;
+ break;
+ case SEEK_END:
+ newoffset = inv_getsize(obj_desc) + offset;
+ break;
+ default:
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid whence setting: %d", whence)));
+ newoffset = 0; /* keep compiler quiet */
+ break;
+ }
+
+ /*
+ * use errmsg_internal here because we don't want to expose INT64_FORMAT
+ * in translatable strings; doing better is not worth the trouble
+ */
+ if (newoffset < 0 || newoffset > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_internal("invalid large object seek target: " INT64_FORMAT,
+ newoffset)));
+
+ obj_desc->offset = newoffset;
+ return newoffset;
+}
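The seek target is later split into a page number and an intra-page offset; a hedged helper sketch of that arithmetic (matching the expressions used inline by inv_read() and inv_write() below):

/*
 * Sketch only: map a byte offset onto the (pageno, in-page offset) pair
 * that the read/write routines below compute inline.
 */
static inline void
lo_offset_to_page(uint64 offset, int32 *pageno, int32 *off)
{
	*pageno = (int32) (offset / LOBLKSIZE);
	*off = (int32) (offset % LOBLKSIZE);
}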
+
+int64
+inv_tell(LargeObjectDesc *obj_desc)
+{
+ Assert(PointerIsValid(obj_desc));
+
+ /*
+ * We allow seek/tell if you have either read or write permission, so no
+ * need for a permission check here.
+ */
+
+ return obj_desc->offset;
+}
+
+int
+inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
+{
+ int nread = 0;
+ int64 n;
+ int64 off;
+ int len;
+ int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE);
+ uint64 pageoff;
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple tuple;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ if ((obj_desc->flags & IFS_RDLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ if (nbytes <= 0)
+ return 0;
+
+ open_lo_relation();
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ while ((tuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ Form_pg_largeobject data;
+ bytea *datafield;
+ bool pfreeit;
+
+ if (HeapTupleHasNulls(tuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ data = (Form_pg_largeobject) GETSTRUCT(tuple);
+
+ /*
+ * We expect the indexscan will deliver pages in order. However,
+ * there may be missing pages if the LO contains unwritten "holes". We
+ * want missing sections to read out as zeroes.
+ */
+ pageoff = ((uint64) data->pageno) * LOBLKSIZE;
+ if (pageoff > obj_desc->offset)
+ {
+ n = pageoff - obj_desc->offset;
+ n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
+ MemSet(buf + nread, 0, n);
+ nread += n;
+ obj_desc->offset += n;
+ }
+
+ if (nread < nbytes)
+ {
+ Assert(obj_desc->offset >= pageoff);
+ off = (int) (obj_desc->offset - pageoff);
+ Assert(off >= 0 && off < LOBLKSIZE);
+
+ getdatafield(data, &datafield, &len, &pfreeit);
+ if (len > off)
+ {
+ n = len - off;
+ n = (n <= (nbytes - nread)) ? n : (nbytes - nread);
+ memcpy(buf + nread, VARDATA(datafield) + off, n);
+ nread += n;
+ obj_desc->offset += n;
+ }
+ if (pfreeit)
+ pfree(datafield);
+ }
+
+ if (nread >= nbytes)
+ break;
+ }
+
+ systable_endscan_ordered(sd);
+
+ return nread;
+}
+
+int
+inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
+{
+ int nwritten = 0;
+ int n;
+ int off;
+ int len;
+ int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE);
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple oldtuple;
+ Form_pg_largeobject olddata;
+ bool neednextpage;
+ bytea *datafield;
+ bool pfreeit;
+ union
+ {
+ bytea hdr;
+ /* this is to make the union big enough for a LO data chunk: */
+ char data[LOBLKSIZE + VARHDRSZ];
+ /* ensure union is aligned well enough: */
+ int32 align_it;
+ } workbuf;
+ char *workb = VARDATA(&workbuf.hdr);
+ HeapTuple newtup;
+ Datum values[Natts_pg_largeobject];
+ bool nulls[Natts_pg_largeobject];
+ bool replace[Natts_pg_largeobject];
+ CatalogIndexState indstate;
+
+ Assert(PointerIsValid(obj_desc));
+ Assert(buf != NULL);
+
+ /* enforce writability because snapshot is probably wrong otherwise */
+ if ((obj_desc->flags & IFS_WRLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ if (nbytes <= 0)
+ return 0;
+
+ /* this addition can't overflow because nbytes is only int32 */
+ if ((nbytes + obj_desc->offset) > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg("invalid large object write request size: %d",
+ nbytes)));
+
+ open_lo_relation();
+
+ indstate = CatalogOpenIndexes(lo_heap_r);
+
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ oldtuple = NULL;
+ olddata = NULL;
+ neednextpage = true;
+
+ while (nwritten < nbytes)
+ {
+ /*
+ * If possible, get next pre-existing page of the LO. We expect the
+ * indexscan will deliver these in order --- but there may be holes.
+ */
+ if (neednextpage)
+ {
+ if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ if (HeapTupleHasNulls(oldtuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
+ Assert(olddata->pageno >= pageno);
+ }
+ neednextpage = false;
+ }
+
+ /*
+ * If we have a pre-existing page, see if it is the page we want to
+ * write, or a later one.
+ */
+ if (olddata != NULL && olddata->pageno == pageno)
+ {
+ /*
+ * Update an existing page with fresh data.
+ *
+ * First, load old data into workbuf
+ */
+ getdatafield(olddata, &datafield, &len, &pfreeit);
+ memcpy(workb, VARDATA(datafield), len);
+ if (pfreeit)
+ pfree(datafield);
+
+ /*
+ * Fill any hole
+ */
+ off = (int) (obj_desc->offset % LOBLKSIZE);
+ if (off > len)
+ MemSet(workb + len, 0, off - len);
+
+ /*
+ * Insert appropriate portion of new data
+ */
+ n = LOBLKSIZE - off;
+ n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
+ memcpy(workb + off, buf + nwritten, n);
+ nwritten += n;
+ obj_desc->offset += n;
+ off += n;
+ /* compute valid length of new page */
+ len = (len >= off) ? len : off;
+ SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ memset(replace, false, sizeof(replace));
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ replace[Anum_pg_largeobject_data - 1] = true;
+ newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
+ values, nulls, replace);
+ CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
+ indstate);
+ heap_freetuple(newtup);
+
+ /*
+ * We're done with this old page.
+ */
+ oldtuple = NULL;
+ olddata = NULL;
+ neednextpage = true;
+ }
+ else
+ {
+ /*
+ * Write a brand new page.
+ *
+ * First, fill any hole
+ */
+ off = (int) (obj_desc->offset % LOBLKSIZE);
+ if (off > 0)
+ MemSet(workb, 0, off);
+
+ /*
+ * Insert appropriate portion of new data
+ */
+ n = LOBLKSIZE - off;
+ n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten);
+ memcpy(workb + off, buf + nwritten, n);
+ nwritten += n;
+ obj_desc->offset += n;
+ /* compute valid length of new page */
+ len = off + n;
+ SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
+ values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
+ CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
+ heap_freetuple(newtup);
+ }
+ pageno++;
+ }
+
+ systable_endscan_ordered(sd);
+
+ CatalogCloseIndexes(indstate);
+
+ /*
+ * Advance command counter so that my tuple updates will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+
+ return nwritten;
+}
+
+void
+inv_truncate(LargeObjectDesc *obj_desc, int64 len)
+{
+ int32 pageno = (int32) (len / LOBLKSIZE);
+ int32 off;
+ ScanKeyData skey[2];
+ SysScanDesc sd;
+ HeapTuple oldtuple;
+ Form_pg_largeobject olddata;
+ union
+ {
+ bytea hdr;
+ /* this is to make the union big enough for a LO data chunk: */
+ char data[LOBLKSIZE + VARHDRSZ];
+ /* ensure union is aligned well enough: */
+ int32 align_it;
+ } workbuf;
+ char *workb = VARDATA(&workbuf.hdr);
+ HeapTuple newtup;
+ Datum values[Natts_pg_largeobject];
+ bool nulls[Natts_pg_largeobject];
+ bool replace[Natts_pg_largeobject];
+ CatalogIndexState indstate;
+
+ Assert(PointerIsValid(obj_desc));
+
+ /* enforce writability because snapshot is probably wrong otherwise */
+ if ((obj_desc->flags & IFS_WRLOCK) == 0)
+ ereport(ERROR,
+ (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+ errmsg("permission denied for large object %u",
+ obj_desc->id)));
+
+ /*
+ * use errmsg_internal here because we don't want to expose INT64_FORMAT
+ * in translatable strings; doing better is not worth the trouble
+ */
+ if (len < 0 || len > MAX_LARGE_OBJECT_SIZE)
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ errmsg_internal("invalid large object truncation target: " INT64_FORMAT,
+ len)));
+
+ open_lo_relation();
+
+ indstate = CatalogOpenIndexes(lo_heap_r);
+
+ /*
+ * Set up to find all pages with desired loid and pageno >= target
+ */
+ ScanKeyInit(&skey[0],
+ Anum_pg_largeobject_loid,
+ BTEqualStrategyNumber, F_OIDEQ,
+ ObjectIdGetDatum(obj_desc->id));
+
+ ScanKeyInit(&skey[1],
+ Anum_pg_largeobject_pageno,
+ BTGreaterEqualStrategyNumber, F_INT4GE,
+ Int32GetDatum(pageno));
+
+ sd = systable_beginscan_ordered(lo_heap_r, lo_index_r,
+ obj_desc->snapshot, 2, skey);
+
+ /*
+ * If possible, get the page the truncation point is in. The truncation
+ * point may be beyond the end of the LO or in a hole.
+ */
+ olddata = NULL;
+ if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ if (HeapTupleHasNulls(oldtuple)) /* paranoia */
+ elog(ERROR, "null field found in pg_largeobject");
+ olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple);
+ Assert(olddata->pageno >= pageno);
+ }
+
+ /*
+ * If we found the page of the truncation point we need to truncate the
+ * data in it. Otherwise if we're in a hole, we need to create a page to
+ * mark the end of data.
+ */
+ if (olddata != NULL && olddata->pageno == pageno)
+ {
+ /* First, load old data into workbuf */
+ bytea *datafield;
+ int pagelen;
+ bool pfreeit;
+
+ getdatafield(olddata, &datafield, &pagelen, &pfreeit);
+ memcpy(workb, VARDATA(datafield), pagelen);
+ if (pfreeit)
+ pfree(datafield);
+
+ /*
+ * Fill any hole
+ */
+ off = len % LOBLKSIZE;
+ if (off > pagelen)
+ MemSet(workb + pagelen, 0, off - pagelen);
+
+ /* compute length of new page */
+ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);
+
+ /*
+ * Form and insert updated tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ memset(replace, false, sizeof(replace));
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ replace[Anum_pg_largeobject_data - 1] = true;
+ newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r),
+ values, nulls, replace);
+ CatalogTupleUpdateWithInfo(lo_heap_r, &newtup->t_self, newtup,
+ indstate);
+ heap_freetuple(newtup);
+ }
+ else
+ {
+ /*
+ * If the first page we found was after the truncation point, we're in
+ * a hole that we'll fill, but we need to delete the later page
+ * because the loop below won't visit it again.
+ */
+ if (olddata != NULL)
+ {
+ Assert(olddata->pageno > pageno);
+ CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
+ }
+
+ /*
+ * Write a brand new page.
+ *
+ * Fill the hole up to the truncation point
+ */
+ off = len % LOBLKSIZE;
+ if (off > 0)
+ MemSet(workb, 0, off);
+
+ /* compute length of new page */
+ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ);
+
+ /*
+ * Form and insert new tuple
+ */
+ memset(values, 0, sizeof(values));
+ memset(nulls, false, sizeof(nulls));
+ values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
+ values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
+ values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
+ newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
+ CatalogTupleInsertWithInfo(lo_heap_r, newtup, indstate);
+ heap_freetuple(newtup);
+ }
+
+ /*
+ * Delete any pages after the truncation point. If the initial search
+ * didn't find a page, then of course there's nothing more to do.
+ */
+ if (olddata != NULL)
+ {
+ while ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL)
+ {
+ CatalogTupleDelete(lo_heap_r, &oldtuple->t_self);
+ }
+ }
+
+ systable_endscan_ordered(sd);
+
+ CatalogCloseIndexes(indstate);
+
+ /*
+ * Advance command counter so that tuple updates will be seen by later
+ * large-object operations in this transaction.
+ */
+ CommandCounterIncrement();
+}
diff --git a/src/backend/storage/large_object/meson.build b/src/backend/storage/large_object/meson.build
new file mode 100644
index 0000000..4d9d893
--- /dev/null
+++ b/src/backend/storage/large_object/meson.build
@@ -0,0 +1,5 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'inv_api.c',
+)
diff --git a/src/backend/storage/lmgr/.gitignore b/src/backend/storage/lmgr/.gitignore
new file mode 100644
index 0000000..dab4c3f
--- /dev/null
+++ b/src/backend/storage/lmgr/.gitignore
@@ -0,0 +1,3 @@
+/lwlocknames.c
+/lwlocknames.h
+/s_lock_test
diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile
new file mode 100644
index 0000000..b25b7ee
--- /dev/null
+++ b/src/backend/storage/lmgr/Makefile
@@ -0,0 +1,52 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/lmgr
+#
+# IDENTIFICATION
+# src/backend/storage/lmgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/lmgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ condition_variable.o \
+ deadlock.o \
+ lmgr.o \
+ lock.o \
+ lwlock.o \
+ lwlocknames.o \
+ predicate.o \
+ proc.o \
+ s_lock.o \
+ spin.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+ifdef TAS
+TASPATH = $(top_builddir)/src/backend/port/tas.o
+endif
+
+s_lock_test: s_lock.c $(top_builddir)/src/common/libpgcommon.a $(top_builddir)/src/port/libpgport.a
+ $(CC) $(CPPFLAGS) $(CFLAGS) -DS_LOCK_TEST=1 $(srcdir)/s_lock.c \
+ $(TASPATH) -L $(top_builddir)/src/common -lpgcommon \
+ -L $(top_builddir)/src/port -lpgport -o s_lock_test
+
+# see notes in src/backend/parser/Makefile
+lwlocknames.c: lwlocknames.h
+ touch $@
+
+lwlocknames.h: $(top_srcdir)/src/backend/storage/lmgr/lwlocknames.txt generate-lwlocknames.pl
+ $(PERL) $(srcdir)/generate-lwlocknames.pl $<
+
+check: s_lock_test
+ ./s_lock_test
+
+clean distclean:
+ rm -f s_lock_test
+
+maintainer-clean: clean
+ rm -f lwlocknames.h lwlocknames.c
diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README
new file mode 100644
index 0000000..45de0fd
--- /dev/null
+++ b/src/backend/storage/lmgr/README
@@ -0,0 +1,731 @@
+src/backend/storage/lmgr/README
+
+Locking Overview
+================
+
+Postgres uses four types of interprocess locks:
+
+* Spinlocks. These are intended for *very* short-term locks. If a lock
+is to be held more than a few dozen instructions, or across any sort of
+kernel call (or even a call to a nontrivial subroutine), don't use a
+spinlock. Spinlocks are primarily used as infrastructure for lightweight
+locks. They are implemented using a hardware atomic-test-and-set
+instruction, if available. Waiting processes busy-loop until they can
+get the lock. There is no provision for deadlock detection, automatic
+release on error, or any other nicety. There is a timeout if the lock
+cannot be obtained after a minute or so (which is approximately forever in
+comparison to the intended lock hold time, so this is certainly an error
+condition).
+
+* Lightweight locks (LWLocks). These locks are typically used to
+interlock access to datastructures in shared memory. LWLocks support
+both exclusive and shared lock modes (for read/write and read-only
+access to a shared object). There is no provision for deadlock
+detection, but the LWLock manager will automatically release held
+LWLocks during elog() recovery, so it is safe to raise an error while
+holding LWLocks. Obtaining or releasing an LWLock is quite fast (a few
+dozen instructions) when there is no contention for the lock. When a
+process has to wait for an LWLock, it blocks on a SysV semaphore so as
+to not consume CPU time. Waiting processes will be granted the lock in
+arrival order. There is no timeout.
+
+* Regular locks (a/k/a heavyweight locks). The regular lock manager
+supports a variety of lock modes with table-driven semantics, and it has
+full deadlock detection and automatic release at transaction end.
+Regular locks should be used for all user-driven lock requests.
+
+* SIReadLock predicate locks. See separate README-SSI file for details.
+
+Acquisition of either a spinlock or a lightweight lock causes query
+cancel and die() interrupts to be held off until all such locks are
+released. No such restriction exists for regular locks, however. Also
+note that we can accept query cancel and die() interrupts while waiting
+for a regular lock, but we will not accept them while waiting for
+spinlocks or LW locks. It is therefore not a good idea to use LW locks
+when the wait time might exceed a few seconds.
+
+The rest of this README file discusses the regular lock manager in detail.
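A hedged illustration (not part of the original README) of the two lighter-weight APIs in use, protecting a shared-memory counter:

#include "postgres.h"
#include "storage/lwlock.h"
#include "storage/spin.h"

/* Sketch only: a shared structure whose counter is spinlock-protected. */
typedef struct SharedCounter
{
	slock_t		mutex;
	uint64		value;
} SharedCounter;

static void
bump_counter(SharedCounter *c)
{
	SpinLockAcquire(&c->mutex); /* keep the critical section tiny */
	c->value++;
	SpinLockRelease(&c->mutex);
}

/* Sketch only: read a value under an LWLock in shared mode. */
static uint64
read_value(LWLock *lock, const uint64 *value)
{
	uint64		v;

	LWLockAcquire(lock, LW_SHARED);
	v = *value;
	LWLockRelease(lock);

	return v;
}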
+
+
+Lock Data Structures
+--------------------
+
+Lock methods describe the overall locking behavior. Currently there are
+two lock methods: DEFAULT and USER.
+
+Lock modes describe the type of the lock (read/write or shared/exclusive).
+In principle, each lock method can have its own set of lock modes with
+different conflict rules, but currently DEFAULT and USER methods use
+identical lock mode sets. See src/include/storage/lock.h for more details.
+(Lock modes are also called lock types in some places in the code and
+documentation.)
+
+There are two main methods for recording locks in shared memory. The primary
+mechanism uses two main structures: the per-lockable-object LOCK struct, and
+the per-lock-and-requestor PROCLOCK struct. A LOCK object exists for each
+lockable object that currently has locks held or requested on it. A PROCLOCK
+struct exists for each backend that is holding or requesting lock(s) on each
+LOCK object.
+
+There is also a special "fast path" mechanism which backends may use to
+record a limited number of locks with very specific characteristics: they must
+use the DEFAULT lockmethod; they must represent a lock on a database relation
+(not a shared relation); they must be a "weak" lock which is unlikely to
+conflict (AccessShareLock, RowShareLock, or RowExclusiveLock); and the system
+must be able to quickly verify that no conflicting locks could possibly be
+present. See "Fast Path Locking", below, for more details.
+
+Each backend also maintains an unshared LOCALLOCK structure for each lockable
+object and lock mode that it is currently holding or requesting. The shared
+lock structures only allow a single lock grant to be made per lockable
+object/lock mode/backend. Internally to a backend, however, the same lock may
+be requested and perhaps released multiple times in a transaction, and it can
+also be held both transactionally and session-wide. The internal request
+counts are held in LOCALLOCK so that the shared data structures need not be
+accessed to alter them.
+
+---------------------------------------------------------------------------
+
+The lock manager's LOCK objects contain:
+
+tag -
+ The key fields that are used for hashing locks in the shared memory
+ lock hash table. The contents of the tag essentially define an
+ individual lockable object. See include/storage/lock.h for details
+ about the supported types of lockable objects. This is declared as
+ a separate struct to ensure that we always zero out the correct number
+ of bytes. It is critical that any alignment-padding bytes the compiler
+ might insert in the struct be zeroed out, else the hash computation
+ will be random. (Currently, we are careful to define struct LOCKTAG
+ so that there are no padding bytes.)
+
+grantMask -
+ This bitmask indicates what types of locks are currently held on the
+ given lockable object. It is used (against the lock table's conflict
+ table) to determine if a new lock request will conflict with existing
+ lock types held. Conflicts are determined by bitwise AND operations
+ between the grantMask and the conflict table entry for the requested
+ lock type. Bit i of grantMask is 1 if and only if granted[i] > 0.
+
+waitMask -
+ This bitmask shows the types of locks being waited for. Bit i of waitMask
+ is 1 if and only if requested[i] > granted[i].
+
+procLocks -
+ This is a shared memory queue of all the PROCLOCK structs associated with
+ the lock object. Note that both granted and waiting PROCLOCKs are in this
+ list (indeed, the same PROCLOCK might have some already-granted locks and
+ be waiting for more!).
+
+waitProcs -
+ This is a shared memory queue of all PGPROC structures corresponding to
+ backends that are waiting (sleeping) until another backend releases this
+ lock. The process structure holds the information needed to determine
+ if it should be woken up when the lock is released.
+
+nRequested -
+ Keeps a count of how many times this lock has been attempted to be
+ acquired. The count includes attempts by processes which were put
+ to sleep due to conflicts. It also counts the same backend twice
+ if, for example, a backend process first acquires a read and then
+ acquires a write. (But multiple acquisitions of the same lock/lock mode
+ within a backend are not multiply counted here; they are recorded
+ only in the backend's LOCALLOCK structure.)
+
+requested -
+ Keeps a count of how many locks of each type have been attempted. Only
+ elements 1 through MAX_LOCKMODES-1 are used as they correspond to the lock
+ type defined constants. Summing the values of requested[] should come out
+ equal to nRequested.
+
+nGranted -
+ Keeps count of how many times this lock has been successfully acquired.
+ This count does not include attempts that are waiting due to conflicts.
+ Otherwise the counting rules are the same as for nRequested.
+
+granted -
+ Keeps count of how many locks of each type are currently held. Once again
+ only elements 1 through MAX_LOCKMODES-1 are used (0 is not). Also, like
+ requested[], summing the values of granted[] should total to the value
+ of nGranted.
+
+We should always have 0 <= nGranted <= nRequested, and
+0 <= granted[i] <= requested[i] for each i. When all the request counts
+go to zero, the LOCK object is no longer needed and can be freed.
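+
+To make the counting rules concrete, here is a toy model of just these
+counters (the struct, names, and array size are invented for illustration;
+the real declaration is in include/storage/lock.h), with the invariants
+above written out as assertions:
+
+    /* Toy model of the per-object counters described above. */
+    #include <assert.h>
+
+    #define TOY_MAX_LOCKMODES 10
+
+    typedef struct ToyLock
+    {
+        int nRequested;                   /* all requests, waiting or not */
+        int requested[TOY_MAX_LOCKMODES]; /* per mode; slot 0 unused */
+        int nGranted;                     /* successfully granted requests */
+        int granted[TOY_MAX_LOCKMODES];   /* per mode; slot 0 unused */
+    } ToyLock;
+
+    void
+    toy_lock_check_invariants(const ToyLock *lock)
+    {
+        int sumRequested = 0;
+        int sumGranted = 0;
+
+        for (int i = 1; i < TOY_MAX_LOCKMODES; i++)
+        {
+            assert(0 <= lock->granted[i] &&
+                   lock->granted[i] <= lock->requested[i]);
+            sumRequested += lock->requested[i];
+            sumGranted += lock->granted[i];
+        }
+        assert(sumRequested == lock->nRequested);
+        assert(sumGranted == lock->nGranted);
+        assert(0 <= lock->nGranted && lock->nGranted <= lock->nRequested);
+    }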
+
+---------------------------------------------------------------------------
+
+The lock manager's PROCLOCK objects contain:
+
+tag -
+ The key fields that are used for hashing entries in the shared memory
+ PROCLOCK hash table. This is declared as a separate struct to ensure that
+ we always zero out the correct number of bytes. It is critical that any
+ alignment-padding bytes the compiler might insert in the struct be zeroed
+ out, else the hash computation will be random. (Currently, we are careful
+ to define struct PROCLOCKTAG so that there are no padding bytes.)
+
+ tag.myLock
+ Pointer to the shared LOCK object this PROCLOCK is for.
+
+ tag.myProc
+ Pointer to the PGPROC of backend process that owns this PROCLOCK.
+
+ Note: it's OK to use pointers here because a PROCLOCK never outlives
+ either its lock or its proc. The tag is therefore unique for as long
+ as it needs to be, even though the same tag values might mean something
+ else at other times.
+
+holdMask -
+ A bitmask for the lock modes successfully acquired by this PROCLOCK.
+ This should be a subset of the LOCK object's grantMask, and also a
+ subset of the PGPROC object's heldLocks mask (if the PGPROC is
+ currently waiting for another lock mode on this lock).
+
+releaseMask -
+ A bitmask for the lock modes due to be released during LockReleaseAll.
+ This must be a subset of the holdMask. Note that it is modified without
+ taking the partition LWLock, and therefore it is unsafe for any
+ backend except the one owning the PROCLOCK to examine/change it.
+
+lockLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same LOCK.
+
+procLink -
+ List link for shared memory queue of all the PROCLOCK objects for the
+ same backend.
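+
+The mask relationships above are plain bit tests; a minimal sketch (the
+type and function names are invented, and the mask width is arbitrary):
+
+    /* Illustrative subset checks for the mask relationships above. */
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    typedef uint32_t ToyLockMask;   /* bit i set => lock mode i */
+
+    /* A PROCLOCK's holdMask must be a subset of its LOCK's grantMask. */
+    bool
+    toy_hold_is_subset_of_grant(ToyLockMask holdMask, ToyLockMask grantMask)
+    {
+        return (holdMask & ~grantMask) == 0;
+    }
+
+    /* releaseMask must in turn be a subset of holdMask. */
+    bool
+    toy_release_is_subset_of_hold(ToyLockMask releaseMask, ToyLockMask holdMask)
+    {
+        return (releaseMask & ~holdMask) == 0;
+    }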
+
+---------------------------------------------------------------------------
+
+
+Lock Manager Internal Locking
+-----------------------------
+
+Before PostgreSQL 8.2, all of the shared-memory data structures used by
+the lock manager were protected by a single LWLock, the LockMgrLock;
+any operation involving these data structures had to exclusively lock
+LockMgrLock. Not too surprisingly, this became a contention bottleneck.
+To reduce contention, the lock manager's data structures have been split
+into multiple "partitions", each protected by an independent LWLock.
+Most operations only need to lock the single partition they are working in.
+Here are the details:
+
+* Each possible lock is assigned to one partition according to a hash of
+its LOCKTAG value. The partition's LWLock is considered to protect all the
+LOCK objects of that partition as well as their subsidiary PROCLOCKs.
+
+* The shared-memory hash tables for LOCKs and PROCLOCKs are organized
+so that different partitions use different hash chains, and thus there
+is no conflict in working with objects in different partitions. This
+is supported directly by dynahash.c's "partitioned table" mechanism
+for the LOCK table: we need only ensure that the partition number is
+taken from the low-order bits of the dynahash hash value for the LOCKTAG.
+To make it work for PROCLOCKs, we have to ensure that a PROCLOCK's hash
+value has the same low-order bits as its associated LOCK. This requires
+a specialized hash function (see proclock_hash).
+
+* Formerly, each PGPROC had a single list of PROCLOCKs belonging to it.
+This has now been split into per-partition lists, so that access to a
+particular PROCLOCK list can be protected by the associated partition's
+LWLock. (This rule allows one backend to manipulate another backend's
+PROCLOCK lists, which was not originally necessary but is now required in
+connection with fast-path locking; see below.)
+
+* The other lock-related fields of a PGPROC are only interesting when
+the PGPROC is waiting for a lock, so we consider that they are protected
+by the partition LWLock of the awaited lock.
+
+For normal lock acquisition and release, it is sufficient to lock the
+partition containing the desired lock. Deadlock checking needs to touch
+multiple partitions in general; for simplicity, we just make it lock all
+the partitions in partition-number order. (To prevent LWLock deadlock,
+we establish the rule that any backend needing to lock more than one
+partition at once must lock them in partition-number order.) It's
+possible that deadlock checking could be done without touching every
+partition in typical cases, but since in a properly functioning system
+deadlock checking should not occur often enough to be performance-critical,
+trying to make this work does not seem a productive use of effort.
+
+A backend's internal LOCALLOCK hash table is not partitioned. We do store
+a copy of the locktag hash code in LOCALLOCK table entries, from which the
+partition number can be computed, but this is a straight speed-for-space
+tradeoff: we could instead recalculate the partition number from the LOCKTAG
+when needed.
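+
+The arithmetic implied here is trivial; in sketch form (the constant and
+names below are invented rather than the real definitions, and a
+power-of-two partition count is assumed), the partition is simply the
+low-order bits of the stored locktag hash code:
+
+    /* Illustrative partition selection from a locktag hash code. */
+    #include <stdint.h>
+
+    #define TOY_NUM_LOCK_PARTITIONS 16      /* must be a power of two */
+
+    static inline uint32_t
+    toy_lock_hash_partition(uint32_t hashcode)
+    {
+        /* the low-order bits of the hash value select the partition */
+        return hashcode & (TOY_NUM_LOCK_PARTITIONS - 1);
+    }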
+
+
+Fast Path Locking
+-----------------
+
+Fast path locking is a special purpose mechanism designed to reduce the
+overhead of taking and releasing certain types of locks which are taken
+and released very frequently but rarely conflict. Currently, this includes
+two categories of locks:
+
+(1) Weak relation locks. SELECT, INSERT, UPDATE, and DELETE must acquire a
+lock on every relation they operate on, as well as various system catalogs
+that can be used internally. Many DML operations can proceed in parallel
+against the same table at the same time; only DDL operations such as
+CLUSTER, ALTER TABLE, or DROP -- or explicit user action such as LOCK TABLE
+-- will create lock conflicts with the "weak" locks (AccessShareLock,
+RowShareLock, RowExclusiveLock) acquired by DML operations.
+
+(2) VXID locks. Every transaction takes a lock on its own virtual
+transaction ID. Currently, the only operations that wait for these locks
+are CREATE INDEX CONCURRENTLY and Hot Standby (in the case of a conflict),
+so most VXID locks are taken and released by the owner without anyone else
+needing to care.
+
+The primary locking mechanism does not cope well with this workload. Even
+though the lock manager locks are partitioned, the locktag for any given
+relation still falls in one, and only one, partition. Thus, if many short
+queries are accessing the same relation, the lock manager partition lock for
+that partition becomes a contention bottleneck. This effect is measurable
+even on 2-core servers, and becomes very pronounced as core count increases.
+
+To alleviate this bottleneck, beginning in PostgreSQL 9.2, each backend is
+permitted to record a limited number of locks on unshared relations in an
+array within its PGPROC structure, rather than using the primary lock table.
+This mechanism can only be used when the locker can verify that no conflicting
+locks exist at the time of taking the lock.
+
+A key point of this algorithm is that it must be possible to verify the
+absence of possibly conflicting locks without fighting over a shared LWLock or
+spinlock. Otherwise, this effort would simply move the contention bottleneck
+from one place to another. We accomplish this using an array of 1024 integer
+counters, which are in effect a 1024-way partitioning of the lock space.
+Each counter records the number of "strong" locks (that is, ShareLock,
+ShareRowExclusiveLock, ExclusiveLock, and AccessExclusiveLock) on unshared
+relations that fall into that partition. When this counter is non-zero, the
+fast path mechanism may not be used to take new relation locks within that
+partition. A strong locker bumps the counter and then scans each per-backend
+array for matching fast-path locks; any which are found must be transferred to
+the primary lock table before attempting to acquire the lock, to ensure proper
+lock conflict and deadlock detection.
+
+On an SMP system, we must guarantee proper memory synchronization. Here we
+rely on the fact that LWLock acquisition acts as a memory sequence point: if
+A performs a store, A and B both acquire an LWLock in either order, and B
+then performs a load on the same memory location, it is guaranteed to see
+A's store. In this case, each backend's fast-path lock queue is protected
+by an LWLock. A backend wishing to acquire a fast-path lock grabs this
+LWLock before examining FastPathStrongRelationLocks to check for the presence
+of a conflicting strong lock. And the backend attempting to acquire a strong
+lock, because it must transfer any matching weak locks taken via the fast-path
+mechanism to the shared lock table, will acquire every LWLock protecting a
+backend fast-path queue in turn. So, if we examine
+FastPathStrongRelationLocks and see a zero, then either the value is truly
+zero, or if it is a stale value, the strong locker has yet to acquire the
+per-backend LWLock we now hold (or, indeed, even the first per-backend LWLock)
+and will notice any weak lock we take when it does.
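+
+The following sketch models both sides of this protocol. Everything in it
+is invented for illustration (array sizes, names, a pthread mutex standing
+in for each per-backend LWLock, and C11 atomics standing in for the
+spinlock-protected strong-lock counters); it is not the real
+implementation, but it shows the ordering that makes the scheme safe:
+
+    #include <pthread.h>
+    #include <stdatomic.h>
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    #define TOY_FP_PARTITIONS     1024  /* strong-lock counters */
+    #define TOY_FP_SLOTS_PER_PROC 16    /* fast-path slots per backend */
+    #define TOY_MAX_BACKENDS      8
+
+    /* One "strong lock" counter per partition of the lock space. */
+    static _Atomic uint32_t toy_strong_count[TOY_FP_PARTITIONS];
+
+    /* Per-backend fast-path queue, protected by a per-backend lock;
+     * the mutexes are assumed initialized at startup. */
+    typedef struct ToyBackend
+    {
+        pthread_mutex_t fpLock;     /* stands in for the per-backend LWLock */
+        uint32_t        fpRelId[TOY_FP_SLOTS_PER_PROC]; /* 0 = empty slot */
+    } ToyBackend;
+
+    static ToyBackend toy_backends[TOY_MAX_BACKENDS];
+
+    static uint32_t
+    toy_fp_partition(uint32_t locktag_hash)
+    {
+        return locktag_hash % TOY_FP_PARTITIONS;
+    }
+
+    /*
+     * Weak locker: take our own fast-path lock *first*, then check the
+     * strong-lock counter.  If it is zero, record the lock locally;
+     * otherwise report failure so the caller uses the primary lock table.
+     */
+    bool
+    toy_try_fast_path(ToyBackend *me, uint32_t relid, uint32_t locktag_hash)
+    {
+        uint32_t part = toy_fp_partition(locktag_hash);
+        bool ok = false;
+
+        pthread_mutex_lock(&me->fpLock);
+        if (atomic_load(&toy_strong_count[part]) == 0)
+        {
+            for (int i = 0; i < TOY_FP_SLOTS_PER_PROC; i++)
+            {
+                if (me->fpRelId[i] == 0)
+                {
+                    me->fpRelId[i] = relid;
+                    ok = true;
+                    break;
+                }
+            }
+        }
+        pthread_mutex_unlock(&me->fpLock);
+        return ok;
+    }
+
+    /*
+     * Strong locker: bump the counter first, then visit every backend's
+     * fast-path queue in turn, moving any matching entries to the shared
+     * lock table (represented here by simply clearing the slot).
+     */
+    void
+    toy_begin_strong_lock(uint32_t relid, uint32_t locktag_hash)
+    {
+        uint32_t part = toy_fp_partition(locktag_hash);
+
+        atomic_fetch_add(&toy_strong_count[part], 1);
+
+        for (int b = 0; b < TOY_MAX_BACKENDS; b++)
+        {
+            pthread_mutex_lock(&toy_backends[b].fpLock);
+            for (int i = 0; i < TOY_FP_SLOTS_PER_PROC; i++)
+            {
+                if (toy_backends[b].fpRelId[i] == relid)
+                    toy_backends[b].fpRelId[i] = 0;  /* "transferred" */
+            }
+            pthread_mutex_unlock(&toy_backends[b].fpLock);
+        }
+    }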
+
+Fast-path VXID locks do not use the FastPathStrongRelationLocks table. The
+first lock taken on a VXID is always the ExclusiveLock taken by its owner.
+Any subsequent lockers are share lockers waiting for the VXID to terminate.
+Indeed, the only reason VXID locks use the lock manager at all (rather than
+waiting for the VXID to terminate via some other method) is for deadlock
+detection. Thus, the initial VXID lock can *always* be taken via the fast
+path without checking for conflicts. Any subsequent locker must check
+whether the lock has been transferred to the main lock table, and if not,
+do so. The backend owning the VXID must be careful to clean up any entry
+made in the main lock table at end of transaction.
+
+Deadlock detection does not need to examine the fast-path data structures,
+because any lock that could possibly be involved in a deadlock must have
+been transferred to the main tables beforehand.
+
+
+The Deadlock Detection Algorithm
+--------------------------------
+
+Since we allow user transactions to request locks in any order, deadlock
+is possible. We use a deadlock detection/breaking algorithm that is
+fairly standard in essence, but there are many special considerations
+needed to deal with Postgres' generalized locking model.
+
+A key design consideration is that we want to make routine operations
+(lock grant and release) run quickly when there is no deadlock, and
+avoid the overhead of deadlock handling as much as possible. We do this
+using an "optimistic waiting" approach: if a process cannot acquire the
+lock it wants immediately, it goes to sleep without any deadlock check.
+But it also sets a delay timer, with a delay of DeadlockTimeout
+milliseconds (typically set to one second). If the delay expires before
+the process is granted the lock it wants, it runs the deadlock
+detection/breaking code. Normally this code will determine that there is
+no deadlock condition, and then the process will go back to sleep and
+wait quietly until it is granted the lock. But if a deadlock condition
+does exist, it will be resolved, usually by aborting the detecting
+process' transaction. In this way, we avoid deadlock handling overhead
+whenever the wait time for a lock is less than DeadlockTimeout, while
+not unreasonably delaying detection when a deadlock does exist.
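+
+In sketch form, the waiting side of this looks roughly like the following
+(the helper names and the pthread condition variable standing in for the
+process semaphore/latch are invented; error handling and the actual
+deadlock search are omitted):
+
+    #include <errno.h>
+    #include <pthread.h>
+    #include <stdbool.h>
+    #include <time.h>
+
+    typedef struct ToyWaiter
+    {
+        pthread_mutex_t mutex;
+        pthread_cond_t  granted_cond;
+        bool            granted;    /* set by whoever grants us the lock */
+    } ToyWaiter;
+
+    /* Stub standing in for the real deadlock detection/breaking code. */
+    static bool
+    toy_check_deadlock(void)
+    {
+        return false;   /* "normally ... no deadlock condition" */
+    }
+
+    /* Returns true once granted, false if a deadlock had to be broken. */
+    bool
+    toy_wait_for_lock(ToyWaiter *w, long deadlock_timeout_ms)
+    {
+        bool checked = false;
+
+        pthread_mutex_lock(&w->mutex);
+        while (!w->granted)
+        {
+            if (checked)
+            {
+                /* already checked once: just wait quietly to be granted */
+                pthread_cond_wait(&w->granted_cond, &w->mutex);
+                continue;
+            }
+
+            /* sleep optimistically; no deadlock check unless we time out */
+            struct timespec deadline;
+            clock_gettime(CLOCK_REALTIME, &deadline);
+            deadline.tv_sec += deadlock_timeout_ms / 1000;
+            deadline.tv_nsec += (deadlock_timeout_ms % 1000) * 1000000L;
+            if (deadline.tv_nsec >= 1000000000L)
+            {
+                deadline.tv_sec++;
+                deadline.tv_nsec -= 1000000000L;
+            }
+
+            if (pthread_cond_timedwait(&w->granted_cond, &w->mutex,
+                                       &deadline) == ETIMEDOUT)
+            {
+                checked = true;
+                if (toy_check_deadlock())
+                {
+                    pthread_mutex_unlock(&w->mutex);
+                    return false;   /* our lock request gets cancelled */
+                }
+            }
+        }
+        pthread_mutex_unlock(&w->mutex);
+        return true;
+    }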
+
+Lock acquisition (routines LockAcquire and ProcSleep) follows these rules:
+
+1. A lock request is granted immediately if it does not conflict with
+any existing or waiting lock request, or if the process already holds an
+instance of the same lock type (eg, there's no penalty to acquire a read
+lock twice). Note that a process never conflicts with itself, eg one
+can obtain a read lock when one already holds an exclusive lock.
+
+2. Otherwise the process joins the lock's wait queue. Normally it will
+be added to the end of the queue, but there is an exception: if the
+process already holds locks on this same lockable object that conflict
+with the request of any pending waiter, then the process will be
+inserted in the wait queue just ahead of the first such waiter. (If we
+did not make this check, the deadlock detection code would adjust the
+queue order to resolve the conflict, but it's relatively cheap to make
+the check in ProcSleep and avoid a deadlock timeout delay in this case.)
+Note the special case when inserting before the end of the queue: if the
+process's request does not conflict with any existing lock nor any
+waiting request before its insertion point, then go ahead and grant the
+lock without waiting.
+
+When a lock is released, the lock release routine (ProcLockWakeup) scans
+the lock object's wait queue. Each waiter is awoken if (a) its request
+does not conflict with already-granted locks, and (b) its request does
+not conflict with the requests of prior un-wakable waiters. Rule (b)
+ensures that conflicting requests are granted in order of arrival. There
+are cases where a later waiter must be allowed to go in front of
+conflicting earlier waiters to avoid deadlock, but it is not
+ProcLockWakeup's responsibility to recognize these cases; instead, the
+deadlock detection code will re-order the wait queue when necessary.
+
+To perform deadlock checking, we use the standard method of viewing the
+various processes as nodes in a directed graph (the waits-for graph or
+WFG). There is a graph edge leading from process A to process B if A
+waits for B, ie, A is waiting for some lock and B holds a conflicting
+lock. There is a deadlock condition if and only if the WFG contains a
+cycle. We detect cycles by searching outward along waits-for edges to
+see if we return to our starting point. There are three possible
+outcomes:
+
+1. All outgoing paths terminate at a running process (which has no
+outgoing edge).
+
+2. A deadlock is detected by looping back to the start point. We
+resolve such a deadlock by canceling the start point's lock request and
+reporting an error in that transaction, which normally leads to
+transaction abort and release of that transaction's held locks. Note
+that it's sufficient to cancel one request to remove the cycle; we don't
+need to kill all the transactions involved.
+
+3. Some path(s) loop back to a node other than the start point. This
+indicates a deadlock, but one that does not involve our starting
+process. We ignore this condition on the grounds that resolving such a
+deadlock is the responsibility of the processes involved --- killing our
+start-point process would not resolve the deadlock. So, cases 1 and 3
+both report "no deadlock".
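+
+A bare-bones version of this search (ignoring all the soft-edge machinery
+described below, and using an invented adjacency-matrix representation of
+the waits-for graph) might look like this:
+
+    /* Does some cycle in the waits-for graph pass through "start"? */
+    #include <stdbool.h>
+
+    #define TOY_MAX_PROCS 64
+
+    typedef struct ToyWfg
+    {
+        int  nprocs;
+        /* waits_for[i][j]: process i waits for process j */
+        bool waits_for[TOY_MAX_PROCS][TOY_MAX_PROCS];
+    } ToyWfg;
+
+    static bool
+    toy_search(const ToyWfg *g, int start, int cur, bool *visited)
+    {
+        for (int next = 0; next < g->nprocs; next++)
+        {
+            if (!g->waits_for[cur][next])
+                continue;
+            if (next == start)
+                return true;        /* looped back to the start: deadlock */
+            if (!visited[next])
+            {
+                visited[next] = true;
+                if (toy_search(g, start, next, visited))
+                    return true;
+            }
+            /* cycles not involving the start node are deliberately ignored */
+        }
+        return false;
+    }
+
+    bool
+    toy_find_lock_cycle(const ToyWfg *g, int start)
+    {
+        bool visited[TOY_MAX_PROCS] = {false};
+
+        visited[start] = true;
+        return toy_search(g, start, start, visited);
+    }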
+
+Postgres' situation is a little more complex than the standard discussion
+of deadlock detection, for two reasons:
+
+1. A process can be waiting for more than one other process, since there
+might be multiple PROCLOCKs of (non-conflicting) lock types that all
+conflict with the waiter's request. This creates no real difficulty
+however; we simply need to be prepared to trace more than one outgoing
+edge.
+
+2. If a process A is behind a process B in some lock's wait queue, and
+their requested locks conflict, then we must say that A waits for B, since
+ProcLockWakeup will never awaken A before B. This creates additional
+edges in the WFG. We call these "soft" edges, as opposed to the "hard"
+edges induced by locks already held. Note that if B already holds any
+locks conflicting with A's request, then their relationship is a hard edge
+not a soft edge.
+
+A "soft" block, or wait-priority block, has the same potential for
+inducing deadlock as a hard block. However, we may be able to resolve
+a soft block without aborting the transactions involved: we can instead
+rearrange the order of the wait queue. This rearrangement reverses the
+direction of the soft edge between two processes with conflicting requests
+whose queue order is reversed. If we can find a rearrangement that
+eliminates a cycle without creating new ones, then we can avoid an abort.
+Checking for such possible rearrangements is the trickiest part of the
+algorithm.
+
+The workhorse of the deadlock detector is a routine FindLockCycle() which
+is given a starting point process (which must be a waiting process).
+It recursively scans outward across waits-for edges as discussed above.
+If it finds no cycle involving the start point, it returns "false".
+(As discussed above, we can ignore cycles not involving the start point.)
+When such a cycle is found, FindLockCycle() returns "true", and as it
+unwinds it also builds a list of any "soft" edges involved in the cycle.
+If the resulting list is empty then there is a hard deadlock and the
+configuration cannot succeed. However, if the list is not empty, then
+reversing any one of the listed edges through wait-queue rearrangement
+will eliminate that cycle. Since such a reversal might create cycles
+elsewhere, we may need to try every possibility. Therefore, we need to
+be able to invoke FindLockCycle() on hypothetical configurations (wait
+orders) as well as the current real order.
+
+The easiest way to handle this seems to be to have a lookaside table that
+shows the proposed new queue order for each wait queue that we are
+considering rearranging. This table is checked by FindLockCycle, and it
+believes the proposed queue order rather than the real order for each lock
+that has an entry in the lookaside table.
+
+We build a proposed new queue order by doing a "topological sort" of the
+existing entries. Each soft edge that we are currently considering
+reversing creates a constraint on the partial order that the topological sort
+has to enforce. We must use a sort method that preserves the input
+ordering as much as possible, so as not to gratuitously break arrival
+order for processes not involved in a deadlock. (This is not true of the
+tsort method shown in Knuth, for example, but it's easily done by a simple
+doubly-nested-loop method that emits the first legal candidate at each
+step. Fortunately, we don't need a highly efficient sort algorithm, since
+the number of partial order constraints is not likely to be large.) Note
+that failure of the topological sort tells us we have conflicting ordering
+constraints, and therefore that the last-added soft edge reversal
+conflicts with a prior edge reversal. We need to detect this case to
+avoid an infinite loop in the case where no possible rearrangement will
+work: otherwise, we might try a reversal, find that it still leads to
+a cycle, then try to un-reverse the reversal while trying to get rid of
+that cycle, etc etc. Topological sort failure tells us the un-reversal
+is not a legitimate move in this context.
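+
+In sketch form, the "emit the first legal candidate" sort described above
+might be written like this (the representation is invented;
+constraints[i][j] means "element i must come before element j", indexed by
+the existing queue position):
+
+    #include <stdbool.h>
+
+    #define TOY_MAX_QUEUE 32
+
+    /* Returns false if the constraints are contradictory (form a cycle). */
+    bool
+    toy_topo_sort(int nitems,
+                  bool constraints[TOY_MAX_QUEUE][TOY_MAX_QUEUE],
+                  int output[TOY_MAX_QUEUE])
+    {
+        bool emitted[TOY_MAX_QUEUE] = {false};
+
+        for (int k = 0; k < nitems; k++)
+        {
+            int chosen = -1;
+
+            /* scan in original queue order so arrival order is preserved
+             * whenever the constraints permit it */
+            for (int i = 0; i < nitems && chosen < 0; i++)
+            {
+                if (emitted[i])
+                    continue;
+                chosen = i;
+                /* i is legal only if no remaining j must precede it */
+                for (int j = 0; j < nitems; j++)
+                {
+                    if (!emitted[j] && j != i && constraints[j][i])
+                    {
+                        chosen = -1;
+                        break;
+                    }
+                }
+            }
+            if (chosen < 0)
+                return false;   /* conflicting ordering constraints */
+            emitted[chosen] = true;
+            output[k] = chosen;
+        }
+        return true;
+    }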
+
+So, the basic step in our rearrangement method is to take a list of
+soft edges in a cycle (as returned by FindLockCycle()) and successively
+try the reversal of each one as a topological-sort constraint added to
+whatever constraints we are already considering. We recursively search
+through all such sets of constraints to see if any one eliminates all
+the deadlock cycles at once. Although this might seem impossibly
+inefficient, it shouldn't be a big problem in practice, because there
+will normally be very few, and not very large, deadlock cycles --- if
+any at all. So the combinatorial inefficiency isn't going to hurt us.
+Besides, it's better to spend some time to guarantee that we've checked
+all possible escape routes than to abort a transaction when we didn't
+really have to.
+
+Each edge reversal constraint can be viewed as requesting that the waiting
+process A be moved to before the blocking process B in the wait queue they
+are both in. This action will reverse the desired soft edge, as well as
+any other soft edges between A and other processes it is advanced over.
+No other edges will be affected (note this is actually a constraint on our
+topological sort method to not re-order the queue more than necessary.)
+Therefore, we can be sure we have not created any new deadlock cycles if
+neither FindLockCycle(A) nor FindLockCycle(B) discovers any cycle. Given
+the above-defined behavior of FindLockCycle, each of these searches is
+necessary as well as sufficient, since FindLockCycle starting at the
+original start point will not complain about cycles that include A or B
+but not the original start point.
+
+In short then, a proposed rearrangement of the wait queue(s) is determined
+by one or more broken soft edges A->B, fully specified by the output of
+topological sorts of each wait queue involved, and then tested by invoking
+FindLockCycle() starting at the original start point as well as each of
+the mentioned processes (A's and B's). If none of the tests detect a
+cycle, then we have a valid configuration and can implement it by
+reordering the wait queues per the sort outputs (and then applying
+ProcLockWakeup on each reordered queue, in case a waiter has become wakable).
+If any test detects a soft cycle, we can try to resolve it by adding each
+soft link in that cycle, in turn, to the proposed rearrangement list.
+This is repeated recursively until we either find a workable rearrangement
+or determine that none exists. In the latter case, the outer level
+resolves the deadlock by aborting the original start-point transaction.
+
+The particular order in which rearrangements are tried depends on the
+order FindLockCycle() happens to scan in, so if there are multiple
+workable rearrangements of the wait queues, then it is unspecified which
+one will be chosen. What's more important is that we guarantee to try
+every queue rearrangement that could lead to success. (For example,
+if we have A before B before C and the needed order constraints are
+C before A and B before C, we would first discover that A before C
+doesn't work and try the rearrangement C before A before B. This would
+eventually lead to the discovery of the additional constraint B before C.)
+
+Got that?
+
+Miscellaneous Notes
+-------------------
+
+1. It is easily proven that no deadlock will be missed due to our
+asynchronous invocation of deadlock checking. A deadlock cycle in the WFG
+is formed when the last edge in the cycle is added; therefore the last
+process in the cycle to wait (the one from which that edge is outgoing) is
+certain to detect and resolve the cycle when it later runs CheckDeadLock.
+This holds even if that edge addition created multiple cycles; the process
+may indeed abort without ever noticing those additional cycles, but we
+don't particularly care. The only other possible creation of deadlocks is
+during deadlock resolution's rearrangement of wait queues, and we already
+saw that that algorithm will prove that it creates no new deadlocks before
+it attempts to actually execute any rearrangement.
+
+2. It is not certain that a deadlock will be resolved by aborting the
+last-to-wait process. If earlier waiters in the cycle have not yet run
+CheckDeadLock, then the first one to do so will be the victim.
+
+3. No live (wakable) process can be missed by ProcLockWakeup, since it
+examines every member of the wait queue (this was not true in the 7.0
+implementation, BTW). Therefore, if ProcLockWakeup is always invoked
+after a lock is released or a wait queue is rearranged, there can be no
+failure to wake a wakable process. One should also note that
+LockErrorCleanup (abort a waiter due to outside factors) must run
+ProcLockWakeup, in case the canceled waiter was soft-blocking other
+waiters.
+
+4. We can minimize excess rearrangement-trial work by being careful to
+scan the wait queue from the front when looking for soft edges. For
+example, if we have queue order A,B,C and C has deadlock conflicts with
+both A and B, we want to generate the "C before A" constraint first,
+rather than wasting time with "C before B", which won't move C far
+enough up. So we look for soft edges outgoing from C starting at the
+front of the wait queue.
+
+5. The working data structures needed by the deadlock detection code can
+be limited to numbers of entries computed from MaxBackends. Therefore,
+we can allocate the worst-case space needed during backend startup. This
+seems a safer approach than trying to allocate workspace on the fly; we
+don't want to risk having the deadlock detector run out of memory, else
+we really have no guarantees at all that deadlock will be detected.
+
+6. We abuse the deadlock detector to implement autovacuum cancellation.
+When we run the detector and we find that there's an autovacuum worker
+involved in the waits-for graph, we store a pointer to its PGPROC, and
+return a special return code (unless a hard deadlock has been detected).
+The caller can then send a cancellation signal. This implements the
+principle that autovacuum has a low locking priority (eg it must not block
+DDL on the table).
+
+Group Locking
+-------------
+
+As if all of that weren't already complicated enough, PostgreSQL now supports
+parallelism (see src/backend/access/transam/README.parallel), which means that
+we might need to resolve deadlocks that occur between gangs of related
+processes rather than individual processes. This doesn't change the basic
+deadlock detection algorithm very much, but it makes the bookkeeping more
+complicated.
+
+We choose to regard locks held by processes in the same parallel group as
+non-conflicting with the exception of relation extension locks. This means that
+two processes in a parallel group can hold a self-exclusive lock on the same
+relation at the same time, or one process can acquire an AccessShareLock while
+the other already holds AccessExclusiveLock. This might seem dangerous and
+could be in some cases (more on that below), but if we didn't do this then
+parallel query would be extremely prone to self-deadlock. For example, a
+parallel query against a relation on which the leader already had
+AccessExclusiveLock would hang, because the workers would try to lock the same
+relation and be blocked by the leader; yet the leader can't finish until it
+receives completion indications from all workers. An undetected deadlock
+results. This is far from the only scenario where such a problem happens. The
+same thing will occur if the leader holds only AccessShareLock, the worker
+seeks AccessShareLock, but between the time the leader attempts to acquire the
+lock and the time the worker attempts to acquire it, some other process queues
+up waiting for an AccessExclusiveLock. In this case, too, an indefinite hang
+results.
+
+It might seem that we could predict which locks the workers will attempt to
+acquire and ensure before going parallel that those locks would be acquired
+successfully. But this is very difficult to make work in a general way. For
+example, a parallel worker's portion of the query plan could involve an
+SQL-callable function which generates a query dynamically, and that query
+might happen to hit a table on which the leader happens to hold
+AccessExclusiveLock. By imposing enough restrictions on what workers can do,
+we could eventually create a situation where their behavior can be adequately
+restricted, but these restrictions would be fairly onerous, and even then, the
+system required to decide whether the workers will succeed at acquiring the
+necessary locks would be complex and possibly buggy.
+
+So, instead, we take the approach of deciding that locks within a lock group
+do not conflict. This eliminates the possibility of an undetected deadlock,
+but also opens up some problem cases: if the leader and worker try to do some
+operation at the same time which would ordinarily be prevented by the
+heavyweight lock mechanism, undefined behavior might result. In practice, the
+dangers are modest. The leader and worker share the same transaction,
+snapshot, and combo CID hash, and neither can perform any DDL or, indeed,
+write any data at all. Thus, for either to read a table locked exclusively by
+the other is safe enough. Problems would occur if the leader initiated
+parallelism from a point in the code at which it had some backend-private
+state that made table access from another process unsafe; for example, if
+parallelism were initiated after calling SetReindexProcessing and before
+calling ResetReindexProcessing, catastrophe could ensue, because the worker
+won't have that state. Similarly,
+problems could occur with certain kinds of non-relation locks, such as
+GIN page locks. It's no safer for two related processes to perform GIN
+cleanup at the same time than for unrelated processes to do the same.
+However, since parallel mode is strictly read-only at present, neither this
+nor most of the similar cases can arise at present. To allow parallel writes,
+we'll either need to (1) further enhance the deadlock detector to handle those
+types of locks in a different way than other types; or (2) have parallel
+workers use some other mutual exclusion method for such cases.
+
+Group locking adds three new members to each PGPROC: lockGroupLeader,
+lockGroupMembers, and lockGroupLink. A PGPROC's lockGroupLeader is NULL for
+processes not involved in parallel query. When a process wants to cooperate
+with parallel workers, it becomes a lock group leader, which means setting
+this field to point to its own PGPROC. When a parallel worker starts up, it
+points this field at the leader. The lockGroupMembers field is only used in
+the leader; it is a list of the member PGPROCs of the lock group (the leader
+and all workers). The lockGroupLink field is the list link for this list.
+
+All three of these fields are considered to be protected by a lock manager
+partition lock. The partition lock that protects these fields within a given
+lock group is chosen by taking the leader's pgprocno modulo the number of lock
+manager partitions. This unusual arrangement has a major advantage: the
+deadlock detector can count on the fact that no lockGroupLeader field can
+change while the deadlock detector is running, because it knows that it holds
+all the lock manager locks. Also, holding this single lock allows safe
+manipulation of the lockGroupMembers list for the lock group.
+
+We need an additional interlock when setting these fields, because a newly
+started parallel worker has to try to join the leader's lock group, but it
+has no guarantee that the group leader is still alive by the time it gets
+started. We try to ensure that the parallel leader dies after all workers
+in normal cases, but also that the system could survive relatively intact
+if that somehow fails to happen. This is one of the precautions against
+such a scenario: the leader relays its PGPROC and also its PID to the
+worker, and the worker fails to join the lock group unless the given PGPROC
+still has the same PID and is still a lock group leader. We assume that
+PIDs are not recycled quickly enough for this interlock to fail.
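+
+A hedged sketch of these last two rules (the struct layout and helper
+names are invented, and per-partition pthread mutexes stand in for the
+lock manager partition LWLocks):
+
+    #include <pthread.h>
+    #include <stdbool.h>
+    #include <sys/types.h>
+
+    #define TOY_NUM_LOCK_PARTITIONS 16
+
+    /* assume these are set up with pthread_mutex_init() at startup */
+    static pthread_mutex_t toy_partition_lock[TOY_NUM_LOCK_PARTITIONS];
+
+    typedef struct ToyProc
+    {
+        int             pgprocno;
+        pid_t           pid;
+        struct ToyProc *lockGroupLeader;  /* NULL, self (leader), or leader */
+    } ToyProc;
+
+    /* The partition lock protecting a group's membership fields. */
+    static int
+    toy_group_partition(const ToyProc *leader)
+    {
+        return leader->pgprocno % TOY_NUM_LOCK_PARTITIONS;
+    }
+
+    /*
+     * Worker-side join: succeed only if the remembered PGPROC still has
+     * the PID the leader advertised and is still a lock group leader.
+     */
+    bool
+    toy_join_lock_group(ToyProc *worker, ToyProc *leader, pid_t expected_pid)
+    {
+        bool joined = false;
+        pthread_mutex_t *lk =
+            &toy_partition_lock[toy_group_partition(leader)];
+
+        pthread_mutex_lock(lk);
+        if (leader->pid == expected_pid &&
+            leader->lockGroupLeader == leader)
+        {
+            worker->lockGroupLeader = leader;
+            /* a real implementation would also link into the member list */
+            joined = true;
+        }
+        pthread_mutex_unlock(lk);
+        return joined;
+    }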
+
+
+User Locks (Advisory Locks)
+---------------------------
+
+User locks are handled totally on the application side as long-term
+cooperative locks which may extend beyond the normal transaction boundaries.
+Their purpose is to indicate to an application that someone is `working'
+on an item. So it is possible to put a user lock on a tuple's oid,
+retrieve the tuple, work on it for an hour, and then update it and remove
+the lock. While the lock is active, other clients can still read and write
+the tuple, but they can be aware that it has been locked at the application
+level by someone.
+
+User locks and normal locks are completely orthogonal and they don't
+interfere with each other.
+
+User locks can be acquired either at session level or transaction level.
+A session-level lock request is not automatically released at transaction
+end, but must be explicitly released by the application. (However, any
+remaining locks are always released at session end.) Transaction-level
+user lock requests behave the same as normal lock requests, in that they
+are released at transaction end and do not need explicit unlocking.
+
+Locking during Hot Standby
+--------------------------
+
+The Startup process is the only backend that can make changes during
+recovery; all other backends are read-only. As a result, the Startup
+process does not acquire locks on relations or objects except when the lock
+level is AccessExclusiveLock.
+
+Regular backends are only allowed to take locks on relations or objects
+at RowExclusiveLock or lower. This ensures that they do not conflict with
+each other or with the Startup process, unless AccessExclusiveLocks are
+requested by the Startup process.
+
+Deadlocks involving AccessExclusiveLocks are not possible, so we need
+not be concerned that a user-initiated deadlock can prevent recovery from
+progressing.
+
+AccessExclusiveLocks on the primary node generate WAL records
+that are then applied by the Startup process. Locks are released at end
+of transaction just as they are in normal processing. These locks are
+held by the Startup process, acting as a proxy for the backends that
+originally acquired these locks. Again, these locks cannot conflict with
+one another, so the Startup process cannot deadlock itself either.
+
+Although deadlock is not possible, a regular backend's weak lock can
+prevent the Startup process from making progress in applying WAL, which is
+usually not something that should be tolerated for very long. Mechanisms
+exist to forcibly cancel a regular backend's query if it blocks the
+Startup process for too long.
diff --git a/src/backend/storage/lmgr/README-SSI b/src/backend/storage/lmgr/README-SSI
new file mode 100644
index 0000000..50d2ecc
--- /dev/null
+++ b/src/backend/storage/lmgr/README-SSI
@@ -0,0 +1,646 @@
+src/backend/storage/lmgr/README-SSI
+
+Serializable Snapshot Isolation (SSI) and Predicate Locking
+===========================================================
+
+This code is in the lmgr directory because about 90% of it is an
+implementation of predicate locking, which is required for SSI,
+rather than being directly related to SSI itself. When another use
+for predicate locking justifies the effort to tease these two things
+apart, this README file should probably be split.
+
+
+Credits
+-------
+
+This feature was developed by Kevin Grittner and Dan R. K. Ports,
+with review and suggestions from Joe Conway, Heikki Linnakangas, and
+Jeff Davis. It is based on work published in these papers:
+
+ Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ Serializable isolation for snapshot databases.
+ In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ international conference on Management of data,
+ pages 729-738, New York, NY, USA. ACM.
+ http://doi.acm.org/10.1145/1376616.1376690
+
+ Michael James Cahill. 2009.
+ Serializable Isolation for Snapshot Databases.
+ Sydney Digital Theses.
+ University of Sydney, School of Information Technologies.
+ http://hdl.handle.net/2123/5353
+
+
+Overview
+--------
+
+With true serializable transactions, if you can show that your
+transaction will do the right thing if there are no concurrent
+transactions, it will do the right thing in any mix of serializable
+transactions or be rolled back with a serialization failure. This
+feature has been implemented in PostgreSQL using SSI.
+
+
+Serializable and Snapshot Transaction Isolation Levels
+------------------------------------------------------
+
+Serializable transaction isolation is attractive for shops with
+active development by many programmers against a complex schema
+because it guarantees data integrity with very little staff time --
+if a transaction can be shown to always do the right thing when it is
+run alone (before or after any other transaction), it will always do
+the right thing in any mix of concurrent serializable transactions.
+Where conflicts with other transactions would result in an
+inconsistent state within the database or an inconsistent view of
+the data, a serializable transaction will block or roll back to
+prevent the anomaly. The SQL standard provides a specific SQLSTATE
+for errors generated when a transaction rolls back for this reason,
+so that transactions can be retried automatically.
+
+Before version 9.1, PostgreSQL did not support a full serializable
+isolation level. A request for serializable transaction isolation
+actually provided snapshot isolation. This has well-known anomalies
+which can allow data corruption or inconsistent views of the data
+during concurrent transactions, although these anomalies only occur
+when certain patterns of read-write dependencies exist within a set
+of concurrent transactions. Where these patterns exist, the anomalies
+can be prevented by introducing conflicts through explicitly
+programmed locks or otherwise unnecessary writes to the database.
+Snapshot isolation is popular because performance is better than
+serializable isolation and the integrity guarantees which it does
+provide allow anomalies to be avoided or managed with reasonable
+effort in many environments.
+
+
+Serializable Isolation Implementation Strategies
+------------------------------------------------
+
+Techniques for implementing full serializable isolation have been
+published and in use in many database products for decades. The
+primary technique which has been used is Strict Two-Phase Locking
+(S2PL), which operates by blocking writes against data which has been
+read by concurrent transactions and blocking any access (read or
+write) against data which has been written by concurrent
+transactions. A cycle in a graph of blocking indicates a deadlock,
+requiring a rollback. Blocking and deadlocks under S2PL in high
+contention workloads can be debilitating, crippling throughput and
+response time.
+
+A new technique for implementing full serializable isolation in an
+MVCC database appears in the literature beginning in 2008. This
+technique, known as Serializable Snapshot Isolation (SSI), has many of
+the advantages of snapshot isolation. In particular, reads don't
+block anything and writes don't block reads. Essentially, it runs
+snapshot isolation but monitors the read-write conflicts between
+transactions to identify dangerous structures in the transaction
+graph which indicate that a set of concurrent transactions might
+produce an anomaly, and rolls back transactions to ensure that no
+anomalies occur. It will produce some false positives (where a
+transaction is rolled back even though there would not have been an
+anomaly), but will never let an anomaly occur. In the two known
+prototype implementations, performance for many workloads (even with
+the need to restart transactions which are rolled back) is very close
+to snapshot isolation and generally far better than an S2PL
+implementation.
+
+
+Apparent Serial Order of Execution
+----------------------------------
+
+One way to understand when snapshot anomalies can occur, and to
+visualize the difference between the serializable implementations
+described above, is to consider that among transactions executing at
+the serializable transaction isolation level, the results are
+required to be consistent with some serial (one-at-a-time) execution
+of the transactions [1]. How is that order determined in each?
+
+In S2PL, each transaction locks any data it accesses. It holds the
+locks until committing, preventing other transactions from making
+conflicting accesses to the same data in the interim. Some
+transactions may have to be rolled back to prevent deadlock. But
+successful transactions can always be viewed as having occurred
+sequentially, in the order they committed.
+
+With snapshot isolation, reads never block writes, nor vice versa, so
+more concurrency is possible. The order in which transactions appear
+to have executed is determined by something more subtle than in S2PL:
+read/write dependencies. If a transaction reads data, it appears to
+execute after the transaction that wrote the data it is reading.
+Similarly, if it updates data, it appears to execute after the
+transaction that wrote the previous version. These dependencies, which
+we call "wr-dependencies" and "ww-dependencies", are consistent with
+the commit order, because the first transaction must have committed
+before the second starts. However, there can also be dependencies
+between two *concurrent* transactions, i.e. where one was running when
+the other acquired its snapshot. These "rw-conflicts" occur when one
+transaction attempts to read data which is not visible to it because
+the transaction which wrote it (or will later write it) is
+concurrent. The reading transaction appears to have executed first,
+regardless of the actual sequence of transaction starts or commits,
+because it sees a database state prior to that in which the other
+transaction leaves it.
+
+Anomalies occur when a cycle is created in the graph of dependencies:
+when a dependency or series of dependencies causes transaction A to
+appear to have executed before transaction B, but another series of
+dependencies causes B to appear before A. If that's the case, then
+the results can't be consistent with any serial execution of the
+transactions.
+
+
+SSI Algorithm
+-------------
+
+As of 9.1, serializable transactions in PostgreSQL are implemented using
+Serializable Snapshot Isolation (SSI), based on the work of Cahill
+et al. Fundamentally, this allows snapshot isolation to run as it
+previously did, while monitoring for conditions which could create a
+serialization anomaly.
+
+SSI is based on the observation [2] that each snapshot isolation
+anomaly corresponds to a cycle that contains a "dangerous structure"
+of two adjacent rw-conflict edges:
+
+ Tin ------> Tpivot ------> Tout
+ rw rw
+
+SSI works by watching for this dangerous structure, and rolling
+back a transaction when needed to prevent any anomaly. This means it
+only needs to track rw-conflicts between concurrent transactions, not
+wr- and ww-dependencies. It also means there is a risk of false
+positives, because not every dangerous structure is embedded in an
+actual cycle. The number of false positives is low in practice, so
+this represents an acceptable tradeoff for keeping the detection
+overhead low.
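+
+In sketch form, the structural test is just "does this transaction have
+both an incoming and an outgoing rw-conflict?" (the struct and names below
+are invented, and the commit-ordering refinements listed next are
+deliberately left out):
+
+    #include <stdbool.h>
+
+    typedef struct ToySerializableXact
+    {
+        /* some concurrent transaction read data this transaction wrote */
+        bool has_rw_conflict_in;
+        /* this transaction read data some concurrent transaction wrote */
+        bool has_rw_conflict_out;
+    } ToySerializableXact;
+
+    /*
+     * A transaction in the Tpivot position of the diagram above; when one
+     * is found, some member of the structure becomes a rollback candidate.
+     */
+    bool
+    toy_is_potential_pivot(const ToySerializableXact *xact)
+    {
+        return xact->has_rw_conflict_in && xact->has_rw_conflict_out;
+    }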
+
+The PostgreSQL implementation uses two additional optimizations:
+
+* Tout must commit before any other transaction in the cycle
+ (see proof of Theorem 2.1 of [2]). We only roll back a transaction
+ if Tout commits before Tpivot and Tin.
+
+* if Tin is read-only, there can only be an anomaly if Tout committed
+ before Tin takes its snapshot. This optimization is an original
+ one. Proof:
+
+ - Because there is a cycle, there must be some transaction T0 that
+ precedes Tin in the cycle. (T0 might be the same as Tout.)
+
+ - The edge between T0 and Tin can't be a rw-conflict or ww-dependency,
+ because Tin was read-only, so it must be a wr-dependency.
+ Those can only occur if T0 committed before Tin took its snapshot,
+ else Tin would have ignored T0's output.
+
+ - Because Tout must commit before any other transaction in the
+ cycle, it must commit before T0 commits -- and thus before Tin
+ starts.
+
+
+PostgreSQL Implementation
+-------------------------
+
+ * Since this technique is based on Snapshot Isolation (SI), those
+areas in PostgreSQL which don't use SI can't be brought under SSI.
+This includes system tables, temporary tables, sequences, hint bit
+rewrites, etc. SSI cannot eliminate existing anomalies in these
+areas.
+
+ * Any transaction which is run at a transaction isolation level
+other than SERIALIZABLE will not be affected by SSI. If you want to
+enforce business rules through SSI, all transactions should be run at
+the SERIALIZABLE transaction isolation level, and that should
+probably be set as the default.
+
+ * If all transactions are run at the SERIALIZABLE transaction
+isolation level, business rules can be enforced in triggers or
+application code without ever having a need to acquire an explicit
+lock or to use SELECT FOR SHARE or SELECT FOR UPDATE.
+
+ * Those who want to continue to use snapshot isolation without
+the additional protections of SSI (and the associated costs of
+enforcing those protections), can use the REPEATABLE READ transaction
+isolation level. This level retains its legacy behavior, which
+is identical to the old SERIALIZABLE implementation and fully
+consistent with the standard's requirements for the REPEATABLE READ
+transaction isolation level.
+
+ * Performance under this SSI implementation will be significantly
+improved if transactions which don't modify permanent tables are
+declared to be READ ONLY before they begin reading data.
+
+ * Performance under SSI will tend to degrade more rapidly with a
+large number of active database transactions than under less strict
+isolation levels. Limiting the number of active transactions through
+use of a connection pool or similar techniques may be necessary to
+maintain good performance.
+
+ * Any transaction which must be rolled back to prevent
+serialization anomalies will fail with SQLSTATE 40001, which has a
+standard meaning of "serialization failure".
+
+ * This SSI implementation makes an effort to choose the
+transaction to be canceled such that an immediate retry of the
+transaction will not fail due to conflicts with exactly the same
+transactions. Pursuant to this goal, no transaction is canceled
+until one of the other transactions in the set of conflicts which
+could generate an anomaly has successfully committed. This is
+conceptually similar to how write conflicts are handled. To fully
+implement this guarantee there needs to be a way to roll back the
+active transaction for another process with a serialization failure
+SQLSTATE, even if it is "idle in transaction".
+
+
+Predicate Locking
+-----------------
+
+Both S2PL and SSI require some form of predicate locking to handle
+situations where reads conflict with later inserts or with later
+updates which move data into the selected range. PostgreSQL didn't
+already have predicate locking, so it needed to be added to support
+full serializable transactions under either strategy. Practical
+implementations of predicate locking generally involve acquiring
+locks against data as it is accessed, using multiple granularities
+(tuple, page, table, etc.) with escalation as needed to keep the lock
+count to a number which can be tracked within RAM structures. This
+approach was used in PostgreSQL. Coarse granularities can cause some
+false positive indications of conflict. The number of false positives
+can be influenced by plan choice.
+
+
+Implementation overview
+-----------------------
+
+New RAM structures, inspired by those used to track traditional locks
+in PostgreSQL, but tailored to the needs of SIREAD predicate locking,
+are used. These refer to physical objects actually accessed in the
+course of executing the query, to model the predicates through
+inference. Anyone interested in this subject should review the
+Hellerstein, Stonebraker and Hamilton paper [3], along with the
+locking papers referenced from that and the Cahill papers.
+
+Because the SIREAD locks don't block, traditional locking techniques
+have to be modified. Intent locking (locking higher level objects
+before locking lower level objects) doesn't work with non-blocking
+"locks" (which are, in some respects, more like flags than locks).
+
+A configurable amount of shared memory is reserved at postmaster
+start-up to track predicate locks. This size cannot be changed
+without a restart.
+
+To prevent resource exhaustion, multiple fine-grained locks may
+be promoted to a single coarser-grained lock as needed.
+
+An attempt to acquire an SIREAD lock on a tuple when the same
+transaction already holds an SIREAD lock on the page or the relation
+will be ignored. Likewise, an attempt to lock a page when the
+relation is locked will be ignored, and the acquisition of a coarser
+lock will result in the automatic release of all finer-grained locks
+it covers.
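+
+A hedged sketch of the promotion bookkeeping just described (the
+threshold, names, and layout are all invented; the real limits are
+configurable):
+
+    #include <stdbool.h>
+
+    #define TOY_PROMOTION_THRESHOLD 100   /* invented value */
+
+    typedef struct ToyCoarseLockEntry
+    {
+        bool held;        /* do we already hold the coarser-grained lock? */
+        int  children;    /* finer-grained locks it would cover */
+    } ToyCoarseLockEntry;
+
+    /*
+     * Called when a new finer-grained lock is recorded under this coarser
+     * object.  Returns true when the caller should promote: acquire the
+     * coarse lock and release the finer-grained locks it covers.
+     */
+    bool
+    toy_note_child_lock(ToyCoarseLockEntry *coarse)
+    {
+        if (coarse->held)
+            return false;     /* already covered; the child is redundant */
+        coarse->children++;
+        return coarse->children > TOY_PROMOTION_THRESHOLD;
+    }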
+
+
+Heap locking
+------------
+
+Predicate locks will be acquired for the heap based on the following:
+
+ * For a table scan, the entire relation will be locked.
+
+ * Each tuple read which is visible to the reading transaction
+will be locked, whether or not it meets selection criteria; except
+that there is no need to acquire an SIREAD lock on a tuple when the
+transaction already holds a write lock on any tuple representing the
+row, since a rw-conflict would also create a ww-dependency which
+has more aggressive enforcement and thus will prevent any anomaly.
+
+ * Modifying a heap tuple creates a rw-conflict with any transaction
+that holds a SIREAD lock on that tuple, or on the page or relation
+that contains it.
+
+ * Inserting a new tuple creates a rw-conflict with any transaction
+holding a SIREAD lock on the entire relation. It doesn't conflict with
+page-level locks, because page-level locks are only used to aggregate
+tuple locks. Unlike index page locks, they don't lock "gaps" on the page.
+
+
+Index AM implementations
+------------------------
+
+Since predicate locks only exist to detect writes which conflict with
+earlier reads, and heap tuple locks are acquired to cover all heap
+tuples actually read, including those read through indexes, the index
+tuples which were actually scanned are not of interest in themselves;
+we only care about their "new neighbors" -- later inserts into the
+index which would have been included in the scan had they existed at
+the time. Conceptually, we want to lock the gaps between and
+surrounding index entries within the scanned range.
+
+Correctness requires that any insert into an index generates a
+rw-conflict with a concurrent serializable transaction if, after that
+insert, re-execution of any index scan of the other transaction would
+access the heap for a row not accessed during the previous execution.
+Note that a non-HOT update which expires an old index entry covered
+by the scan and adds a new entry for the modified row's new tuple
+need not generate a conflict, although an update which "moves" a row
+into the scan must generate a conflict. While correctness allows
+false positives, they should be minimized for performance reasons.
+
+Several optimizations are possible, though not all are implemented yet:
+
+ * An index scan which is just finding the right position for an
+index insertion or deletion need not acquire a predicate lock.
+
+ * An index scan which is comparing for equality on the entire key
+for a unique index need not acquire a predicate lock as long as a key
+is found corresponding to a visible tuple which has not been modified
+by another transaction -- there are no "between or around" gaps to
+cover.
+
+ * As long as built-in foreign key enforcement continues to use
+its current "special tricks" to deal with MVCC issues, predicate
+locks should not be needed for scans done by enforcement code.
+
+ * If a search determines that no rows can be found regardless of
+index contents because the search conditions are contradictory (e.g.,
+x = 1 AND x = 2), then no predicate lock is needed.
+
+Other index AM implementation considerations:
+
+ * For an index AM that doesn't have support for predicate locking,
+we just acquire a predicate lock on the whole index for any search.
+
+ * B-tree index searches acquire predicate locks only on the
+index *leaf* pages needed to lock the appropriate index range. If,
+however, a search discovers that no root page has yet been created, a
+predicate lock on the index relation is required.
+
+ * Like a B-tree, GIN searches acquire predicate locks only on the
+leaf pages of entry tree. When performing an equality scan, and an
+entry has a posting tree, the posting tree root is locked instead, to
+lock only that key value. However, fastupdate=on postpones the
+insertion of tuples into the index structure by temporarily storing them
+in the pending list. That makes us unable to detect rw-conflicts using
+page-level locks. To cope with that, insertions to the pending list
+conflict with all scans.
+
+ * GiST searches can determine that there are no matches at any
+level of the index, so we acquire predicate lock at each index
+level during a GiST search. An index insert at the leaf level can
+then be trusted to ripple up to all levels and locations where
+conflicting predicate locks may exist. In case there is a page split,
+we need to copy predicate lock from the original page to all the new
+pages.
+
+ * Hash index searches acquire predicate locks on the primary
+page of a bucket. Scans that happen concurrently with page splits acquire
+locks on both the old and new buckets. During a bucket
+split, a predicate lock is copied from the primary page of an old
+bucket to the primary page of a new bucket.
+
+ * The effects of page splits, overflows, consolidations, and
+removals must be carefully reviewed to ensure that predicate locks
+aren't "lost" during those operations, or kept with pages which could
+get re-used for different parts of the index.
+
+
+Innovations
+-----------
+
+The PostgreSQL implementation of Serializable Snapshot Isolation
+differs from what is described in the cited papers for several
+reasons:
+
+ 1. PostgreSQL didn't have any existing predicate locking. It had
+to be added from scratch.
+
+ 2. The existing in-memory lock structures were not suitable for
+tracking SIREAD locks.
+ * In PostgreSQL, tuple level locks are not held in RAM for
+any length of time; lock information is written to the tuples
+involved in the transactions.
+ * In PostgreSQL, existing lock structures have pointers to
+memory which is related to a session. SIREAD locks need to persist
+past the end of the originating transaction and even the session
+which ran it.
+ * PostgreSQL needs to be able to tolerate a large number of
+transactions executing while one long-running transaction stays open
+-- the in-RAM techniques discussed in the papers wouldn't support
+that.
+
+ 3. Unlike the database products used for the prototypes described
+in the papers, PostgreSQL didn't already have a true serializable
+isolation level distinct from snapshot isolation.
+
+ 4. PostgreSQL supports subtransactions -- an issue not mentioned
+in the papers.
+
+ 5. PostgreSQL doesn't assign a transaction number to a database
+transaction until and unless necessary (normally, when the transaction
+attempts to modify data).
+
+ 6. PostgreSQL has pluggable data types with user-definable
+operators, as well as pluggable index types, not all of which are
+based around data types which support ordering.
+
+ 7. Some possible optimizations became apparent during development
+and testing.
+
+Differences from the implementation described in the papers are
+listed below.
+
+ * New structures needed to be created in shared memory to track
+the proper information for serializable transactions and their SIREAD
+locks.
+
+ * Because PostgreSQL does not have the same concept of an "oldest
+transaction ID" for all serializable transactions as assumed in the
+Cahill thesis, we track the oldest snapshot xmin among serializable
+transactions, and a count of how many active transactions use that
+xmin. When the count hits zero we find the new oldest xmin and run a
+clean-up based on that.
+
+ * Because reads in a subtransaction may cause that subtransaction
+to roll back, thereby affecting what is written by the top level
+transaction, predicate locks must survive a subtransaction rollback.
+As a consequence, all xid usage in SSI, including predicate locking,
+is based on the top level xid. When looking at an xid that comes
+from a tuple's xmin or xmax, for example, we always call
+SubTransGetTopmostTransaction() before doing much else with it.
+
+ * PostgreSQL does not use "update in place" with a rollback log
+for its MVCC implementation. Where possible it uses "HOT" updates on
+the same page (if there is room and no indexed value is changed).
+For non-HOT updates the old tuple is expired in place and a new tuple
+is inserted at a new location. Because of this difference, a tuple
+lock in PostgreSQL doesn't automatically lock any other versions of a
+row. We don't try to copy or expand a tuple lock to any other
+versions of the row, based on the following proof that any additional
+serialization failures we would get from that would be false
+positives:
+
+ o If transaction T1 reads a row version (thus acquiring a
+predicate lock on it) and a second transaction T2 updates that row
+version (thus creating a rw-conflict graph edge from T1 to T2), must a
+third transaction T3 which re-updates the new version of the row also
+have a rw-conflict in from T1 to prevent anomalies? In other words,
+does it matter whether we recognize the edge T1 -> T3?
+
+ o If T1 has a conflict in, it certainly doesn't. Adding the
+edge T1 -> T3 would create a dangerous structure, but we already had
+one from the edge T1 -> T2, so we would have aborted something anyway.
+(T2 has already committed, else T3 could not have updated its output;
+and we would already have aborted either T1 or one of T1's
+predecessors, so no cycle involving T1 and T3 can survive.)
+
+ o Now let's consider the case where T1 doesn't have a
+rw-conflict in. If that's the case, for this edge T1 -> T3 to make a
+difference, T3 must have a rw-conflict out that induces a cycle in the
+dependency graph, i.e. a conflict out to some transaction preceding T1
+in the graph. (A conflict out to T1 itself would be problematic too,
+but that would mean T1 has a conflict in, the case we already
+eliminated.)
+
+ o So now we're trying to figure out if there can be an
+rw-conflict edge T3 -> T0, where T0 is some transaction that precedes
+T1. For T0 to precede T1, there has to be some edge, or sequence of
+edges, from T0 to T1. At least the last edge has to be a wr-dependency
+or ww-dependency rather than a rw-conflict, because T1 doesn't have a
+rw-conflict in. And that gives us enough information about the order
+of transactions to see that T3 can't have a rw-conflict to T0:
+ - T0 committed before T1 started (the wr/ww-dependency implies this)
+ - T1 started before T2 committed (the T1->T2 rw-conflict implies this)
+ - T2 committed before T3 started (otherwise, T3 would get aborted
+ because of an update conflict)
+
+ o That means T0 committed before T3 started, and therefore
+there can't be a rw-conflict from T3 to T0.
+
+ o So in all cases, we don't need the T1 -> T3 edge to
+recognize cycles. Therefore it's not necessary for T1's SIREAD lock
+on the original tuple version to cover later versions as well.
+
+ * Predicate locking in PostgreSQL starts at the tuple level
+when possible. Multiple fine-grained locks are promoted to a single
+coarser-granularity lock as needed to avoid resource exhaustion. The
+amount of memory used for these structures is configurable, to balance
+RAM usage against SIREAD lock granularity.
+
+ * Each backend keeps a process-local table of the locks it holds.
+To support granularity promotion decisions with low CPU and locking
+overhead, this table also includes the coarser covering locks and the
+number of finer-granularity locks they cover.  (A sketch of such an
+entry appears at the end of this list.)
+
+ * Conflicts are identified by looking for predicate locks
+when tuples are written, and by looking at the MVCC information when
+tuples are read. There is no matching between two RAM-based locks.
+
+ * Because write locks are stored in the heap tuples rather than a
+RAM-based lock table, the optimization described in the Cahill thesis
+which eliminates an SIREAD lock where there is a write lock is
+implemented by the following:
+ 1. When checking a heap write for conflicts against existing
+predicate locks, a tuple lock on the tuple being written is removed.
+ 2. When acquiring a predicate lock on a heap tuple, we
+return quickly without doing anything if it is a tuple written by the
+reading transaction.
+
+ * Rather than using conflictIn and conflictOut pointers which use
+NULL to indicate no conflict and a self-reference to indicate
+multiple conflicts or conflicts with committed transactions, we use a
+list of rw-conflicts. With the more complete information, false
+positives are reduced and we have sufficient data for more aggressive
+clean-up and other optimizations:
+
+ o We can avoid ever rolling back a transaction until and
+unless there is a pivot where a transaction on the conflict *out*
+side of the pivot committed before either of the other transactions.
+
+ o We can avoid ever rolling back a transaction when the
+transaction on the conflict *in* side of the pivot is explicitly or
+implicitly READ ONLY unless the transaction on the conflict *out*
+side of the pivot committed before the READ ONLY transaction acquired
+its snapshot. (An implicit READ ONLY transaction is one which
+committed without writing, even though it was not explicitly declared
+to be READ ONLY.)
+
+ o We can more aggressively clean up conflicts, predicate
+locks, and SSI transaction information.
+
+ * We allow a READ ONLY transaction to "opt out" of SSI if there are
+no READ WRITE transactions which could cause the READ ONLY
+transaction to ever become part of a "dangerous structure" of
+overlapping transaction dependencies.
+
+ * We allow the user to request that a READ ONLY transaction wait
+until the conditions are right for it to start in the "opt out" state
+described above. We add a DEFERRABLE state to transactions, which is
+specified and maintained in a way similar to READ ONLY. It is
+ignored for transactions that are not SERIALIZABLE and READ ONLY.
+
+ * When a transaction must be rolled back, we pick among the
+active transactions such that an immediate retry will not fail again
+on conflicts with the same transactions.
+
+ * We use the PostgreSQL SLRU system to hold summarized
+information about older committed transactions to put an upper bound
+on RAM used. Beyond that limit, information spills to disk.
+Performance can degrade in a pessimal situation, but it should be
+tolerable, and transactions won't need to be canceled or blocked
+from starting.
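+
+As a rough illustration of the process-local lock table described
+above, the sketch below shows the kind of entry each backend might
+keep.  This is only a sketch: the structure and field names are
+illustrative, not the actual definitions, which live in predicate.c
+(the lock target tag type comes from storage/predicate_internals.h).
+
+    /*
+     * Illustrative only -- not the real declarations.  One entry per
+     * predicate lock target known to this backend.
+     */
+    typedef struct LocalPredicateLockEntry
+    {
+        PREDICATELOCKTARGETTAG tag;     /* relation, page, or tuple target */
+        bool        held;               /* actually held, or only tracked
+                                         * as a coarser covering lock? */
+        int         childLocks;         /* # of finer-granularity locks
+                                         * covered by this entry */
+    } LocalPredicateLockEntry;
+
+When childLocks for a covering entry grows past some threshold, the
+backend can acquire the coarser lock and release the finer ones it
+covers; keeping these counts locally is what lets that promotion
+decision be made cheaply.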
+
+
+R&D Issues
+----------
+
+This is intended to be the place to record specific issues which need
+more detailed review or analysis.
+
+ * WAL file replay. While serializable implementations using S2PL
+can guarantee that the write-ahead log contains commits in a sequence
+consistent with some serial execution of serializable transactions,
+SSI cannot make that guarantee. While the WAL replay is no less
+consistent than under snapshot isolation, it is possible that under
+PITR recovery or hot standby a database could reach a readable state
+where some transactions appear before other transactions which would
+have had to precede them to maintain serializable consistency. In
+essence, if we do nothing, WAL replay will be at snapshot isolation
+even for serializable transactions. Is this OK? If not, how do we
+address it?
+
+ * External replication. Look at how this impacts external
+replication solutions, like Postgres-R, Slony, pgpool, HS/SR, etc.
+This is related to the "WAL file replay" issue.
+
+ * UNIQUE btree search for equality on all columns. Since a search
+of a UNIQUE index using equality tests on all columns will lock the
+heap tuple if an entry is found, it appears that there is no need to
+get a predicate lock on the index in that case. A predicate lock is
+still needed for such a search if a matching index entry which points
+to a visible tuple is not found.
+
+ * Minimize touching of shared memory. Should lists in shared
+memory push entries which have just been returned to the front of the
+available list, so they will be popped back off soon and some memory
+might never be touched, or should we keep adding returned items to
+the end of the available list?
+
+
+References
+----------
+
+[1] http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt
+Search for serial execution to find the relevant section.
+
+[2] A. Fekete et al. Making Snapshot Isolation Serializable. In ACM
+Transactions on Database Systems 30:2, Jun. 2005.
+http://dx.doi.org/10.1145/1071610.1071615
+
+[3] Joseph M. Hellerstein, Michael Stonebraker and James Hamilton. 2007.
+Architecture of a Database System. Foundations and Trends(R) in
+Databases Vol. 1, No. 2 (2007) 141-259.
+http://db.cs.berkeley.edu/papers/fntdb07-architecture.pdf
+ Of particular interest:
+ * 6.1 A Note on ACID
+ * 6.2 A Brief Review of Serializability
+ * 6.3 Locking and Latching
+ * 6.3.1 Transaction Isolation Levels
+ * 6.5.3 Next-Key Locking: Physical Surrogates for Logical Properties
diff --git a/src/backend/storage/lmgr/README.barrier b/src/backend/storage/lmgr/README.barrier
new file mode 100644
index 0000000..f78e5ac
--- /dev/null
+++ b/src/backend/storage/lmgr/README.barrier
@@ -0,0 +1,197 @@
+Memory Barriers
+===============
+
+Modern CPUs make extensive use of pipe-lining and out-of-order execution,
+meaning that the CPU is often executing more than one instruction at a
+time, and not necessarily in the order that the source code would suggest.
+Furthermore, even before the CPU gets a chance to reorder operations, the
+compiler may (and often does) reorganize the code for greater efficiency,
+particularly at higher optimization levels. Optimizing compilers and
+out-of-order execution are both critical for good performance, but they
+can lead to surprising results when multiple processes access the same
+memory space.
+
+Example
+=======
+
+Suppose x is a pointer to a structure stored in shared memory, and that the
+entire structure has been initialized to zero bytes. One backend executes
+the following code fragment:
+
+ x->foo = 1;
+ x->bar = 1;
+
+Meanwhile, at approximately the same time, another backend executes this
+code fragment:
+
+ bar = x->bar;
+ foo = x->foo;
+
+The second backend might end up with foo = 1 and bar = 1 (if it executes
+both statements after the first backend), or with foo = 0 and bar = 0 (if
+it executes both statements before the first backend), or with foo = 1 and
+bar = 0 (if the first backend executes the first statement, the second
+backend executes both statements, and then the first backend executes the
+second statement).
+
+Surprisingly, however, the second backend could also end up with foo = 0
+and bar = 1. The compiler might swap the order of the two stores performed
+by the first backend, or the two loads performed by the second backend.
+Even if it doesn't, on a machine with weak memory ordering (such as PowerPC
+or ARM) the CPU might choose to execute either the loads or the stores
+out of order. This surprising result can lead to bugs.
+
+A common pattern where this actually does result in a bug is when adding items
+onto a queue. The writer does this:
+
+ q->items[q->num_items] = new_item;
+ ++q->num_items;
+
+The reader does this:
+
+ num_items = q->num_items;
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+This code turns out to be unsafe, because the writer might increment
+q->num_items before it finishes storing the new item into the appropriate slot.
+More subtly, the reader might prefetch the contents of the q->items array
+before reading q->num_items. Thus, there's still a bug here *even if the
+writer does everything in the order we expect*. We need the writer to update
+the array before bumping the item counter, and the reader to examine the item
+counter before examining the array.
+
+Note that these types of highly counterintuitive bugs can *only* occur when
+multiple processes are interacting with the same memory segment. A given
+process always perceives its *own* writes to memory in program order.
+
+Avoiding Memory Ordering Bugs
+=============================
+
+The simplest (and often best) way to avoid memory ordering bugs is to
+protect the data structures involved with an lwlock. For more details, see
+src/backend/storage/lmgr/README. For instance, in the above example, the
+writer could acquire an lwlock in exclusive mode before appending to the
+queue, and each reader could acquire the same lock in shared mode before
+reading it. If the data structure is not heavily trafficked, this solution is
+generally entirely adequate.
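+
+For the queue example, a minimal sketch of this approach might look
+like the fragment below.  This is a sketch only: it assumes queue_lock
+is a pointer to an LWLock that was created and initialized elsewhere
+(for example with LWLockInitialize) and is visible to both backends.
+
+    /* writer: append under exclusive lock */
+    LWLockAcquire(queue_lock, LW_EXCLUSIVE);
+    q->items[q->num_items] = new_item;
+    ++q->num_items;
+    LWLockRelease(queue_lock);
+
+    /* reader: scan under shared lock */
+    LWLockAcquire(queue_lock, LW_SHARED);
+    num_items = q->num_items;
+    for (i = 0; i < num_items; ++i)
+    {
+        /* do something with q->items[i] */
+    }
+    LWLockRelease(queue_lock);
+
+Because the lock provides both mutual exclusion and the necessary
+memory ordering, no explicit barriers are needed in this version.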
+
+However, in some cases, it is desirable to avoid the overhead of acquiring
+and releasing locks. In this case, memory barriers may be used to ensure
+that the apparent order of execution is as the programmer desires. In
+PostgreSQL backend code, the pg_memory_barrier() macro may be used to achieve
+this result. In the example above, we can prevent the reader from seeing a
+garbage value by having the writer do this:
+
+ q->items[q->num_items] = new_item;
+ pg_memory_barrier();
+ ++q->num_items;
+
+And by having the reader do this:
+
+ num_items = q->num_items;
+ pg_memory_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+The pg_memory_barrier() macro will (1) prevent the compiler from rearranging
+the code in such a way as to allow the memory accesses to occur out of order
+and (2) generate any code (often, inline assembly) that is needed to prevent
+the CPU from executing the memory accesses out of order. Specifically, the
+barrier prevents loads and stores written after the barrier from being
+performed before the barrier, and vice-versa.
+
+Although this code will work, it is needlessly inefficient. On systems with
+strong memory ordering (such as x86), the CPU never reorders loads with other
+loads, nor stores with other stores. It can, however, allow a load to be
+performed before a subsequent store. To avoid emitting unnecessary memory
+instructions, we provide two additional primitives: pg_read_barrier() and
+pg_write_barrier(). When a memory barrier is being used to separate two
+loads, use pg_read_barrier(); when it is separating two stores, use
+pg_write_barrier(); when it is separating a load and a store (in either
+order), use pg_memory_barrier(). pg_memory_barrier() can always substitute
+for either a read or a write barrier, but is typically more expensive, and
+therefore should be used only when needed.
+
+With these guidelines in mind, the writer can do this:
+
+ q->items[q->num_items] = new_item;
+ pg_write_barrier();
+ ++q->num_items;
+
+And the reader can do this:
+
+ num_items = q->num_items;
+ pg_read_barrier();
+ for (i = 0; i < num_items; ++i)
+ /* do something with q->items[i] */
+
+On machines with strong memory ordering, these weaker barriers will simply
+prevent compiler rearrangement, without emitting any actual machine code.
+On machines with weak memory ordering, they will prevent compiler
+reordering and also emit whatever hardware barrier may be required. Even
+on machines with weak memory ordering, a read or write barrier may be able
+to use a less expensive instruction than a full barrier.
+
+Weaknesses of Memory Barriers
+=============================
+
+While memory barriers are a powerful tool, and much cheaper than locks, they
+are also much less capable than locks. Here are some of the problems.
+
+1. Concurrent writers are unsafe. In the above example of a queue, using
+memory barriers doesn't make it safe for two processes to add items to the
+same queue at the same time. If more than one process can write to the queue,
+a spinlock or lwlock must be used to synchronize access. The readers can
+perhaps proceed without any lock, but the writers may not.
+
+Even very simple write operations often require additional synchronization.
+For example, it's not safe for multiple writers to simultaneously execute
+this code (supposing x is a pointer into shared memory):
+
+ x->foo++;
+
+Although this may compile down to a single machine-language instruction,
+the CPU will execute that instruction by reading the current value of foo,
+adding one to it, and then storing the result back to the original address.
+If two CPUs try to do this simultaneously, both may do their reads before
+either one does its write. Such a case can be made safe by using an
+atomic variable and an atomic add; see port/atomics.h and the sketch
+following this list.
+
+2. Eight-byte loads and stores aren't necessarily atomic. We assume in
+various places in the source code that an aligned four-byte load or store is
+atomic, and that other processes therefore won't see a half-set value.
+Sadly, the same can't be said for eight-byte values: on some platforms, an
+aligned eight-byte load or store will generate two four-byte operations. If
+you need an atomic eight-byte read or write, you must either serialize access
+with a lock or use an atomic variable (again, see the sketch following this
+list).
+
+3. No ordering guarantees. While memory barriers ensure that any given
+process performs loads and stores to shared memory in order, they don't
+guarantee synchronization. In the queue example above, we can use memory
+barriers to be sure that readers won't see garbage, but there's nothing to
+say whether a given reader will run before or after a given writer. If this
+matters in a given situation, some other mechanism must be used instead of
+or in addition to memory barriers.
+
+4. Barrier proliferation. Many algorithms that at first seem appealing
+require multiple barriers. If the number of barriers required is more than
+one or two, you may be better off just using a lock. Keep in mind that, on
+some platforms, a barrier may be implemented by acquiring and releasing a
+backend-private spinlock. This may be better than a centralized lock under
+contention, but it may also be slower in the uncontended case.
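+
+To make points 1 and 2 concrete, here is a sketch using the atomic
+operations declared in port/atomics.h.  The names counters, new_size,
+and size are illustrative; the structure is assumed to live in shared
+memory and to have been initialized exactly once with pg_atomic_init_u32
+and pg_atomic_init_u64 before any concurrent access.
+
+    #include "port/atomics.h"
+
+    typedef struct SharedCounters
+    {
+        pg_atomic_uint32 nitems;        /* point 1: many concurrent adders */
+        pg_atomic_uint64 total_size;    /* point 2: an eight-byte value */
+    } SharedCounters;
+
+    /* Any number of backends can do this concurrently without a lock. */
+    pg_atomic_fetch_add_u32(&counters->nitems, 1);
+
+    /*
+     * Eight-byte reads and writes that other backends will never observe
+     * half-done, even on platforms where plain eight-byte stores aren't
+     * atomic.
+     */
+    pg_atomic_write_u64(&counters->total_size, new_size);
+    size = pg_atomic_read_u64(&counters->total_size);
+
+Note that this addresses only points 1 and 2; the considerations in
+points 3 and 4 still apply.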
+
+Further Reading
+===============
+
+Much of the documentation about memory barriers appears to be quite
+Linux-specific. The following papers may be helpful:
+
+Memory Ordering in Modern Microprocessors, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/ordering.2007.09.19a.pdf
+
+Memory Barriers: a Hardware View for Software Hackers, by Paul E. McKenney
+* http://www.rdrop.com/users/paulmck/scalability/paper/whymb.2010.06.07c.pdf
+
+The Linux kernel also has some useful documentation on this topic. Start
+with Documentation/memory-barriers.txt
diff --git a/src/backend/storage/lmgr/condition_variable.c b/src/backend/storage/lmgr/condition_variable.c
new file mode 100644
index 0000000..910a768
--- /dev/null
+++ b/src/backend/storage/lmgr/condition_variable.c
@@ -0,0 +1,360 @@
+/*-------------------------------------------------------------------------
+ *
+ * condition_variable.c
+ * Implementation of condition variables. Condition variables provide
+ * a way for one process to wait until a specific condition occurs,
+ * without needing to know the specific identity of the process for
+ * which they are waiting. Waits for condition variables can be
+ * interrupted, unlike LWLock waits. Condition variables are safe
+ * to use within dynamic shared memory segments.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/lmgr/condition_variable.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "portability/instr_time.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/* Initially, we are not prepared to sleep on any condition variable. */
+static ConditionVariable *cv_sleep_target = NULL;
+
+/*
+ * Initialize a condition variable.
+ */
+void
+ConditionVariableInit(ConditionVariable *cv)
+{
+ SpinLockInit(&cv->mutex);
+ proclist_init(&cv->wakeup);
+}
+
+/*
+ * Prepare to wait on a given condition variable.
+ *
+ * This can optionally be called before entering a test/sleep loop.
+ * Doing so is more efficient if we'll need to sleep at least once.
+ * However, if the first test of the exit condition is likely to succeed,
+ * it's more efficient to omit the ConditionVariablePrepareToSleep call.
+ * See comments in ConditionVariableSleep for more detail.
+ *
+ * Caution: "before entering the loop" means you *must* test the exit
+ * condition between calling ConditionVariablePrepareToSleep and calling
+ * ConditionVariableSleep. If that is inconvenient, omit calling
+ * ConditionVariablePrepareToSleep.
+ */
+void
+ConditionVariablePrepareToSleep(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+
+ /*
+ * If some other sleep is already prepared, cancel it; this is necessary
+ * because we have just one static variable tracking the prepared sleep,
+ * and also only one cvWaitLink in our PGPROC. It's okay to do this
+ * because whenever control does return to the other test-and-sleep loop,
+ * its ConditionVariableSleep call will just re-establish that sleep as
+ * the prepared one.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /* Record the condition variable on which we will sleep. */
+ cv_sleep_target = cv;
+
+ /* Add myself to the wait queue. */
+ SpinLockAcquire(&cv->mutex);
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+}
+
+/*
+ * Wait for the given condition variable to be signaled.
+ *
+ * This should be called in a predicate loop that tests for a specific exit
+ * condition and otherwise sleeps, like so:
+ *
+ * ConditionVariablePrepareToSleep(cv); // optional
+ * while (condition for which we are waiting is not true)
+ * ConditionVariableSleep(cv, wait_event_info);
+ * ConditionVariableCancelSleep();
+ *
+ * wait_event_info should be a value from one of the WaitEventXXX enums
+ * defined in pgstat.h. This controls the contents of pg_stat_activity's
+ * wait_event_type and wait_event columns while waiting.
+ */
+void
+ConditionVariableSleep(ConditionVariable *cv, uint32 wait_event_info)
+{
+ (void) ConditionVariableTimedSleep(cv, -1 /* no timeout */ ,
+ wait_event_info);
+}
+
+/*
+ * Wait for a condition variable to be signaled or a timeout to be reached.
+ *
+ * Returns true when timeout expires, otherwise returns false.
+ *
+ * See ConditionVariableSleep() for general usage.
+ */
+bool
+ConditionVariableTimedSleep(ConditionVariable *cv, long timeout,
+ uint32 wait_event_info)
+{
+ long cur_timeout = -1;
+ instr_time start_time;
+ instr_time cur_time;
+ int wait_events;
+
+ /*
+ * If the caller didn't prepare to sleep explicitly, then do so now and
+ * return immediately. The caller's predicate loop should immediately
+ * call again if its exit condition is not yet met. This will result in
+ * the exit condition being tested twice before we first sleep. The extra
+ * test can be prevented by calling ConditionVariablePrepareToSleep(cv)
+ * first. Whether it's worth doing that depends on whether you expect the
+ * exit condition to be met initially, in which case skipping the prepare
+ * is recommended because it avoids manipulations of the wait list, or not
+ * met initially, in which case preparing first is better because it
+ * avoids one extra test of the exit condition.
+ *
+ * If we are currently prepared to sleep on some other CV, we just cancel
+ * that and prepare this one; see ConditionVariablePrepareToSleep.
+ */
+ if (cv_sleep_target != cv)
+ {
+ ConditionVariablePrepareToSleep(cv);
+ return false;
+ }
+
+ /*
+ * Record the current time so that we can calculate the remaining timeout
+ * if we are woken up spuriously.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ wait_events = WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH;
+ }
+ else
+ wait_events = WL_LATCH_SET | WL_EXIT_ON_PM_DEATH;
+
+ while (true)
+ {
+ bool done = false;
+
+ /*
+ * Wait for latch to be set. (If we're awakened for some other
+ * reason, the code below will cope anyway.)
+ */
+ (void) WaitLatch(MyLatch, wait_events, cur_timeout, wait_event_info);
+
+ /* Reset latch before examining the state of the wait list. */
+ ResetLatch(MyLatch);
+
+ /*
+ * If this process has been taken out of the wait list, then we know
+ * that it has been signaled by ConditionVariableSignal (or
+ * ConditionVariableBroadcast), so we should return to the caller. But
+ * that doesn't guarantee that the exit condition is met, only that we
+ * ought to check it. So we must put the process back into the wait
+ * list, to ensure we don't miss any additional wakeup occurring while
+ * the caller checks its exit condition. We can take ourselves out of
+ * the wait list only when the caller calls
+ * ConditionVariableCancelSleep.
+ *
+ * If we're still in the wait list, then the latch must have been set
+ * by something other than ConditionVariableSignal; though we don't
+ * guarantee not to return spuriously, we'll avoid this obvious case.
+ */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ {
+ done = true;
+ proclist_push_tail(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /*
+ * Check for interrupts, and return spuriously if that caused the
+ * current sleep target to change (meaning that interrupt handler code
+ * waited for a different condition variable).
+ */
+ CHECK_FOR_INTERRUPTS();
+ if (cv != cv_sleep_target)
+ done = true;
+
+ /* We were signaled, so return */
+ if (done)
+ return false;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time);
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+
+ /* Have we crossed the timeout threshold? */
+ if (cur_timeout <= 0)
+ return true;
+ }
+ }
+}
+
+/*
+ * Cancel any pending sleep operation.
+ *
+ * We just need to remove ourselves from the wait queue of any condition
+ * variable for which we have previously prepared a sleep.
+ *
+ * Do nothing if nothing is pending; this allows this function to be called
+ * during transaction abort to clean up any unfinished CV sleep.
+ *
+ * Return true if we've been signaled.
+ */
+bool
+ConditionVariableCancelSleep(void)
+{
+ ConditionVariable *cv = cv_sleep_target;
+ bool signaled = false;
+
+ if (cv == NULL)
+ return false;
+
+ SpinLockAcquire(&cv->mutex);
+ if (proclist_contains(&cv->wakeup, MyProc->pgprocno, cvWaitLink))
+ proclist_delete(&cv->wakeup, MyProc->pgprocno, cvWaitLink);
+ else
+ signaled = true;
+ SpinLockRelease(&cv->mutex);
+
+ cv_sleep_target = NULL;
+
+ return signaled;
+}
+
+/*
+ * Wake up the oldest process sleeping on the CV, if there is any.
+ *
+ * Note: it's difficult to tell whether this has any real effect: we know
+ * whether we took an entry off the list, but the entry might only be a
+ * sentinel. Hence, think twice before proposing that this should return
+ * a flag telling whether it woke somebody.
+ */
+void
+ConditionVariableSignal(ConditionVariable *cv)
+{
+ PGPROC *proc = NULL;
+
+ /* Remove the first process from the wakeup queue (if any). */
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ /* If we found someone sleeping, set their latch to wake them up. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+}
+
+/*
+ * Wake up all processes sleeping on the given CV.
+ *
+ * This guarantees to wake all processes that were sleeping on the CV
+ * at time of call, but processes that add themselves to the list mid-call
+ * will typically not get awakened.
+ */
+void
+ConditionVariableBroadcast(ConditionVariable *cv)
+{
+ int pgprocno = MyProc->pgprocno;
+ PGPROC *proc = NULL;
+ bool have_sentinel = false;
+
+ /*
+ * In some use-cases, it is common for awakened processes to immediately
+ * re-queue themselves. If we just naively try to reduce the wakeup list
+ * to empty, we'll get into a potentially-indefinite loop against such a
+ * process. The semantics we really want are just to be sure that we have
+ * wakened all processes that were in the list at entry. We can use our
+ * own cvWaitLink as a sentinel to detect when we've finished.
+ *
+ * A seeming flaw in this approach is that someone else might signal the
+ * CV and in doing so remove our sentinel entry. But that's fine: since
+ * CV waiters are always added and removed in order, that must mean that
+ * every previous waiter has been wakened, so we're done. We'll get an
+	 * extra "set" on our latch from someone else's signal, which is
+ * slightly inefficient but harmless.
+ *
+ * We can't insert our cvWaitLink as a sentinel if it's already in use in
+ * some other proclist. While that's not expected to be true for typical
+ * uses of this function, we can deal with it by simply canceling any
+ * prepared CV sleep. The next call to ConditionVariableSleep will take
+ * care of re-establishing the lost state.
+ */
+ if (cv_sleep_target != NULL)
+ ConditionVariableCancelSleep();
+
+ /*
+ * Inspect the state of the queue. If it's empty, we have nothing to do.
+ * If there's exactly one entry, we need only remove and signal that
+ * entry. Otherwise, remove the first entry and insert our sentinel.
+ */
+ SpinLockAcquire(&cv->mutex);
+ /* While we're here, let's assert we're not in the list. */
+ Assert(!proclist_contains(&cv->wakeup, pgprocno, cvWaitLink));
+
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ if (!proclist_is_empty(&cv->wakeup))
+ {
+ proclist_push_tail(&cv->wakeup, pgprocno, cvWaitLink);
+ have_sentinel = true;
+ }
+ }
+ SpinLockRelease(&cv->mutex);
+
+ /* Awaken first waiter, if there was one. */
+ if (proc != NULL)
+ SetLatch(&proc->procLatch);
+
+ while (have_sentinel)
+ {
+ /*
+ * Each time through the loop, remove the first wakeup list entry, and
+ * signal it unless it's our sentinel. Repeat as long as the sentinel
+ * remains in the list.
+ *
+ * Notice that if someone else removes our sentinel, we will waken one
+ * additional process before exiting. That's intentional, because if
+ * someone else signals the CV, they may be intending to waken some
+ * third process that added itself to the list after we added the
+ * sentinel. Better to give a spurious wakeup (which should be
+ * harmless beyond wasting some cycles) than to lose a wakeup.
+ */
+ proc = NULL;
+ SpinLockAcquire(&cv->mutex);
+ if (!proclist_is_empty(&cv->wakeup))
+ proc = proclist_pop_head_node(&cv->wakeup, cvWaitLink);
+ have_sentinel = proclist_contains(&cv->wakeup, pgprocno, cvWaitLink);
+ SpinLockRelease(&cv->mutex);
+
+ if (proc != NULL && proc != MyProc)
+ SetLatch(&proc->procLatch);
+ }
+}
diff --git a/src/backend/storage/lmgr/deadlock.c b/src/backend/storage/lmgr/deadlock.c
new file mode 100644
index 0000000..2bdd20b
--- /dev/null
+++ b/src/backend/storage/lmgr/deadlock.c
@@ -0,0 +1,1159 @@
+/*-------------------------------------------------------------------------
+ *
+ * deadlock.c
+ * POSTGRES deadlock detection code
+ *
+ * See src/backend/storage/lmgr/README for a description of the deadlock
+ * detection and resolution algorithms.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/deadlock.c
+ *
+ * Interface:
+ *
+ * DeadLockCheck()
+ * DeadLockReport()
+ * RememberSimpleDeadLock()
+ * InitDeadLockChecking()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "utils/memutils.h"
+
+
+/*
+ * One edge in the waits-for graph.
+ *
+ * waiter and blocker may or may not be members of a lock group, but if either
+ * is, it will be the leader rather than any other member of the lock group.
+ * The group leaders act as representatives of the whole group even though
+ * those particular processes need not be waiting at all. There will be at
+ * least one member of the waiter's lock group on the wait queue for the given
+ * lock, maybe more.
+ */
+typedef struct
+{
+ PGPROC *waiter; /* the leader of the waiting lock group */
+ PGPROC *blocker; /* the leader of the group it is waiting for */
+ LOCK *lock; /* the lock being waited for */
+ int pred; /* workspace for TopoSort */
+ int link; /* workspace for TopoSort */
+} EDGE;
+
+/* One potential reordering of a lock's wait queue */
+typedef struct
+{
+ LOCK *lock; /* the lock whose wait queue is described */
+ PGPROC **procs; /* array of PGPROC *'s in new wait order */
+ int nProcs;
+} WAIT_ORDER;
+
+/*
+ * Information saved about each edge in a detected deadlock cycle. This
+ * is used to print a diagnostic message upon failure.
+ *
+ * Note: because we want to examine this info after releasing the lock
+ * manager's partition locks, we can't just store LOCK and PGPROC pointers;
+ * we must extract out all the info we want to be able to print.
+ */
+typedef struct
+{
+ LOCKTAG locktag; /* ID of awaited lock object */
+ LOCKMODE lockmode; /* type of lock we're waiting for */
+ int pid; /* PID of blocked backend */
+} DEADLOCK_INFO;
+
+
+static bool DeadLockCheckRecurse(PGPROC *proc);
+static int TestConfiguration(PGPROC *startProc);
+static bool FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurse(PGPROC *checkProc, int depth,
+ EDGE *softEdges, int *nSoftEdges);
+static bool FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth, EDGE *softEdges, int *nSoftEdges);
+static bool ExpandConstraints(EDGE *constraints, int nConstraints);
+static bool TopoSort(LOCK *lock, EDGE *constraints, int nConstraints,
+ PGPROC **ordering);
+
+#ifdef DEBUG_DEADLOCK
+static void PrintLockQueue(LOCK *lock, const char *info);
+#endif
+
+
+/*
+ * Working space for the deadlock detector
+ */
+
+/* Workspace for FindLockCycle */
+static PGPROC **visitedProcs; /* Array of visited procs */
+static int nVisitedProcs;
+
+/* Workspace for TopoSort */
+static PGPROC **topoProcs; /* Array of not-yet-output procs */
+static int *beforeConstraints; /* Counts of remaining before-constraints */
+static int *afterConstraints; /* List head for after-constraints */
+
+/* Output area for ExpandConstraints */
+static WAIT_ORDER *waitOrders; /* Array of proposed queue rearrangements */
+static int nWaitOrders;
+static PGPROC **waitOrderProcs; /* Space for waitOrders queue contents */
+
+/* Current list of constraints being considered */
+static EDGE *curConstraints;
+static int nCurConstraints;
+static int maxCurConstraints;
+
+/* Storage space for results from FindLockCycle */
+static EDGE *possibleConstraints;
+static int nPossibleConstraints;
+static int maxPossibleConstraints;
+static DEADLOCK_INFO *deadlockDetails;
+static int nDeadlockDetails;
+
+/* PGPROC pointer of any blocking autovacuum worker found */
+static PGPROC *blocking_autovacuum_proc = NULL;
+
+
+/*
+ * InitDeadLockChecking -- initialize deadlock checker during backend startup
+ *
+ * This does per-backend initialization of the deadlock checker; primarily,
+ * allocation of working memory for DeadLockCheck. We do this per-backend
+ * since there's no percentage in making the kernel do copy-on-write
+ * inheritance of workspace from the postmaster. We want to allocate the
+ * space at startup because (a) the deadlock checker might be invoked when
+ * there's no free memory left, and (b) the checker is normally run inside a
+ * signal handler, which is a very dangerous place to invoke palloc from.
+ */
+void
+InitDeadLockChecking(void)
+{
+ MemoryContext oldcxt;
+
+ /* Make sure allocations are permanent */
+ oldcxt = MemoryContextSwitchTo(TopMemoryContext);
+
+ /*
+ * FindLockCycle needs at most MaxBackends entries in visitedProcs[] and
+ * deadlockDetails[].
+ */
+ visitedProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+ deadlockDetails = (DEADLOCK_INFO *) palloc(MaxBackends * sizeof(DEADLOCK_INFO));
+
+ /*
+ * TopoSort needs to consider at most MaxBackends wait-queue entries, and
+ * it needn't run concurrently with FindLockCycle.
+ */
+ topoProcs = visitedProcs; /* re-use this space */
+ beforeConstraints = (int *) palloc(MaxBackends * sizeof(int));
+ afterConstraints = (int *) palloc(MaxBackends * sizeof(int));
+
+ /*
+ * We need to consider rearranging at most MaxBackends/2 wait queues
+ * (since it takes at least two waiters in a queue to create a soft edge),
+ * and the expanded form of the wait queues can't involve more than
+ * MaxBackends total waiters.
+ */
+ waitOrders = (WAIT_ORDER *)
+ palloc((MaxBackends / 2) * sizeof(WAIT_ORDER));
+ waitOrderProcs = (PGPROC **) palloc(MaxBackends * sizeof(PGPROC *));
+
+ /*
+ * Allow at most MaxBackends distinct constraints in a configuration. (Is
+ * this enough? In practice it seems it should be, but I don't quite see
+ * how to prove it. If we run out, we might fail to find a workable wait
+ * queue rearrangement even though one exists.) NOTE that this number
+ * limits the maximum recursion depth of DeadLockCheckRecurse. Making it
+ * really big might potentially allow a stack-overflow problem.
+ */
+ maxCurConstraints = MaxBackends;
+ curConstraints = (EDGE *) palloc(maxCurConstraints * sizeof(EDGE));
+
+ /*
+ * Allow up to 3*MaxBackends constraints to be saved without having to
+ * re-run TestConfiguration. (This is probably more than enough, but we
+ * can survive if we run low on space by doing excess runs of
+ * TestConfiguration to re-compute constraint lists each time needed.) The
+ * last MaxBackends entries in possibleConstraints[] are reserved as
+ * output workspace for FindLockCycle.
+ */
+ maxPossibleConstraints = MaxBackends * 4;
+ possibleConstraints =
+ (EDGE *) palloc(maxPossibleConstraints * sizeof(EDGE));
+
+ MemoryContextSwitchTo(oldcxt);
+}
+
+/*
+ * DeadLockCheck -- Checks for deadlocks for a given process
+ *
+ * This code looks for deadlocks involving the given process. If any
+ * are found, it tries to rearrange lock wait queues to resolve the
+ * deadlock. If resolution is impossible, return DS_HARD_DEADLOCK ---
+ * the caller is then expected to abort the given proc's transaction.
+ *
+ * Caller must already have locked all partitions of the lock tables.
+ *
+ * On failure, deadlock details are recorded in deadlockDetails[] for
+ * subsequent printing by DeadLockReport(). That activity is separate
+ * because (a) we don't want to do it while holding all those LWLocks,
+ * and (b) we are typically invoked inside a signal handler.
+ */
+DeadLockState
+DeadLockCheck(PGPROC *proc)
+{
+ /* Initialize to "no constraints" */
+ nCurConstraints = 0;
+ nPossibleConstraints = 0;
+ nWaitOrders = 0;
+
+ /* Initialize to not blocked by an autovacuum worker */
+ blocking_autovacuum_proc = NULL;
+
+ /* Search for deadlocks and possible fixes */
+ if (DeadLockCheckRecurse(proc))
+ {
+ /*
+ * Call FindLockCycle one more time, to record the correct
+ * deadlockDetails[] for the basic state with no rearrangements.
+ */
+ int nSoftEdges;
+
+ TRACE_POSTGRESQL_DEADLOCK_FOUND();
+
+ nWaitOrders = 0;
+ if (!FindLockCycle(proc, possibleConstraints, &nSoftEdges))
+ elog(FATAL, "deadlock seems to have disappeared");
+
+ return DS_HARD_DEADLOCK; /* cannot find a non-deadlocked state */
+ }
+
+ /* Apply any needed rearrangements of wait queues */
+ for (int i = 0; i < nWaitOrders; i++)
+ {
+ LOCK *lock = waitOrders[i].lock;
+ PGPROC **procs = waitOrders[i].procs;
+ int nProcs = waitOrders[i].nProcs;
+ dclist_head *waitQueue = &lock->waitProcs;
+
+ Assert(nProcs == dclist_count(waitQueue));
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "DeadLockCheck:");
+#endif
+
+ /* Reset the queue and re-add procs in the desired order */
+ dclist_init(waitQueue);
+ for (int j = 0; j < nProcs; j++)
+ dclist_push_tail(waitQueue, &procs[j]->links);
+
+#ifdef DEBUG_DEADLOCK
+ PrintLockQueue(lock, "rearranged to:");
+#endif
+
+ /* See if any waiters for the lock can be woken up now */
+ ProcLockWakeup(GetLocksMethodTable(lock), lock);
+ }
+
+ /* Return code tells caller if we had to escape a deadlock or not */
+ if (nWaitOrders > 0)
+ return DS_SOFT_DEADLOCK;
+ else if (blocking_autovacuum_proc != NULL)
+ return DS_BLOCKED_BY_AUTOVACUUM;
+ else
+ return DS_NO_DEADLOCK;
+}
+
+/*
+ * Return the PGPROC of the autovacuum that's blocking a process.
+ *
+ * We reset the saved pointer as soon as we pass it back.
+ */
+PGPROC *
+GetBlockingAutoVacuumPgproc(void)
+{
+ PGPROC *ptr;
+
+ ptr = blocking_autovacuum_proc;
+ blocking_autovacuum_proc = NULL;
+
+ return ptr;
+}
+
+/*
+ * DeadLockCheckRecurse -- recursively search for valid orderings
+ *
+ * curConstraints[] holds the current set of constraints being considered
+ * by an outer level of recursion. Add to this each possible solution
+ * constraint for any cycle detected at this level.
+ *
+ * Returns true if no solution exists. Returns false if a deadlock-free
+ * state is attainable, in which case waitOrders[] shows the required
+ * rearrangements of lock wait queues (if any).
+ */
+static bool
+DeadLockCheckRecurse(PGPROC *proc)
+{
+ int nEdges;
+ int oldPossibleConstraints;
+ bool savedList;
+ int i;
+
+ nEdges = TestConfiguration(proc);
+ if (nEdges < 0)
+ return true; /* hard deadlock --- no solution */
+ if (nEdges == 0)
+ return false; /* good configuration found */
+ if (nCurConstraints >= maxCurConstraints)
+ return true; /* out of room for active constraints? */
+ oldPossibleConstraints = nPossibleConstraints;
+ if (nPossibleConstraints + nEdges + MaxBackends <= maxPossibleConstraints)
+ {
+ /* We can save the edge list in possibleConstraints[] */
+ nPossibleConstraints += nEdges;
+ savedList = true;
+ }
+ else
+ {
+ /* Not room; will need to regenerate the edges on-the-fly */
+ savedList = false;
+ }
+
+ /*
+ * Try each available soft edge as an addition to the configuration.
+ */
+ for (i = 0; i < nEdges; i++)
+ {
+ if (!savedList && i > 0)
+ {
+ /* Regenerate the list of possible added constraints */
+ if (nEdges != TestConfiguration(proc))
+ elog(FATAL, "inconsistent results during deadlock check");
+ }
+ curConstraints[nCurConstraints] =
+ possibleConstraints[oldPossibleConstraints + i];
+ nCurConstraints++;
+ if (!DeadLockCheckRecurse(proc))
+ return false; /* found a valid solution! */
+ /* give up on that added constraint, try again */
+ nCurConstraints--;
+ }
+ nPossibleConstraints = oldPossibleConstraints;
+ return true; /* no solution found */
+}
+
+
+/*--------------------
+ * Test a configuration (current set of constraints) for validity.
+ *
+ * Returns:
+ * 0: the configuration is good (no deadlocks)
+ * -1: the configuration has a hard deadlock or is not self-consistent
+ * >0: the configuration has one or more soft deadlocks
+ *
+ * In the soft-deadlock case, one of the soft cycles is chosen arbitrarily
+ * and a list of its soft edges is returned beginning at
+ * possibleConstraints+nPossibleConstraints. The return value is the
+ * number of soft edges.
+ *--------------------
+ */
+static int
+TestConfiguration(PGPROC *startProc)
+{
+ int softFound = 0;
+ EDGE *softEdges = possibleConstraints + nPossibleConstraints;
+ int nSoftEdges;
+ int i;
+
+ /*
+ * Make sure we have room for FindLockCycle's output.
+ */
+ if (nPossibleConstraints + MaxBackends > maxPossibleConstraints)
+ return -1;
+
+ /*
+ * Expand current constraint set into wait orderings. Fail if the
+ * constraint set is not self-consistent.
+ */
+ if (!ExpandConstraints(curConstraints, nCurConstraints))
+ return -1;
+
+ /*
+ * Check for cycles involving startProc or any of the procs mentioned in
+ * constraints. We check startProc last because if it has a soft cycle
+ * still to be dealt with, we want to deal with that first.
+ */
+ for (i = 0; i < nCurConstraints; i++)
+ {
+ if (FindLockCycle(curConstraints[i].waiter, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ if (FindLockCycle(curConstraints[i].blocker, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ }
+ if (FindLockCycle(startProc, softEdges, &nSoftEdges))
+ {
+ if (nSoftEdges == 0)
+ return -1; /* hard deadlock detected */
+ softFound = nSoftEdges;
+ }
+ return softFound;
+}
+
+
+/*
+ * FindLockCycle -- basic check for deadlock cycles
+ *
+ * Scan outward from the given proc to see if there is a cycle in the
+ * waits-for graph that includes this proc. Return true if a cycle
+ * is found, else false. If a cycle is found, we return a list of
+ * the "soft edges", if any, included in the cycle. These edges could
+ * potentially be eliminated by rearranging wait queues. We also fill
+ * deadlockDetails[] with information about the detected cycle; this info
+ * is not used by the deadlock algorithm itself, only to print a useful
+ * message after failing.
+ *
+ * Since we need to be able to check hypothetical configurations that would
+ * exist after wait queue rearrangement, the routine pays attention to the
+ * table of hypothetical queue orders in waitOrders[]. These orders will
+ * be believed in preference to the actual ordering seen in the locktable.
+ */
+static bool
+FindLockCycle(PGPROC *checkProc,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ nVisitedProcs = 0;
+ nDeadlockDetails = 0;
+ *nSoftEdges = 0;
+ return FindLockCycleRecurse(checkProc, 0, softEdges, nSoftEdges);
+}
+
+static bool
+FindLockCycleRecurse(PGPROC *checkProc,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ int i;
+ dlist_iter iter;
+
+ /*
+ * If this process is a lock group member, check the leader instead. (Note
+ * that we might be the leader, in which case this is a no-op.)
+ */
+ if (checkProc->lockGroupLeader != NULL)
+ checkProc = checkProc->lockGroupLeader;
+
+ /*
+ * Have we already seen this proc?
+ */
+ for (i = 0; i < nVisitedProcs; i++)
+ {
+ if (visitedProcs[i] == checkProc)
+ {
+ /* If we return to starting point, we have a deadlock cycle */
+ if (i == 0)
+ {
+ /*
+ * record total length of cycle --- outer levels will now fill
+ * deadlockDetails[]
+ */
+ Assert(depth <= MaxBackends);
+ nDeadlockDetails = depth;
+
+ return true;
+ }
+
+ /*
+ * Otherwise, we have a cycle but it does not include the start
+ * point, so say "no deadlock".
+ */
+ return false;
+ }
+ }
+ /* Mark proc as seen */
+ Assert(nVisitedProcs < MaxBackends);
+ visitedProcs[nVisitedProcs++] = checkProc;
+
+ /*
+ * If the process is waiting, there is an outgoing waits-for edge to each
+ * process that blocks it.
+ */
+ if (checkProc->links.next != NULL && checkProc->waitLock != NULL &&
+ FindLockCycleRecurseMember(checkProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+
+ /*
+ * If the process is not waiting, there could still be outgoing waits-for
+ * edges if it is part of a lock group, because other members of the lock
+ * group might be waiting even though this process is not. (Given lock
+ * groups {A1, A2} and {B1, B2}, if A1 waits for B1 and B2 waits for A2,
+	 * that is a deadlock even though neither B1 nor A2 is waiting for anything.)
+ */
+ dlist_foreach(iter, &checkProc->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+
+ if (memberProc->links.next != NULL && memberProc->waitLock != NULL &&
+ memberProc != checkProc &&
+ FindLockCycleRecurseMember(memberProc, checkProc, depth, softEdges,
+ nSoftEdges))
+ return true;
+ }
+
+ return false;
+}
+
+static bool
+FindLockCycleRecurseMember(PGPROC *checkProc,
+ PGPROC *checkProcLeader,
+ int depth,
+ EDGE *softEdges, /* output argument */
+ int *nSoftEdges) /* output argument */
+{
+ PGPROC *proc;
+ LOCK *lock = checkProc->waitLock;
+ dlist_iter proclock_iter;
+ LockMethod lockMethodTable;
+ int conflictMask;
+ int i;
+ int numLockModes,
+ lm;
+
+ /*
+	 * The relation extension lock can never participate in an actual
+	 * deadlock cycle.  See the Assert in LockAcquireExtended.  So there is
+	 * no advantage in checking wait edges from it.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND)
+ return false;
+
+ lockMethodTable = GetLocksMethodTable(lock);
+ numLockModes = lockMethodTable->numLockModes;
+ conflictMask = lockMethodTable->conflictTab[checkProc->waitLockMode];
+
+ /*
+ * Scan for procs that already hold conflicting locks. These are "hard"
+ * edges in the waits-for graph.
+ */
+ dlist_foreach(proclock_iter, &lock->procLocks)
+ {
+ PROCLOCK *proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+ PGPROC *leader;
+
+ proc = proclock->tag.myProc;
+ leader = proc->lockGroupLeader == NULL ? proc : proc->lockGroupLeader;
+
+ /* A proc never blocks itself or any other lock group member */
+ if (leader != checkProcLeader)
+ {
+ for (lm = 1; lm <= numLockModes; lm++)
+ {
+ if ((proclock->holdMask & LOCKBIT_ON(lm)) &&
+ (conflictMask & LOCKBIT_ON(lm)))
+ {
+ /* This proc hard-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ return true;
+ }
+
+ /*
+ * No deadlock here, but see if this proc is an autovacuum
+ * that is directly hard-blocking our own proc. If so,
+ * report it so that the caller can send a cancel signal
+ * to it, if appropriate. If there's more than one such
+ * proc, it's indeterminate which one will be reported.
+ *
+ * We don't touch autovacuums that are indirectly blocking
+ * us; it's up to the direct blockee to take action. This
+ * rule simplifies understanding the behavior and ensures
+ * that an autovacuum won't be canceled with less than
+ * deadlock_timeout grace period.
+ *
+ * Note we read statusFlags without any locking. This is
+ * OK only for checking the PROC_IS_AUTOVACUUM flag,
+ * because that flag is set at process start and never
+ * reset. There is logic elsewhere to avoid canceling an
+ * autovacuum that is working to prevent XID wraparound
+ * problems (which needs to read a different statusFlags
+ * bit), but we don't do that here to avoid grabbing
+ * ProcArrayLock.
+ */
+ if (checkProc == MyProc &&
+ proc->statusFlags & PROC_IS_AUTOVACUUM)
+ blocking_autovacuum_proc = proc;
+
+ /* We're done looking at this proclock */
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * Scan for procs that are ahead of this one in the lock's wait queue.
+ * Those that have conflicting requests soft-block this one. This must be
+ * done after the hard-block search, since if another proc both hard- and
+ * soft-blocks this one, we want to call it a hard edge.
+ *
+ * If there is a proposed re-ordering of the lock's wait order, use that
+ * rather than the current wait order.
+ */
+ for (i = 0; i < nWaitOrders; i++)
+ {
+ if (waitOrders[i].lock == lock)
+ break;
+ }
+
+ if (i < nWaitOrders)
+ {
+ /* Use the given hypothetical wait queue order */
+ PGPROC **procs = waitOrders[i].procs;
+ int queue_size = waitOrders[i].nProcs;
+
+ for (i = 0; i < queue_size; i++)
+ {
+ PGPROC *leader;
+
+ proc = procs[i];
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /*
+ * TopoSort will always return an ordering with group members
+ * adjacent to each other in the wait queue (see comments
+ * therein). So, as soon as we reach a process in the same lock
+ * group as checkProc, we know we've found all the conflicts that
+			 * precede any member of the lock group led by checkProcLeader.
+ */
+ if (leader == checkProcLeader)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+ }
+ }
+ else
+ {
+ PGPROC *lastGroupMember = NULL;
+ dlist_iter proc_iter;
+ dclist_head *waitQueue;
+
+ /* Use the true lock wait queue order */
+ waitQueue = &lock->waitProcs;
+
+ /*
+ * Find the last member of the lock group that is present in the wait
+ * queue. Anything after this is not a soft lock conflict. If group
+ * locking is not in use, then we know immediately which process we're
+ * looking for, but otherwise we've got to search the wait queue to
+ * find the last process actually present.
+ */
+ if (checkProc->lockGroupLeader == NULL)
+ lastGroupMember = checkProc;
+ else
+ {
+ dclist_foreach(proc_iter, waitQueue)
+ {
+ proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+ if (proc->lockGroupLeader == checkProcLeader)
+ lastGroupMember = proc;
+ }
+ Assert(lastGroupMember != NULL);
+ }
+
+ /*
+ * OK, now rescan (or scan) the queue to identify the soft conflicts.
+ */
+ dclist_foreach(proc_iter, waitQueue)
+ {
+ PGPROC *leader;
+
+ proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+ leader = proc->lockGroupLeader == NULL ? proc :
+ proc->lockGroupLeader;
+
+ /* Done when we reach the target proc */
+ if (proc == lastGroupMember)
+ break;
+
+ /* Is there a conflict with this guy's request? */
+ if ((LOCKBIT_ON(proc->waitLockMode) & conflictMask) != 0 &&
+ leader != checkProcLeader)
+ {
+ /* This proc soft-blocks checkProc */
+ if (FindLockCycleRecurse(proc, depth + 1,
+ softEdges, nSoftEdges))
+ {
+ /* fill deadlockDetails[] */
+ DEADLOCK_INFO *info = &deadlockDetails[depth];
+
+ info->locktag = lock->tag;
+ info->lockmode = checkProc->waitLockMode;
+ info->pid = checkProc->pid;
+
+ /*
+ * Add this edge to the list of soft edges in the cycle
+ */
+ Assert(*nSoftEdges < MaxBackends);
+ softEdges[*nSoftEdges].waiter = checkProcLeader;
+ softEdges[*nSoftEdges].blocker = leader;
+ softEdges[*nSoftEdges].lock = lock;
+ (*nSoftEdges)++;
+ return true;
+ }
+ }
+ }
+ }
+
+ /*
+ * No conflict detected here.
+ */
+ return false;
+}
+
+
+/*
+ * ExpandConstraints -- expand a list of constraints into a set of
+ * specific new orderings for affected wait queues
+ *
+ * Input is a list of soft edges to be reversed. The output is a list
+ * of nWaitOrders WAIT_ORDER structs in waitOrders[], with PGPROC array
+ * workspace in waitOrderProcs[].
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+ExpandConstraints(EDGE *constraints,
+ int nConstraints)
+{
+ int nWaitOrderProcs = 0;
+ int i,
+ j;
+
+ nWaitOrders = 0;
+
+ /*
+ * Scan constraint list backwards. This is because the last-added
+ * constraint is the only one that could fail, and so we want to test it
+ * for inconsistency first.
+ */
+ for (i = nConstraints; --i >= 0;)
+ {
+ LOCK *lock = constraints[i].lock;
+
+ /* Did we already make a list for this lock? */
+ for (j = nWaitOrders; --j >= 0;)
+ {
+ if (waitOrders[j].lock == lock)
+ break;
+ }
+ if (j >= 0)
+ continue;
+ /* No, so allocate a new list */
+ waitOrders[nWaitOrders].lock = lock;
+ waitOrders[nWaitOrders].procs = waitOrderProcs + nWaitOrderProcs;
+ waitOrders[nWaitOrders].nProcs = dclist_count(&lock->waitProcs);
+ nWaitOrderProcs += dclist_count(&lock->waitProcs);
+ Assert(nWaitOrderProcs <= MaxBackends);
+
+ /*
+ * Do the topo sort. TopoSort need not examine constraints after this
+ * one, since they must be for different locks.
+ */
+ if (!TopoSort(lock, constraints, i + 1,
+ waitOrders[nWaitOrders].procs))
+ return false;
+ nWaitOrders++;
+ }
+ return true;
+}
+
+
+/*
+ * TopoSort -- topological sort of a wait queue
+ *
+ * Generate a re-ordering of a lock's wait queue that satisfies given
+ * constraints about certain procs preceding others. (Each such constraint
+ * is a fact of a partial ordering.) Minimize rearrangement of the queue
+ * not needed to achieve the partial ordering.
+ *
+ * This is a lot simpler and slower than, for example, the topological sort
+ * algorithm shown in Knuth's Volume 1. However, Knuth's method doesn't
+ * try to minimize the damage to the existing order. In practice we are
+ * not likely to be working with more than a few constraints, so the apparent
+ * slowness of the algorithm won't really matter.
+ *
+ * The initial queue ordering is taken directly from the lock's wait queue.
+ * The output is an array of PGPROC pointers, of length equal to the lock's
+ * wait queue length (the caller is responsible for providing this space).
+ * The partial order is specified by an array of EDGE structs. Each EDGE
+ * is one that we need to reverse, therefore the "waiter" must appear before
+ * the "blocker" in the output array. The EDGE array may well contain
+ * edges associated with other locks; these should be ignored.
+ *
+ * Returns true if able to build an ordering that satisfies all the
+ * constraints, false if not (there are contradictory constraints).
+ */
+static bool
+TopoSort(LOCK *lock,
+ EDGE *constraints,
+ int nConstraints,
+ PGPROC **ordering) /* output argument */
+{
+ dclist_head *waitQueue = &lock->waitProcs;
+ int queue_size = dclist_count(waitQueue);
+ PGPROC *proc;
+ int i,
+ j,
+ jj,
+ k,
+ kk,
+ last;
+ dlist_iter proc_iter;
+
+ /* First, fill topoProcs[] array with the procs in their current order */
+ i = 0;
+ dclist_foreach(proc_iter, waitQueue)
+ {
+ proc = dlist_container(PGPROC, links, proc_iter.cur);
+ topoProcs[i++] = proc;
+ }
+ Assert(i == queue_size);
+
+ /*
+ * Scan the constraints, and for each proc in the array, generate a count
+ * of the number of constraints that say it must be before something else,
+ * plus a list of the constraints that say it must be after something
+ * else. The count for the j'th proc is stored in beforeConstraints[j],
+ * and the head of its list in afterConstraints[j]. Each constraint
+ * stores its list link in constraints[i].link (note any constraint will
+ * be in just one list). The array index for the before-proc of the i'th
+ * constraint is remembered in constraints[i].pred.
+ *
+ * Note that it's not necessarily the case that every constraint affects
+ * this particular wait queue. Prior to group locking, a process could be
+ * waiting for at most one lock. But a lock group can be waiting for
+ * zero, one, or multiple locks. Since topoProcs[] is an array of the
+ * processes actually waiting, while constraints[] is an array of group
+ * leaders, we've got to scan through topoProcs[] for each constraint,
+ * checking whether both a waiter and a blocker for that group are
+ * present. If so, the constraint is relevant to this wait queue; if not,
+ * it isn't.
+ */
+ MemSet(beforeConstraints, 0, queue_size * sizeof(int));
+ MemSet(afterConstraints, 0, queue_size * sizeof(int));
+ for (i = 0; i < nConstraints; i++)
+ {
+ /*
+ * Find a representative process that is on the lock queue and part of
+ * the waiting lock group. This may or may not be the leader, which
+ * may or may not be waiting at all. If there are any other processes
+ * in the same lock group on the queue, set their number of
+ * beforeConstraints to -1 to indicate that they should be emitted
+ * with their groupmates rather than considered separately.
+ *
+ * In this loop and the similar one just below, it's critical that we
+ * consistently select the same representative member of any one lock
+ * group, so that all the constraints are associated with the same
+ * proc, and the -1's are only associated with not-representative
+ * members. We select the last one in the topoProcs array.
+ */
+ proc = constraints[i].waiter;
+ Assert(proc != NULL);
+ jj = -1;
+ for (j = queue_size; --j >= 0;)
+ {
+ PGPROC *waiter = topoProcs[j];
+
+ if (waiter == proc || waiter->lockGroupLeader == proc)
+ {
+ Assert(waiter->waitLock == lock);
+ if (jj == -1)
+ jj = j;
+ else
+ {
+ Assert(beforeConstraints[j] <= 0);
+ beforeConstraints[j] = -1;
+ }
+ }
+ }
+
+ /* If no matching waiter, constraint is not relevant to this lock. */
+ if (jj < 0)
+ continue;
+
+ /*
+ * Similarly, find a representative process that is on the lock queue
+ * and waiting for the blocking lock group. Again, this could be the
+ * leader but does not need to be.
+ */
+ proc = constraints[i].blocker;
+ Assert(proc != NULL);
+ kk = -1;
+ for (k = queue_size; --k >= 0;)
+ {
+ PGPROC *blocker = topoProcs[k];
+
+ if (blocker == proc || blocker->lockGroupLeader == proc)
+ {
+ Assert(blocker->waitLock == lock);
+ if (kk == -1)
+ kk = k;
+ else
+ {
+ Assert(beforeConstraints[k] <= 0);
+ beforeConstraints[k] = -1;
+ }
+ }
+ }
+
+ /* If no matching blocker, constraint is not relevant to this lock. */
+ if (kk < 0)
+ continue;
+
+ Assert(beforeConstraints[jj] >= 0);
+ beforeConstraints[jj]++; /* waiter must come before */
+ /* add this constraint to list of after-constraints for blocker */
+ constraints[i].pred = jj;
+ constraints[i].link = afterConstraints[kk];
+ afterConstraints[kk] = i + 1;
+ }
+
+ /*--------------------
+ * Now scan the topoProcs array backwards. At each step, output the
+ * last proc that has no remaining before-constraints plus any other
+ * members of the same lock group; then decrease the beforeConstraints
+ * count of each of the procs it was constrained against.
+ * i = index of ordering[] entry we want to output this time
+ * j = search index for topoProcs[]
+ * k = temp for scanning constraint list for proc j
+ * last = last non-null index in topoProcs (avoid redundant searches)
+ *--------------------
+ */
+ last = queue_size - 1;
+ for (i = queue_size - 1; i >= 0;)
+ {
+ int c;
+ int nmatches = 0;
+
+ /* Find next candidate to output */
+ while (topoProcs[last] == NULL)
+ last--;
+ for (j = last; j >= 0; j--)
+ {
+ if (topoProcs[j] != NULL && beforeConstraints[j] == 0)
+ break;
+ }
+
+ /* If no available candidate, topological sort fails */
+ if (j < 0)
+ return false;
+
+ /*
+ * Output everything in the lock group. There's no point in
+ * outputting an ordering where members of the same lock group are not
+ * consecutive on the wait queue: if some other waiter is between two
+ * requests that belong to the same group, then either it conflicts
+ * with both of them and is certainly not a solution; or it conflicts
+ * with at most one of them and is thus isomorphic to an ordering
+ * where the group members are consecutive.
+ */
+ proc = topoProcs[j];
+ if (proc->lockGroupLeader != NULL)
+ proc = proc->lockGroupLeader;
+ Assert(proc != NULL);
+ for (c = 0; c <= last; ++c)
+ {
+ if (topoProcs[c] == proc || (topoProcs[c] != NULL &&
+ topoProcs[c]->lockGroupLeader == proc))
+ {
+ ordering[i - nmatches] = topoProcs[c];
+ topoProcs[c] = NULL;
+ ++nmatches;
+ }
+ }
+ Assert(nmatches > 0);
+ i -= nmatches;
+
+ /* Update beforeConstraints counts of its predecessors */
+ for (k = afterConstraints[j]; k > 0; k = constraints[k - 1].link)
+ beforeConstraints[constraints[k - 1].pred]--;
+ }
+
+ /* Done */
+ return true;
+}
+
+#ifdef DEBUG_DEADLOCK
+static void
+PrintLockQueue(LOCK *lock, const char *info)
+{
+ dclist_head *waitQueue = &lock->waitProcs;
+ dlist_iter proc_iter;
+
+ printf("%s lock %p queue ", info, lock);
+
+ dclist_foreach(proc_iter, waitQueue)
+ {
+ PGPROC *proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+ printf(" %d", proc->pid);
+ }
+ printf("\n");
+ fflush(stdout);
+}
+#endif
+
+/*
+ * Report a detected deadlock, with available details.
+ */
+void
+DeadLockReport(void)
+{
+ StringInfoData clientbuf; /* errdetail for client */
+ StringInfoData logbuf; /* errdetail for server log */
+ StringInfoData locktagbuf;
+ int i;
+
+ initStringInfo(&clientbuf);
+ initStringInfo(&logbuf);
+ initStringInfo(&locktagbuf);
+
+ /* Generate the "waits for" lines sent to the client */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+ int nextpid;
+
+ /* The last proc waits for the first one... */
+ if (i < nDeadlockDetails - 1)
+ nextpid = info[1].pid;
+ else
+ nextpid = deadlockDetails[0].pid;
+
+ /* reset locktagbuf to hold next object description */
+ resetStringInfo(&locktagbuf);
+
+ DescribeLockTag(&locktagbuf, &info->locktag);
+
+ if (i > 0)
+ appendStringInfoChar(&clientbuf, '\n');
+
+ appendStringInfo(&clientbuf,
+ _("Process %d waits for %s on %s; blocked by process %d."),
+ info->pid,
+ GetLockmodeName(info->locktag.locktag_lockmethodid,
+ info->lockmode),
+ locktagbuf.data,
+ nextpid);
+ }
+
+ /* Duplicate all the above for the server ... */
+ appendBinaryStringInfo(&logbuf, clientbuf.data, clientbuf.len);
+
+ /* ... and add info about query strings */
+ for (i = 0; i < nDeadlockDetails; i++)
+ {
+ DEADLOCK_INFO *info = &deadlockDetails[i];
+
+ appendStringInfoChar(&logbuf, '\n');
+
+ appendStringInfo(&logbuf,
+ _("Process %d: %s"),
+ info->pid,
+ pgstat_get_backend_current_activity(info->pid, false));
+ }
+
+ pgstat_report_deadlock();
+
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+ errmsg("deadlock detected"),
+ errdetail_internal("%s", clientbuf.data),
+ errdetail_log("%s", logbuf.data),
+ errhint("See server log for query details.")));
+}
+
+/*
+ * RememberSimpleDeadLock: set up info for DeadLockReport when ProcSleep
+ * detects a trivial (two-way) deadlock. proc1 wants to block for lockmode
+ * on lock, but proc2 is already waiting and would be blocked by proc1.
+ */
+void
+RememberSimpleDeadLock(PGPROC *proc1,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PGPROC *proc2)
+{
+ DEADLOCK_INFO *info = &deadlockDetails[0];
+
+ info->locktag = lock->tag;
+ info->lockmode = lockmode;
+ info->pid = proc1->pid;
+ info++;
+ info->locktag = proc2->waitLock->tag;
+ info->lockmode = proc2->waitLockMode;
+ info->pid = proc2->pid;
+ nDeadlockDetails = 2;
+}
diff --git a/src/backend/storage/lmgr/generate-lwlocknames.pl b/src/backend/storage/lmgr/generate-lwlocknames.pl
new file mode 100644
index 0000000..863c882
--- /dev/null
+++ b/src/backend/storage/lmgr/generate-lwlocknames.pl
@@ -0,0 +1,77 @@
+#!/usr/bin/perl
+#
+# Generate lwlocknames.h and lwlocknames.c from lwlocknames.txt
+# Copyright (c) 2000-2023, PostgreSQL Global Development Group
+
+use strict;
+use warnings;
+use Getopt::Long;
+
+my $output_path = '.';
+
+my $lastlockidx = -1;
+my $continue = "\n";
+
+GetOptions('outdir:s' => \$output_path);
+
+open my $lwlocknames, '<', $ARGV[0] or die;
+
+# Include PID in suffix in case parallel make runs this multiple times.
+my $htmp = "$output_path/lwlocknames.h.tmp$$";
+my $ctmp = "$output_path/lwlocknames.c.tmp$$";
+open my $h, '>', $htmp or die "Could not open $htmp: $!";
+open my $c, '>', $ctmp or die "Could not open $ctmp: $!";
+
+my $autogen =
+ "/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */\n";
+print $h $autogen;
+print $h "/* there is deliberately not an #ifndef LWLOCKNAMES_H here */\n\n";
+print $c $autogen, "\n";
+
+print $c "const char *const IndividualLWLockNames[] = {";
+
+while (<$lwlocknames>)
+{
+ chomp;
+
+ # Skip comments
+ next if /^#/;
+ next if /^\s*$/;
+
+ die "unable to parse lwlocknames.txt"
+ unless /^(\w+)\s+(\d+)$/;
+
+ (my $lockname, my $lockidx) = ($1, $2);
+
+ my $trimmedlockname = $lockname;
+ $trimmedlockname =~ s/Lock$//;
+ die "lock names must end with 'Lock'" if $trimmedlockname eq $lockname;
+
+ die "lwlocknames.txt not in order" if $lockidx < $lastlockidx;
+ die "lwlocknames.txt has duplicates" if $lockidx == $lastlockidx;
+
+ while ($lastlockidx < $lockidx - 1)
+ {
+ ++$lastlockidx;
+ printf $c "%s \"<unassigned:%d>\"", $continue, $lastlockidx;
+ $continue = ",\n";
+ }
+ printf $c "%s \"%s\"", $continue, $trimmedlockname;
+ $lastlockidx = $lockidx;
+ $continue = ",\n";
+
+ print $h "#define $lockname (&MainLWLockArray[$lockidx].lock)\n";
+}
+
+printf $c "\n};\n";
+print $h "\n";
+printf $h "#define NUM_INDIVIDUAL_LWLOCKS %s\n", $lastlockidx + 1;
+
+close $h;
+close $c;
+
+rename($htmp, "$output_path/lwlocknames.h")
+ || die "rename: $htmp to $output_path/lwlocknames.h: $!";
+rename($ctmp, "$output_path/lwlocknames.c") || die "rename: $ctmp: $!";
+
+close $lwlocknames;
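+
+# For illustration (hypothetical input): a lwlocknames.txt line such as
+#
+#     ShmemIndexLock    1
+#
+# makes the loop above emit "ShmemIndex" (the trailing "Lock" is trimmed)
+# into the C string array, and
+#
+#     #define ShmemIndexLock (&MainLWLockArray[1].lock)
+#
+# into lwlocknames.h; any gaps in the numbering are padded with
+# "<unassigned:N>" placeholder strings in the C array.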
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
new file mode 100644
index 0000000..ee9b89a
--- /dev/null
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -0,0 +1,1270 @@
+/*-------------------------------------------------------------------------
+ *
+ * lmgr.c
+ * POSTGRES lock manager code
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lmgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/xact.h"
+#include "catalog/catalog.h"
+#include "commands/progress.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+/*
+ * Per-backend counter for generating speculative insertion tokens.
+ *
+ * This may wrap around, but that's OK as it's only used for the short
+ * duration between inserting a tuple and checking that there are no (unique)
+ * constraint violations. It's theoretically possible that a backend sees a
+ * tuple that was speculatively inserted by another backend, but before it has
+ * started waiting on the token, the other backend completes its insertion,
+ * and then performs 2^32 unrelated insertions. And after all that, the
+ * first backend finally calls SpeculativeInsertionLockAcquire(), with the
+ * intention of waiting for the first insertion to complete, but ends up
+ * waiting for the latest unrelated insertion instead. Even then, nothing
+ * particularly bad happens: in the worst case they deadlock, causing one of
+ * the transactions to abort.
+ */
+static uint32 speculativeInsertionToken = 0;
+
+
+/*
+ * Struct to hold context info for transaction lock waits.
+ *
+ * 'oper' is the operation that needs to wait for the other transaction; 'rel'
+ * and 'ctid' specify the address of the tuple being waited for.
+ */
+typedef struct XactLockTableWaitInfo
+{
+ XLTW_Oper oper;
+ Relation rel;
+ ItemPointer ctid;
+} XactLockTableWaitInfo;
+
+static void XactLockTableWaitErrorCb(void *arg);
+
+/*
+ * RelationInitLockInfo
+ * Initializes the lock information in a relation descriptor.
+ *
+ * relcache.c must call this during creation of any reldesc.
+ */
+void
+RelationInitLockInfo(Relation relation)
+{
+ Assert(RelationIsValid(relation));
+ Assert(OidIsValid(RelationGetRelid(relation)));
+
+ relation->rd_lockInfo.lockRelId.relId = RelationGetRelid(relation);
+
+ if (relation->rd_rel->relisshared)
+ relation->rd_lockInfo.lockRelId.dbId = InvalidOid;
+ else
+ relation->rd_lockInfo.lockRelId.dbId = MyDatabaseId;
+}
+
+/*
+ * SetLocktagRelationOid
+ * Set up a locktag for a relation, given only relation OID
+ */
+static inline void
+SetLocktagRelationOid(LOCKTAG *tag, Oid relid)
+{
+ Oid dbid;
+
+ if (IsSharedRelation(relid))
+ dbid = InvalidOid;
+ else
+ dbid = MyDatabaseId;
+
+ SET_LOCKTAG_RELATION(*tag, dbid, relid);
+}
+
+/*
+ * LockRelationOid
+ *
+ * Lock a relation given only its OID. This should generally be used
+ * before attempting to open the relation's relcache entry.
+ */
+void
+LockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages, so that we
+ * will update or flush any stale relcache entry before we try to use it.
+ * RangeVarGetRelid() specifically relies on us for this. We can skip
+ * this in the not-uncommon case that we already had the same type of lock
+ * being requested, since then no one else could have modified the
+ * relcache entry in an undesirable way. (In the case where our own xact
+ * modifies the rel, the relcache update happens via
+ * CommandCounterIncrement, not here.)
+ *
+ * However, in corner cases where code acts on tables (usually catalogs)
+ * recursively, we might get here while still processing invalidation
+ * messages in some outer execution of this function or a sibling. The
+ * "cleared" status of the lock tells us whether we really are done
+ * absorbing relevant inval messages.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
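+
+/*
+ * For illustration (hypothetical caller): code that resolves a relation by
+ * OID typically takes the lock first and then opens the relation with
+ * NoLock, since the lock is already held:
+ *
+ *     LockRelationOid(relid, AccessShareLock);
+ *     rel = table_open(relid, NoLock);
+ *     ...
+ *     table_close(rel, AccessShareLock);
+ *
+ * (table_open() performs this call itself when given a real lock mode, so
+ * most callers simply go through it.)
+ */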
+
+/*
+ * ConditionalLockRelationOid
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ *
+ * NOTE: we do not currently need conditional versions of all the
+ * LockXXX routines in this file, but they could easily be added if needed.
+ */
+bool
+ConditionalLockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
+ * LockRelationId
+ *
+ * Lock, given a LockRelId. Same as LockRelationOid but take LockRelId as an
+ * input.
+ */
+void
+LockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * UnlockRelationId
+ *
+ * Unlock, given a LockRelId. This is preferred over UnlockRelationOid
+ * for speed reasons.
+ */
+void
+UnlockRelationId(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * UnlockRelationOid
+ *
+ * Unlock, given only a relation Oid. Use UnlockRelationId if you can.
+ */
+void
+UnlockRelationOid(Oid relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SetLocktagRelationOid(&tag, relid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+void
+LockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, false, true, &locallock);
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+}
+
+/*
+ * ConditionalLockRelation
+ *
+ * This is a convenience routine for acquiring an additional lock on an
+ * already-open relation. Never try to do "relation_open(foo, NoLock)"
+ * and then lock with this.
+ */
+bool
+ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
+ * UnlockRelation
+ *
+ * This is a convenience routine for unlocking a relation without also
+ * closing it.
+ */
+void
+UnlockRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * CheckRelationLockedByMe
+ *
+ * Returns true if current transaction holds a lock on 'relation' of mode
+ * 'lockmode'. If 'orstronger' is true, a stronger lockmode is also OK.
+ * ("Stronger" is defined as "numerically higher", which is a bit
+ * semantically dubious but is OK for the purposes we use this for.)
+ */
+bool
+CheckRelationLockedByMe(Relation relation, LOCKMODE lockmode, bool orstronger)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ if (LockHeldByMe(&tag, lockmode))
+ return true;
+
+ if (orstronger)
+ {
+ LOCKMODE slockmode;
+
+ for (slockmode = lockmode + 1;
+ slockmode <= MaxLockMode;
+ slockmode++)
+ {
+ if (LockHeldByMe(&tag, slockmode))
+ {
+#ifdef NOT_USED
+ /* Sometimes this might be useful for debugging purposes */
+ elog(WARNING, "lock mode %s substituted for %s on relation %s",
+ GetLockmodeName(tag.locktag_lockmethodid, slockmode),
+ GetLockmodeName(tag.locktag_lockmethodid, lockmode),
+ RelationGetRelationName(relation));
+#endif
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/*
+ * LockHasWaitersRelation
+ *
+ * This is a function to check whether someone else is waiting for a
+ * lock which we are currently holding.
+ */
+bool
+LockHasWaitersRelation(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockHasWaiters(&tag, lockmode, false);
+}
+
+/*
+ * LockRelationIdForSession
+ *
+ * This routine grabs a session-level lock on the target relation. The
+ * session lock persists across transaction boundaries. It will be removed
+ * when UnlockRelationIdForSession() is called, or if an ereport(ERROR) occurs,
+ * or if the backend exits.
+ *
+ * Note that one should also grab a transaction-level lock on the rel
+ * in any transaction that actually uses the rel, to ensure that the
+ * relcache entry is up to date.
+ */
+void
+LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockRelationIdForSession
+ */
+void
+UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION(tag, relid->dbId, relid->relId);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+/*
+ * LockRelationForExtension
+ *
+ * This lock tag is used to interlock addition of pages to relations.
+ * We need such locking because bufmgr/smgr definition of P_NEW is not
+ * race-condition-proof.
+ *
+ * We assume the caller is already holding some type of regular lock on
+ * the relation, so no AcceptInvalidationMessages call is needed here.
+ */
+void
+LockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockRelationForExtension
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * RelationExtensionLockWaiterCount
+ *
+ * Count the number of processes waiting for the given relation extension lock.
+ */
+int
+RelationExtensionLockWaiterCount(Relation relation)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ return LockWaiterCount(&tag);
+}
+
+/*
+ * UnlockRelationForExtension
+ */
+void
+UnlockRelationForExtension(Relation relation, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_RELATION_EXTEND(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockDatabaseFrozenIds
+ *
+ * This allows one backend per database to execute vac_update_datfrozenxid().
+ */
+void
+LockDatabaseFrozenIds(LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_DATABASE_FROZEN_IDS(tag, MyDatabaseId);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * LockPage
+ *
+ * Obtain a page-level lock. This is currently used by some index access
+ * methods to lock individual index pages.
+ */
+void
+LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockPage
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockPage
+ */
+void
+UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_PAGE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ blkno);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockTuple
+ *
+ * Obtain a tuple-level lock. This is used in a less-than-intuitive fashion
+ * because we can't afford to keep a separate lock in shared memory for every
+ * tuple. See heap_lock_tuple before using this!
+ */
+void
+LockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+}
+
+/*
+ * ConditionalLockTuple
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL);
+}
+
+/*
+ * UnlockTuple
+ */
+void
+UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TUPLE(tag,
+ relation->rd_lockInfo.lockRelId.dbId,
+ relation->rd_lockInfo.lockRelId.relId,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * XactLockTableInsert
+ *
+ * Insert a lock showing that the given transaction ID is running ---
+ * this is done when an XID is acquired by a transaction or subtransaction.
+ * The lock can then be used to wait for the transaction to finish.
+ */
+void
+XactLockTableInsert(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+}
+
+/*
+ * XactLockTableDelete
+ *
+ * Delete the lock showing that the given transaction ID is running.
+ * (This is never used for main transaction IDs; those locks are only
+ * released implicitly at transaction end. But we do use it for subtrans IDs.)
+ */
+void
+XactLockTableDelete(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * XactLockTableWait
+ *
+ * Wait for the specified transaction to commit or abort. If an operation is
+ * specified, an error context callback describing that operation is set up;
+ * if 'oper' is passed as XLTW_None, no error context callback is set up.
+ *
+ * Note that this does the right thing for subtransactions: if we wait on a
+ * subtransaction, we will exit as soon as it aborts or its top parent commits.
+ * It takes some extra work to ensure this, because to save on shared memory
+ * the XID lock of a subtransaction is released when it ends, whether
+ * successfully or unsuccessfully. So we have to check if it's "still running"
+ * and if so wait for its parent.
+ */
+void
+XactLockTableWait(TransactionId xid, Relation rel, ItemPointer ctid,
+ XLTW_Oper oper)
+{
+ LOCKTAG tag;
+ XactLockTableWaitInfo info;
+ ErrorContextCallback callback;
+ bool first = true;
+
+ /*
+ * If an operation is specified, set up our verbose error context
+ * callback.
+ */
+ if (oper != XLTW_None)
+ {
+ Assert(RelationIsValid(rel));
+ Assert(ItemPointerIsValid(ctid));
+
+ info.rel = rel;
+ info.ctid = ctid;
+ info.oper = oper;
+
+ callback.callback = XactLockTableWaitErrorCb;
+ callback.arg = &info;
+ callback.previous = error_context_stack;
+ error_context_stack = &callback;
+ }
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /*
+ * If the Xid belonged to a subtransaction, then the lock would have
+ * gone away as soon as it was finished; for correct tuple visibility,
+ * the right action is to wait on its parent transaction to go away.
+ * But instead of going levels up one by one, we can just wait for the
+ * topmost transaction to finish with the same end result, which also
+ * incurs less locktable traffic.
+ *
+ * Some uses of this function don't involve tuple visibility -- such
+ * as when building snapshots for logical decoding. It is possible to
+ * see a transaction in ProcArray before it registers itself in the
+ * locktable. The topmost transaction in that case is the same xid,
+ * so we try again after a short sleep. (Don't sleep the first time
+ * through, to avoid slowing down the normal case.)
+ */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ if (oper != XLTW_None)
+ error_context_stack = callback.previous;
+}
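+
+/*
+ * For illustration (hypothetical sketch of a caller): heap update and delete
+ * paths that find the tuple's xmax still in progress typically do something
+ * like
+ *
+ *     if (!TransactionIdIsCurrentTransactionId(xwait))
+ *         XactLockTableWait(xwait, relation, &tuple->t_self, XLTW_Update);
+ *
+ * where xwait is the blocking XID taken from the tuple header; the XLTW_*
+ * value only selects the error context message reported if the wait fails.
+ */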
+
+/*
+ * ConditionalXactLockTableWait
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true if the lock was acquired.
+ */
+bool
+ConditionalXactLockTableWait(TransactionId xid)
+{
+ LOCKTAG tag;
+ bool first = true;
+
+ for (;;)
+ {
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+
+ if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ LockRelease(&tag, ShareLock, false);
+
+ if (!TransactionIdIsInProgress(xid))
+ break;
+
+ /* See XactLockTableWait about this case */
+ if (!first)
+ pg_usleep(1000L);
+ first = false;
+ xid = SubTransGetTopmostTransaction(xid);
+ }
+
+ return true;
+}
+
+/*
+ * SpeculativeInsertionLockAcquire
+ *
+ * Insert a lock showing that the given transaction ID is inserting a tuple,
+ * but hasn't yet decided whether it's going to keep it. The lock can then be
+ * used to wait for the decision to go ahead with the insertion, or aborting
+ * it.
+ *
+ * The token is used to distinguish multiple insertions by the same
+ * transaction. It is returned to caller.
+ */
+uint32
+SpeculativeInsertionLockAcquire(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ speculativeInsertionToken++;
+
+ /*
+ * Check for wrap-around. Zero means no token is held, so don't use that.
+ */
+ if (speculativeInsertionToken == 0)
+ speculativeInsertionToken = 1;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ (void) LockAcquire(&tag, ExclusiveLock, false, false);
+
+ return speculativeInsertionToken;
+}
+
+/*
+ * SpeculativeInsertionLockRelease
+ *
+ * Delete the lock showing that the given transaction is speculatively
+ * inserting a tuple.
+ */
+void
+SpeculativeInsertionLockRelease(TransactionId xid)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, speculativeInsertionToken);
+
+ LockRelease(&tag, ExclusiveLock, false);
+}
+
+/*
+ * SpeculativeInsertionWait
+ *
+ * Wait for the specified transaction to finish or abort the insertion of a
+ * tuple.
+ */
+void
+SpeculativeInsertionWait(TransactionId xid, uint32 token)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_SPECULATIVE_INSERTION(tag, xid, token);
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(token != 0);
+
+ (void) LockAcquire(&tag, ShareLock, false, false);
+ LockRelease(&tag, ShareLock, false);
+}
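+
+/*
+ * For illustration (hypothetical sketch): the speculative-insertion protocol
+ * pairs these three routines roughly as follows.
+ *
+ *     -- inserting backend
+ *     token = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
+ *     ... insert the tuple, stamping the token into it ...
+ *     SpeculativeInsertionLockRelease(GetCurrentTransactionId());
+ *
+ *     -- backend that finds the not-yet-confirmed tuple
+ *     SpeculativeInsertionWait(xwait, token_from_tuple);
+ *
+ * Here xwait and token_from_tuple stand for the inserter's XID and token as
+ * read from the conflicting tuple.
+ */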
+
+/*
+ * XactLockTableWaitErrorCb
+ * Error context callback for transaction lock waits.
+ */
+static void
+XactLockTableWaitErrorCb(void *arg)
+{
+ XactLockTableWaitInfo *info = (XactLockTableWaitInfo *) arg;
+
+ /*
+ * We would like to print schema name too, but that would require a
+ * syscache lookup.
+ */
+ if (info->oper != XLTW_None &&
+ ItemPointerIsValid(info->ctid) && RelationIsValid(info->rel))
+ {
+ const char *cxt;
+
+ switch (info->oper)
+ {
+ case XLTW_Update:
+ cxt = gettext_noop("while updating tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Delete:
+ cxt = gettext_noop("while deleting tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_Lock:
+ cxt = gettext_noop("while locking tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_LockUpdated:
+ cxt = gettext_noop("while locking updated version (%u,%u) of tuple in relation \"%s\"");
+ break;
+ case XLTW_InsertIndex:
+ cxt = gettext_noop("while inserting index tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_InsertIndexUnique:
+ cxt = gettext_noop("while checking uniqueness of tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_FetchUpdated:
+ cxt = gettext_noop("while rechecking updated tuple (%u,%u) in relation \"%s\"");
+ break;
+ case XLTW_RecheckExclusionConstr:
+ cxt = gettext_noop("while checking exclusion constraint on tuple (%u,%u) in relation \"%s\"");
+ break;
+
+ default:
+ return;
+ }
+
+ errcontext(cxt,
+ ItemPointerGetBlockNumber(info->ctid),
+ ItemPointerGetOffsetNumber(info->ctid),
+ RelationGetRelationName(info->rel));
+ }
+}
+
+/*
+ * WaitForLockersMultiple
+ * Wait until no transaction holds locks that conflict with the given
+ * locktags at the given lockmode.
+ *
+ * To do this, obtain the current list of lockers, and wait on their VXIDs
+ * until they are finished.
+ *
+ * Note we don't try to acquire the locks on the given locktags, only the
+ * VXIDs and XIDs of their lock holders; if somebody grabs a conflicting lock
+ * on the objects after we obtained our initial list of lockers, we will not
+ * wait for them.
+ */
+void
+WaitForLockersMultiple(List *locktags, LOCKMODE lockmode, bool progress)
+{
+ List *holders = NIL;
+ ListCell *lc;
+ int total = 0;
+ int done = 0;
+
+ /* Done if no locks to wait for */
+ if (locktags == NIL)
+ return;
+
+ /* Collect the transactions we need to wait on */
+ foreach(lc, locktags)
+ {
+ LOCKTAG *locktag = lfirst(lc);
+ int count;
+
+ holders = lappend(holders,
+ GetLockConflicts(locktag, lockmode,
+ progress ? &count : NULL));
+ if (progress)
+ total += count;
+ }
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_TOTAL, total);
+
+ /*
+ * Note: GetLockConflicts() never reports our own xid, hence we need not
+ * check for that. Also, prepared xacts are reported and awaited.
+ */
+
+ /* Finally wait for each such transaction to complete */
+ foreach(lc, holders)
+ {
+ VirtualTransactionId *lockholders = lfirst(lc);
+
+ while (VirtualTransactionIdIsValid(*lockholders))
+ {
+ /* If requested, publish who we're going to wait for. */
+ if (progress)
+ {
+ PGPROC *holder = BackendIdGetProc(lockholders->backendId);
+
+ if (holder)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_CURRENT_PID,
+ holder->pid);
+ }
+ VirtualXactLock(*lockholders, true);
+ lockholders++;
+
+ if (progress)
+ pgstat_progress_update_param(PROGRESS_WAITFOR_DONE, ++done);
+ }
+ }
+ if (progress)
+ {
+ const int index[] = {
+ PROGRESS_WAITFOR_TOTAL,
+ PROGRESS_WAITFOR_DONE,
+ PROGRESS_WAITFOR_CURRENT_PID
+ };
+ const int64 values[] = {
+ 0, 0, 0
+ };
+
+ pgstat_progress_update_multi_param(3, index, values);
+ }
+
+ list_free_deep(holders);
+}
+
+/*
+ * WaitForLockers
+ *
+ * Same as WaitForLockersMultiple, for a single lock tag.
+ */
+void
+WaitForLockers(LOCKTAG heaplocktag, LOCKMODE lockmode, bool progress)
+{
+ List *l;
+
+ l = list_make1(&heaplocktag);
+ WaitForLockersMultiple(l, lockmode, progress);
+ list_free(l);
+}
+
+
+/*
+ * LockDatabaseObject
+ *
+ * Obtain a lock on a general object of the current database. Don't use
+ * this for shared objects (such as tablespaces). It's unwise to apply it
+ * to relations, also, since a lock taken this way will NOT conflict with
+ * locks taken via LockRelation and friends.
+ */
+void
+LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockDatabaseObject
+ */
+void
+UnlockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObject
+ *
+ * Obtain a lock on a shared-across-databases object.
+ */
+void
+LockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, false, false);
+
+ /* Make sure syscaches are up-to-date with any changes we waited for */
+ AcceptInvalidationMessages();
+}
+
+/*
+ * UnlockSharedObject
+ */
+void
+UnlockSharedObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, false);
+}
+
+/*
+ * LockSharedObjectForSession
+ *
+ * Obtain a session-level lock on a shared-across-databases object.
+ * See LockRelationIdForSession for notes about session-level locks.
+ */
+void
+LockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockSharedObjectForSession
+ */
+void
+UnlockSharedObjectForSession(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_OBJECT(tag,
+ InvalidOid,
+ classid,
+ objid,
+ objsubid);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+/*
+ * LockApplyTransactionForSession
+ *
+ * Obtain a session-level lock on a transaction being applied on a logical
+ * replication subscriber. See LockRelationIdForSession for notes about
+ * session-level locks.
+ */
+void
+LockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_APPLY_TRANSACTION(tag,
+ MyDatabaseId,
+ suboid,
+ xid,
+ objid);
+
+ (void) LockAcquire(&tag, lockmode, true, false);
+}
+
+/*
+ * UnlockApplyTransactionForSession
+ */
+void
+UnlockApplyTransactionForSession(Oid suboid, TransactionId xid, uint16 objid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+
+ SET_LOCKTAG_APPLY_TRANSACTION(tag,
+ MyDatabaseId,
+ suboid,
+ xid,
+ objid);
+
+ LockRelease(&tag, lockmode, true);
+}
+
+/*
+ * Append a description of a lockable object to buf.
+ *
+ * Ideally we would print names for the numeric values, but that requires
+ * getting locks on system tables, which might cause problems since this is
+ * typically used to report deadlock situations.
+ */
+void
+DescribeLockTag(StringInfo buf, const LOCKTAG *tag)
+{
+ switch ((LockTagType) tag->locktag_type)
+ {
+ case LOCKTAG_RELATION:
+ appendStringInfo(buf,
+ _("relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_RELATION_EXTEND:
+ appendStringInfo(buf,
+ _("extension of relation %u of database %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_DATABASE_FROZEN_IDS:
+ appendStringInfo(buf,
+ _("pg_database.datfrozenxid of database %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_PAGE:
+ appendStringInfo(buf,
+ _("page %u of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TUPLE:
+ appendStringInfo(buf,
+ _("tuple (%u,%u) of relation %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field4,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_TRANSACTION:
+ appendStringInfo(buf,
+ _("transaction %u"),
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_VIRTUALTRANSACTION:
+ appendStringInfo(buf,
+ _("virtual transaction %d/%u"),
+ tag->locktag_field1,
+ tag->locktag_field2);
+ break;
+ case LOCKTAG_SPECULATIVE_TOKEN:
+ appendStringInfo(buf,
+ _("speculative token %u of transaction %u"),
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_OBJECT:
+ appendStringInfo(buf,
+ _("object %u of class %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ case LOCKTAG_USERLOCK:
+ /* reserved for old contrib code, now on pgfoundry */
+ appendStringInfo(buf,
+ _("user lock [%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3);
+ break;
+ case LOCKTAG_ADVISORY:
+ appendStringInfo(buf,
+ _("advisory lock [%u,%u,%u,%u]"),
+ tag->locktag_field1,
+ tag->locktag_field2,
+ tag->locktag_field3,
+ tag->locktag_field4);
+ break;
+ case LOCKTAG_APPLY_TRANSACTION:
+ appendStringInfo(buf,
+ _("remote transaction %u of subscription %u of database %u"),
+ tag->locktag_field3,
+ tag->locktag_field2,
+ tag->locktag_field1);
+ break;
+ default:
+ appendStringInfo(buf,
+ _("unrecognized locktag type %d"),
+ (int) tag->locktag_type);
+ break;
+ }
+}
+
+/*
+ * GetLockNameFromTagType
+ *
+ * Given locktag type, return the corresponding lock name.
+ */
+const char *
+GetLockNameFromTagType(uint16 locktag_type)
+{
+ if (locktag_type > LOCKTAG_LAST_TYPE)
+ return "???";
+ return LockTagTypeNames[locktag_type];
+}
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
new file mode 100644
index 0000000..ec6240f
--- /dev/null
+++ b/src/backend/storage/lmgr/lock.c
@@ -0,0 +1,4651 @@
+/*-------------------------------------------------------------------------
+ *
+ * lock.c
+ * POSTGRES primary lock mechanism
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lock.c
+ *
+ * NOTES
+ * A lock table is a shared memory hash table. When
+ * a process tries to acquire a lock of a type that conflicts
+ * with existing locks, it is put to sleep using the routines
+ * in storage/lmgr/proc.c.
+ *
+ * For the most part, this code should be invoked via lmgr.c
+ * or another lock-management module, not directly.
+ *
+ * Interface:
+ *
+ * InitLocks(), GetLocksMethodTable(), GetLockTagsMethodTable(),
+ * LockAcquire(), LockRelease(), LockReleaseAll(),
+ * LockCheckConflicts(), GrantLock()
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/resowner_private.h"
+
+
+/* This configuration variable is used to set the lock table size */
+int max_locks_per_xact; /* set by guc.c */
+
+#define NLOCKENTS() \
+ mul_size(max_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
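+
+/*
+ * For illustration: with the default max_locks_per_xact of 64 and, say,
+ * MaxBackends + max_prepared_xacts totalling 100, NLOCKENTS() sizes the
+ * shared lock table for 6400 lockable objects.
+ */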
+
+
+/*
+ * Data structures defining the semantics of the standard lock methods.
+ *
+ * The conflict table defines the semantics of the various lock modes.
+ */
+static const LOCKMASK LockConflicts[] = {
+ 0,
+
+ /* AccessShareLock */
+ LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowShareLock */
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* RowExclusiveLock */
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareUpdateExclusiveLock */
+ LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ShareRowExclusiveLock */
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* ExclusiveLock */
+ LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock),
+
+ /* AccessExclusiveLock */
+ LOCKBIT_ON(AccessShareLock) | LOCKBIT_ON(RowShareLock) |
+ LOCKBIT_ON(RowExclusiveLock) | LOCKBIT_ON(ShareUpdateExclusiveLock) |
+ LOCKBIT_ON(ShareLock) | LOCKBIT_ON(ShareRowExclusiveLock) |
+ LOCKBIT_ON(ExclusiveLock) | LOCKBIT_ON(AccessExclusiveLock)
+
+};
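+
+/*
+ * For illustration: by the table above, RowExclusiveLock (the mode taken by
+ * ordinary INSERT/UPDATE/DELETE) conflicts with ShareLock and stronger modes
+ * but not with itself, so concurrent writers never block each other at the
+ * relation level, while a CREATE INDEX (ShareLock) blocks and is blocked by
+ * them.
+ */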
+
+/* Names of lock modes, for debug printouts */
+static const char *const lock_mode_names[] =
+{
+ "INVALID",
+ "AccessShareLock",
+ "RowShareLock",
+ "RowExclusiveLock",
+ "ShareUpdateExclusiveLock",
+ "ShareLock",
+ "ShareRowExclusiveLock",
+ "ExclusiveLock",
+ "AccessExclusiveLock"
+};
+
+#ifndef LOCK_DEBUG
+static bool Dummy_trace = false;
+#endif
+
+static const LockMethodData default_lockmethod = {
+ MaxLockMode,
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_locks
+#else
+ &Dummy_trace
+#endif
+};
+
+static const LockMethodData user_lockmethod = {
+ MaxLockMode,
+ LockConflicts,
+ lock_mode_names,
+#ifdef LOCK_DEBUG
+ &Trace_userlocks
+#else
+ &Dummy_trace
+#endif
+};
+
+/*
+ * map from lock method id to the lock table data structures
+ */
+static const LockMethod LockMethods[] = {
+ NULL,
+ &default_lockmethod,
+ &user_lockmethod
+};
+
+
+/* Record that's written to 2PC state file when a lock is persisted */
+typedef struct TwoPhaseLockRecord
+{
+ LOCKTAG locktag;
+ LOCKMODE lockmode;
+} TwoPhaseLockRecord;
+
+
+/*
+ * Count of the number of fast path lock slots we believe to be used. This
+ * might be higher than the real number if another backend has transferred
+ * our locks to the primary lock table, but it can never be lower than the
+ * real value, since only we can acquire locks on our own behalf.
+ */
+static int FastPathLocalUseCount = 0;
+
+/*
+ * Flag to indicate if the relation extension lock is held by this backend.
+ * This flag is used to ensure that while holding the relation extension lock
+ * we don't try to acquire a heavyweight lock on any other object. This
+ * restriction implies that the relation extension lock won't ever participate
+ * in the deadlock cycle because we can never wait for any other heavyweight
+ * lock after acquiring this lock.
+ *
+ * Such a restriction is okay for relation extension locks as unlike other
+ * heavyweight locks these are not held till the transaction end. These are
+ * taken for a short duration to extend a particular relation and then
+ * released.
+ */
+static bool IsRelationExtensionLockHeld PG_USED_FOR_ASSERTS_ONLY = false;
+
+/* Macros for manipulating proc->fpLockBits */
+#define FAST_PATH_BITS_PER_SLOT 3
+#define FAST_PATH_LOCKNUMBER_OFFSET 1
+#define FAST_PATH_MASK ((1 << FAST_PATH_BITS_PER_SLOT) - 1)
+#define FAST_PATH_GET_BITS(proc, n) \
+ (((proc)->fpLockBits >> (FAST_PATH_BITS_PER_SLOT * n)) & FAST_PATH_MASK)
+#define FAST_PATH_BIT_POSITION(n, l) \
+ (AssertMacro((l) >= FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((l) < FAST_PATH_BITS_PER_SLOT+FAST_PATH_LOCKNUMBER_OFFSET), \
+ AssertMacro((n) < FP_LOCK_SLOTS_PER_BACKEND), \
+ ((l) - FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT * (n)))
+#define FAST_PATH_SET_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits |= UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)
+#define FAST_PATH_CLEAR_LOCKMODE(proc, n, l) \
+ (proc)->fpLockBits &= ~(UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l))
+#define FAST_PATH_CHECK_LOCKMODE(proc, n, l) \
+ ((proc)->fpLockBits & (UINT64CONST(1) << FAST_PATH_BIT_POSITION(n, l)))
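+
+/*
+ * For illustration (using the lock mode numbering from lockdefs.h, where
+ * RowExclusiveLock is 3): slot n = 2 occupies bits 6..8 of fpLockBits, and
+ * recording RowExclusiveLock there sets bit
+ * FAST_PATH_BIT_POSITION(2, 3) = 3 - 1 + 3*2 = 8, so
+ * FAST_PATH_GET_BITS(proc, 2) returns binary 100 (i.e. 4) when that is the
+ * only mode held in the slot.
+ */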
+
+/*
+ * The fast-path lock mechanism is concerned only with relation locks on
+ * unshared relations by backends bound to a database. The fast-path
+ * mechanism exists mostly to accelerate acquisition and release of locks
+ * that rarely conflict. Because ShareUpdateExclusiveLock is
+ * self-conflicting, it can't use the fast-path mechanism; but it also does
+ * not conflict with any of the locks that do, so we can ignore it completely.
+ */
+#define EligibleForRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 == MyDatabaseId && \
+ MyDatabaseId != InvalidOid && \
+ (mode) < ShareUpdateExclusiveLock)
+#define ConflictsWithRelationFastPath(locktag, mode) \
+ ((locktag)->locktag_lockmethodid == DEFAULT_LOCKMETHOD && \
+ (locktag)->locktag_type == LOCKTAG_RELATION && \
+ (locktag)->locktag_field1 != InvalidOid && \
+ (mode) > ShareUpdateExclusiveLock)
+
+static bool FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode);
+static bool FastPathTransferRelationLocks(LockMethod lockMethodTable,
+ const LOCKTAG *locktag, uint32 hashcode);
+static PROCLOCK *FastPathGetRelationLockEntry(LOCALLOCK *locallock);
+
+/*
+ * To make the fast-path lock mechanism work, we must have some way of
+ * preventing the use of the fast-path when a conflicting lock might be present.
+ * We partition the locktag space into FAST_PATH_STRONG_LOCK_HASH_PARTITIONS,
+ * and maintain an integer count of the number of "strong" lockers
+ * in each partition. When any "strong" lockers are present (which is
+ * hopefully not very often), the fast-path mechanism can't be used, and we
+ * must fall back to the slower method of pushing matching locks directly
+ * into the main lock tables.
+ *
+ * The deadlock detector does not know anything about the fast path mechanism,
+ * so any locks that might be involved in a deadlock must be transferred from
+ * the fast-path queues to the main lock table.
+ */
+
+#define FAST_PATH_STRONG_LOCK_HASH_BITS 10
+#define FAST_PATH_STRONG_LOCK_HASH_PARTITIONS \
+ (1 << FAST_PATH_STRONG_LOCK_HASH_BITS)
+#define FastPathStrongLockHashPartition(hashcode) \
+ ((hashcode) % FAST_PATH_STRONG_LOCK_HASH_PARTITIONS)
+
+typedef struct
+{
+ slock_t mutex;
+ uint32 count[FAST_PATH_STRONG_LOCK_HASH_PARTITIONS];
+} FastPathStrongRelationLockData;
+
+static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks;
+
+
+/*
+ * Pointers to hash tables containing lock state
+ *
+ * The LockMethodLockHash and LockMethodProcLockHash hash tables are in
+ * shared memory; LockMethodLocalHash is local to each backend.
+ */
+static HTAB *LockMethodLockHash;
+static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodLocalHash;
+
+
+/* private state for error cleanup */
+static LOCALLOCK *StrongLockInProgress;
+static LOCALLOCK *awaitedLock;
+static ResourceOwner awaitedOwner;
+
+
+#ifdef LOCK_DEBUG
+
+/*------
+ * The following configuration options are available for lock debugging:
+ *
+ * TRACE_LOCKS -- give a bunch of output about what's going on in this file
+ * TRACE_USERLOCKS -- same but for user locks
+ * TRACE_LOCK_OIDMIN-- do not trace locks for tables below this oid
+ * (use to avoid output on system tables)
+ * TRACE_LOCK_TABLE -- trace locks on this table (oid) unconditionally
+ * DEBUG_DEADLOCKS -- currently dumps locks at untimely occasions ;)
+ *
+ * Furthermore, in storage/lmgr/lwlock.c:
+ * TRACE_LWLOCKS -- trace lightweight locks (pretty useless)
+ *
+ * Define LOCK_DEBUG at compile time to get all these enabled.
+ * --------
+ */
+
+int Trace_lock_oidmin = FirstNormalObjectId;
+bool Trace_locks = false;
+bool Trace_userlocks = false;
+int Trace_lock_table = 0;
+bool Debug_deadlocks = false;
+
+
+inline static bool
+LOCK_DEBUG_ENABLED(const LOCKTAG *tag)
+{
+ return
+ (*(LockMethods[tag->locktag_lockmethodid]->trace_flag) &&
+ ((Oid) tag->locktag_field2 >= (Oid) Trace_lock_oidmin))
+ || (Trace_lock_table &&
+ (tag->locktag_field2 == Trace_lock_table));
+}
+
+
+inline static void
+LOCK_PRINT(const char *where, const LOCK *lock, LOCKMODE type)
+{
+ if (LOCK_DEBUG_ENABLED(&lock->tag))
+ elog(LOG,
+ "%s: lock(%p) id(%u,%u,%u,%u,%u,%u) grantMask(%x) "
+ "req(%d,%d,%d,%d,%d,%d,%d)=%d "
+ "grant(%d,%d,%d,%d,%d,%d,%d)=%d wait(%d) type(%s)",
+ where, lock,
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3, lock->tag.locktag_field4,
+ lock->tag.locktag_type, lock->tag.locktag_lockmethodid,
+ lock->grantMask,
+ lock->requested[1], lock->requested[2], lock->requested[3],
+ lock->requested[4], lock->requested[5], lock->requested[6],
+ lock->requested[7], lock->nRequested,
+ lock->granted[1], lock->granted[2], lock->granted[3],
+ lock->granted[4], lock->granted[5], lock->granted[6],
+ lock->granted[7], lock->nGranted,
+ dclist_count(&lock->waitProcs),
+ LockMethods[LOCK_LOCKMETHOD(*lock)]->lockModeNames[type]);
+}
+
+
+inline static void
+PROCLOCK_PRINT(const char *where, const PROCLOCK *proclockP)
+{
+ if (LOCK_DEBUG_ENABLED(&proclockP->tag.myLock->tag))
+ elog(LOG,
+ "%s: proclock(%p) lock(%p) method(%u) proc(%p) hold(%x)",
+ where, proclockP, proclockP->tag.myLock,
+ PROCLOCK_LOCKMETHOD(*(proclockP)),
+ proclockP->tag.myProc, (int) proclockP->holdMask);
+}
+#else /* not LOCK_DEBUG */
+
+#define LOCK_PRINT(where, lock, type) ((void) 0)
+#define PROCLOCK_PRINT(where, proclockP) ((void) 0)
+#endif /* not LOCK_DEBUG */
+
+
+static uint32 proclock_hash(const void *key, Size keysize);
+static void RemoveLocalLock(LOCALLOCK *locallock);
+static PROCLOCK *SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode);
+static void GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner);
+static void BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode);
+static void FinishStrongLockAcquire(void);
+static void WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner);
+static void ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock);
+static void LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent);
+static bool UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable);
+static void CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded);
+static void LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count);
+static void GetSingleProcBlockerStatusData(PGPROC *blocked_proc,
+ BlockedProcsData *data);
+
+
+/*
+ * InitLocks -- Initialize the lock manager's data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here, as well as a locallock hash table that will remain
+ * unused and empty in the postmaster itself. Backends inherit the pointers
+ * to the shared tables via fork(), and also inherit an image of the locallock
+ * hash table, which they proceed to use. In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables and to create its locallock hash table.
+ */
+void
+InitLocks(void)
+{
+ HASHCTL info;
+ long init_table_size,
+ max_table_size;
+ bool found;
+
+ /*
+ * Compute init/max size to request for lock hashtables. Note these
+ * calculations must agree with LockShmemSize!
+ */
+ max_table_size = NLOCKENTS();
+ init_table_size = max_table_size / 2;
+
+ /*
+ * Allocate hash table for LOCK structs. This stores per-locked-object
+ * information.
+ */
+ info.keysize = sizeof(LOCKTAG);
+ info.entrysize = sizeof(LOCK);
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodLockHash = ShmemInitHash("LOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+
+ /* Assume an average of 2 holders per lock */
+ max_table_size *= 2;
+ init_table_size *= 2;
+
+ /*
+ * Allocate hash table for PROCLOCK structs. This stores
+ * per-lock-per-holder information.
+ */
+ info.keysize = sizeof(PROCLOCKTAG);
+ info.entrysize = sizeof(PROCLOCK);
+ info.hash = proclock_hash;
+ info.num_partitions = NUM_LOCK_PARTITIONS;
+
+ LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
+ init_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+
+ /*
+ * Allocate fast-path structures.
+ */
+ FastPathStrongRelationLocks =
+ ShmemInitStruct("Fast Path Strong Relation Lock Data",
+ sizeof(FastPathStrongRelationLockData), &found);
+ if (!found)
+ SpinLockInit(&FastPathStrongRelationLocks->mutex);
+
+ /*
+ * Allocate non-shared hash table for LOCALLOCK structs. This stores lock
+ * counts and resource owner information.
+ *
+ * The non-shared table could already exist in this process (this occurs
+ * when the postmaster is recreating shared memory after a backend crash).
+ * If so, delete and recreate it. (We could simply leave it, since it
+ * ought to be empty in the postmaster, but for safety let's zap it.)
+ */
+ if (LockMethodLocalHash)
+ hash_destroy(LockMethodLocalHash);
+
+ info.keysize = sizeof(LOCALLOCKTAG);
+ info.entrysize = sizeof(LOCALLOCK);
+
+ LockMethodLocalHash = hash_create("LOCALLOCK hash",
+ 16,
+ &info,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+
+/*
+ * Fetch the lock method table associated with a given lock
+ */
+LockMethod
+GetLocksMethodTable(const LOCK *lock)
+{
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*lock);
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+/*
+ * Fetch the lock method table associated with a given locktag
+ */
+LockMethod
+GetLockTagsMethodTable(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = (LOCKMETHODID) locktag->locktag_lockmethodid;
+
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+ return LockMethods[lockmethodid];
+}
+
+
+/*
+ * Compute the hash code associated with a LOCKTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+uint32
+LockTagHashCode(const LOCKTAG *locktag)
+{
+ return get_hash_value(LockMethodLockHash, (const void *) locktag);
+}
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * LOCK and PROCLOCK hash tables, we have to make sure that PROCLOCKs
+ * fall into the same partition number as their associated LOCKs.
+ * dynahash.c expects the partition number to be the low-order bits of
+ * the hash code, and therefore a PROCLOCKTAG's hash code must have the
+ * same low-order bits as the associated LOCKTAG's hash code. We achieve
+ * this with this specialized hash function.
+ */
+static uint32
+proclock_hash(const void *key, Size keysize)
+{
+ const PROCLOCKTAG *proclocktag = (const PROCLOCKTAG *) key;
+ uint32 lockhash;
+ Datum procptr;
+
+ Assert(keysize == sizeof(PROCLOCKTAG));
+
+ /* Look into the associated LOCK object, and compute its hash code */
+ lockhash = LockTagHashCode(&proclocktag->myLock->tag);
+
+ /*
+ * To make the hash code also depend on the PGPROC, we xor the proc
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
+
+/*
+ * Compute the hash code associated with a PROCLOCKTAG, given the hashcode
+ * for its underlying LOCK.
+ *
+ * We use this just to avoid redundant calls of LockTagHashCode().
+ */
+static inline uint32
+ProcLockHashCode(const PROCLOCKTAG *proclocktag, uint32 hashcode)
+{
+ uint32 lockhash = hashcode;
+ Datum procptr;
+
+ /*
+ * This must match proclock_hash()!
+ */
+ procptr = PointerGetDatum(proclocktag->myProc);
+ lockhash ^= ((uint32) procptr) << LOG2_NUM_LOCK_PARTITIONS;
+
+ return lockhash;
+}
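+
+/*
+ * For illustration: with the default LOG2_NUM_LOCK_PARTITIONS = 4, the
+ * partition number is just the low four bits of the hash code, and the
+ * shift above pushes the PGPROC address past those bits, so a PROCLOCK is
+ * guaranteed to land in the same partition as its LOCK. A minimal sketch
+ * of that invariant (assuming the LockHashPartition() macro from lock.h):
+ *
+ * uint32 lockhash = LockTagHashCode(&proclocktag.myLock->tag);
+ * uint32 plockhash = ProcLockHashCode(&proclocktag, lockhash);
+ *
+ * Assert(LockHashPartition(plockhash) == LockHashPartition(lockhash));
+ */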
+
+/*
+ * Given two lock modes, return whether they would conflict.
+ */
+bool
+DoLockModesConflict(LOCKMODE mode1, LOCKMODE mode2)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+
+ if (lockMethodTable->conflictTab[mode1] & LOCKBIT_ON(mode2))
+ return true;
+
+ return false;
+}
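+
+/*
+ * For example, under the standard conflict table for the default lock
+ * method, RowExclusiveLock conflicts with ShareLock but not with
+ * AccessShareLock, so:
+ *
+ * DoLockModesConflict(RowExclusiveLock, ShareLock) == true
+ * DoLockModesConflict(RowExclusiveLock, AccessShareLock) == false
+ */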
+
+/*
+ * LockHeldByMe -- test whether lock 'locktag' is held with mode 'lockmode'
+ * by the current transaction
+ */
+bool
+LockHeldByMe(const LOCKTAG *locktag, LOCKMODE lockmode)
+{
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+
+ /*
+ * See if there is a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ &localtag,
+ HASH_FIND, NULL);
+
+ return (locallock && locallock->nLocks > 0);
+}
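+
+/*
+ * A minimal usage sketch (assuming the SET_LOCKTAG_RELATION macro from
+ * lock.h and a relation OID 'relid' in the current database); note that
+ * this consults only the backend-local LOCALLOCK table, so it never needs
+ * a partition lock:
+ *
+ * LOCKTAG tag;
+ *
+ * SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ * if (LockHeldByMe(&tag, AccessExclusiveLock))
+ * ... we already hold AccessExclusiveLock on the relation ...
+ */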
+
+#ifdef USE_ASSERT_CHECKING
+/*
+ * GetLockMethodLocalHash -- return the hash of local locks, for modules that
+ * evaluate assertions based on all locks held.
+ */
+HTAB *
+GetLockMethodLocalHash(void)
+{
+ return LockMethodLocalHash;
+}
+#endif
+
+/*
+ * LockHasWaiters -- look up 'locktag' and check if releasing this
+ * lock would wake up other processes waiting for it.
+ */
+bool
+LockHasWaiters(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool hasWaiters = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockHasWaiters: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * let the caller print its own error message, too. Do not ereport(ERROR).
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Check the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ /*
+ * We don't need to re-find the lock or proclock, since we kept their
+ * addresses in the locallock table, and they couldn't have been removed
+ * while we were holding a lock on them.
+ */
+ lock = locallock->lock;
+ LOCK_PRINT("LockHasWaiters: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockHasWaiters: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockHasWaiters: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the checking.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & lock->waitMask) != 0)
+ hasWaiters = true;
+
+ LWLockRelease(partitionLock);
+
+ return hasWaiters;
+}
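+
+/*
+ * Usage sketch: a backend holding AccessExclusiveLock on a relation can
+ * poll this to decide whether to give the lock up early (roughly what the
+ * LockHasWaitersRelation() wrapper in lmgr.c exposes):
+ *
+ * LOCKTAG tag;
+ *
+ * SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ * if (LockHasWaiters(&tag, AccessExclusiveLock, false))
+ * ... someone is queued behind us; consider releasing early ...
+ */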
+
+/*
+ * LockAcquire -- Check for lock conflicts, sleep if conflict found,
+ * set lock if/when no conflicts.
+ *
+ * Inputs:
+ * locktag: unique identifier for the lockable object
+ * lockmode: lock mode to acquire
+ * sessionLock: if true, acquire lock for session not current transaction
+ * dontWait: if true, don't wait to acquire lock
+ *
+ * Returns one of:
+ * LOCKACQUIRE_NOT_AVAIL lock not available, and dontWait=true
+ * LOCKACQUIRE_OK lock successfully acquired
+ * LOCKACQUIRE_ALREADY_HELD incremented count for lock already held
+ * LOCKACQUIRE_ALREADY_CLEAR incremented count for lock already clear
+ *
+ * In the normal case where dontWait=false and the caller doesn't need to
+ * distinguish a freshly acquired lock from one already taken earlier in
+ * this same transaction, there is no need to examine the return value.
+ *
+ * Side Effects: The lock is acquired and recorded in lock tables.
+ *
+ * NOTE: if we wait for the lock, there is no way to abort the wait
+ * short of aborting the transaction.
+ */
+LockAcquireResult
+LockAcquire(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait)
+{
+ return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait,
+ true, NULL);
+}
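+
+/*
+ * A minimal sketch of conditional acquisition through this interface,
+ * assuming the SET_LOCKTAG_RELATION macro and a relation OID 'relid':
+ *
+ * LOCKTAG tag;
+ *
+ * SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ * if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+ * elog(LOG, "relation %u is locked by someone else", relid);
+ *
+ * With dontWait = false the call either returns one of the success codes
+ * or blocks in WaitOnLock() until the lock can be granted (or the
+ * transaction is aborted).
+ */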
+
+/*
+ * LockAcquireExtended - allows us to specify additional options
+ *
+ * reportMemoryError specifies whether a lock request that fills the lock
+ * table should generate an ERROR or not. Passing "false" allows the caller
+ * to attempt to recover from lock-table-full situations, perhaps by forcibly
+ * canceling other lock holders and then retrying. Note, however, that the
+ * return code for that is LOCKACQUIRE_NOT_AVAIL, so that it's unsafe to use
+ * in combination with dontWait = true, as the cause of failure couldn't be
+ * distinguished.
+ *
+ * If locallockp isn't NULL, *locallockp receives a pointer to the LOCALLOCK
+ * table entry if a lock is successfully acquired, or NULL if not.
+ */
+LockAcquireResult
+LockAcquireExtended(const LOCKTAG *locktag,
+ LOCKMODE lockmode,
+ bool sessionLock,
+ bool dontWait,
+ bool reportMemoryError,
+ LOCALLOCK **locallockp)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ bool found;
+ ResourceOwner owner;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ bool found_conflict;
+ bool log_lock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ if (RecoveryInProgress() && !InRecovery &&
+ (locktag->locktag_type == LOCKTAG_OBJECT ||
+ locktag->locktag_type == LOCKTAG_RELATION) &&
+ lockmode > RowExclusiveLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("cannot acquire lock mode %s on database objects while recovery is in progress",
+ lockMethodTable->lockModeNames[lockmode]),
+ errhint("Only RowExclusiveLock or less can be acquired on database objects during recovery.")));
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockAcquire: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /*
+ * Find or create a LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ &localtag,
+ HASH_ENTER, &found);
+
+ /*
+ * if it's a new locallock object, initialize it
+ */
+ if (!found)
+ {
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ locallock->hashcode = LockTagHashCode(&(localtag.lock));
+ locallock->nLocks = 0;
+ locallock->holdsStrongLockCount = false;
+ locallock->lockCleared = false;
+ locallock->numLockOwners = 0;
+ locallock->maxLockOwners = 8;
+ locallock->lockOwners = NULL; /* in case next line fails */
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ MemoryContextAlloc(TopMemoryContext,
+ locallock->maxLockOwners * sizeof(LOCALLOCKOWNER));
+ }
+ else
+ {
+ /* Make sure there will be room to remember the lock */
+ if (locallock->numLockOwners >= locallock->maxLockOwners)
+ {
+ int newsize = locallock->maxLockOwners * 2;
+
+ locallock->lockOwners = (LOCALLOCKOWNER *)
+ repalloc(locallock->lockOwners,
+ newsize * sizeof(LOCALLOCKOWNER));
+ locallock->maxLockOwners = newsize;
+ }
+ }
+ hashcode = locallock->hashcode;
+
+ if (locallockp)
+ *locallockp = locallock;
+
+ /*
+ * If we already hold the lock, we can just increase the count locally.
+ *
+ * If lockCleared is already set, caller need not worry about absorbing
+ * sinval messages related to the lock's object.
+ */
+ if (locallock->nLocks > 0)
+ {
+ GrantLockLocal(locallock, owner);
+ if (locallock->lockCleared)
+ return LOCKACQUIRE_ALREADY_CLEAR;
+ else
+ return LOCKACQUIRE_ALREADY_HELD;
+ }
+
+ /*
+ * We don't acquire any other heavyweight lock while holding the relation
+ * extension lock. We do allow acquiring the same relation extension
+ * lock more than once, but that case won't reach here.
+ */
+ Assert(!IsRelationExtensionLockHeld);
+
+ /*
+ * Prepare to emit a WAL record if acquisition of this lock needs to be
+ * replayed in a standby server.
+ *
+ * Here we prepare to log; after the lock is acquired we'll issue the log
+ * record. This arrangement simplifies error recovery in case the
+ * preparation step fails.
+ *
+ * Only AccessExclusiveLocks can conflict with lock types that read-only
+ * transactions can acquire in a standby server. Make sure this definition
+ * matches the one in GetRunningTransactionLocks().
+ */
+ if (lockmode >= AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION &&
+ !RecoveryInProgress() &&
+ XLogStandbyInfoActive())
+ {
+ LogAccessExclusiveLockPrepare();
+ log_lock = true;
+ }
+
+ /*
+ * Attempt to take lock via fast path, if eligible. But if we remember
+ * having filled up the fast path array, we don't attempt to make any
+ * further use of it until we release some locks. It's possible that some
+ * other backend has transferred some of those locks to the shared hash
+ * table, leaving space free, but it's not worth acquiring the LWLock just
+ * to check. It's also possible that we're acquiring a second or third
+ * lock type on a relation we have already locked using the fast-path, but
+ * for now we don't worry about that case either.
+ */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+ bool acquired;
+
+ /*
+ * LWLockAcquire acts as a memory sequencing point, so it's safe to
+ * assume that any strong locker whose increment to
+ * FastPathStrongRelationLocks->counts becomes visible after we test
+ * it has yet to begin to transfer fast-path locks.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ if (FastPathStrongRelationLocks->count[fasthashcode] != 0)
+ acquired = false;
+ else
+ acquired = FastPathGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (acquired)
+ {
+ /*
+ * The locallock might contain stale pointers to some old shared
+ * objects; we MUST reset these to null before considering the
+ * lock to be acquired via fast-path.
+ */
+ locallock->lock = NULL;
+ locallock->proclock = NULL;
+ GrantLockLocal(locallock, owner);
+ return LOCKACQUIRE_OK;
+ }
+ }
+
+ /*
+ * If this lock could potentially have been taken via the fast-path by
+ * some other backend, we must (temporarily) disable further use of the
+ * fast-path for this lock tag, and migrate any locks already taken via
+ * this method to the main lock table.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ BeginStrongLockAcquire(locallock, fasthashcode);
+ if (!FastPathTransferRelationLocks(lockMethodTable, locktag,
+ hashcode))
+ {
+ AbortStrongLockAcquire();
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ }
+
+ /*
+ * We didn't find the lock in our LOCALLOCK table, and we didn't manage to
+ * take it via the fast-path, either, so we've got to mess with the shared
+ * lock table.
+ */
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create lock and proclock entries with this tag
+ *
+ * Note: if the locallock object already existed, it might have a pointer
+ * to the lock already ... but we should not assume that that pointer is
+ * valid, since a lock object with zero hold and request counts can go
+ * away anytime. So we have to use SetupLockInTable() to recompute the
+ * lock and proclock pointers, even if they're already set.
+ */
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ AbortStrongLockAcquire();
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ if (reportMemoryError)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ else
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+ locallock->proclock = proclock;
+ lock = proclock->tag.myLock;
+ locallock->lock = lock;
+
+ /*
+ * If lock requested conflicts with locks requested by waiters, must join
+ * wait queue. Otherwise, check for conflict with already-held locks.
+ * (That's done last because it's the most complex check.)
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ found_conflict = true;
+ else
+ found_conflict = LockCheckConflicts(lockMethodTable, lockmode,
+ lock, proclock);
+
+ if (!found_conflict)
+ {
+ /* No conflict with held or previously requested locks */
+ GrantLock(lock, proclock, lockmode);
+ GrantLockLocal(locallock, owner);
+ }
+ else
+ {
+ /*
+ * We can't acquire the lock immediately. If caller specified no
+ * blocking, remove useless table entries and return
+ * LOCKACQUIRE_NOT_AVAIL without waiting.
+ */
+ if (dontWait)
+ {
+ AbortStrongLockAcquire();
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ dlist_delete(&proclock->lockLink);
+ dlist_delete(&proclock->procLink);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+ else
+ PROCLOCK_PRINT("LockAcquire: NOWAIT", proclock);
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ LOCK_PRINT("LockAcquire: conditional lock failed", lock, lockmode);
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LWLockRelease(partitionLock);
+ if (locallock->nLocks == 0)
+ RemoveLocalLock(locallock);
+ if (locallockp)
+ *locallockp = NULL;
+ return LOCKACQUIRE_NOT_AVAIL;
+ }
+
+ /*
+ * Set bitmask of locks this process already holds on this object.
+ */
+ MyProc->heldLocks = proclock->holdMask;
+
+ /*
+ * Sleep till someone wakes me up.
+ */
+
+ TRACE_POSTGRESQL_LOCK_WAIT_START(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ WaitOnLock(locallock, owner);
+
+ TRACE_POSTGRESQL_LOCK_WAIT_DONE(locktag->locktag_field1,
+ locktag->locktag_field2,
+ locktag->locktag_field3,
+ locktag->locktag_field4,
+ locktag->locktag_type,
+ lockmode);
+
+ /*
+ * NOTE: do not do any material change of state between here and
+ * return. All required changes in locktable state must have been
+ * done when the lock was granted to us --- see notes in WaitOnLock.
+ */
+
+ /*
+ * Check the proclock entry status, in case something in the ipc
+ * communication doesn't work correctly.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ AbortStrongLockAcquire();
+ PROCLOCK_PRINT("LockAcquire: INCONSISTENT", proclock);
+ LOCK_PRINT("LockAcquire: INCONSISTENT", lock, lockmode);
+ /* Should we retry? */
+ LWLockRelease(partitionLock);
+ elog(ERROR, "LockAcquire failed");
+ }
+ PROCLOCK_PRINT("LockAcquire: granted", proclock);
+ LOCK_PRINT("LockAcquire: granted", lock, lockmode);
+ }
+
+ /*
+ * Lock state is fully up-to-date now; if we error out after this, no
+ * special error cleanup is required.
+ */
+ FinishStrongLockAcquire();
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Emit a WAL record if acquisition of this lock needs to be replayed in a
+ * standby server.
+ */
+ if (log_lock)
+ {
+ /*
+ * Decode the locktag back to the original values, to avoid sending
+ * lots of empty bytes with every message. See lock.h to check how a
+ * locktag is defined for LOCKTAG_RELATION
+ */
+ LogAccessExclusiveLock(locktag->locktag_field1,
+ locktag->locktag_field2);
+ }
+
+ return LOCKACQUIRE_OK;
+}
+
+/*
+ * Find or create LOCK and PROCLOCK objects as needed for a new lock
+ * request.
+ *
+ * Returns the PROCLOCK object, or NULL if we failed to create the objects
+ * for lack of shared memory.
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static PROCLOCK *
+SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
+ const LOCKTAG *locktag, uint32 hashcode, LOCKMODE lockmode)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+ bool found;
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ return NULL;
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ dlist_init(&lock->procLocks);
+ dclist_init(&lock->waitProcs);
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("LockAcquire: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("LockAcquire: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(dlist_is_empty(&(lock->procLocks)));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ return NULL;
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ uint32 partition = LockHashPartition(hashcode);
+
+ /*
+ * It might seem unsafe to access proclock->groupLeader without a
+ * lock, but it's not really. Either we are initializing a proclock
+ * on our own behalf, in which case our group leader isn't changing
+ * because the group leader for a process can only ever be changed by
+ * the process itself; or else we are transferring a fast-path lock to
+ * the main lock table, in which case that process can't change its
+ * lock group leader without first releasing all of its locks (and in
+ * particular the one we are currently transferring).
+ */
+ proclock->groupLeader = proc->lockGroupLeader != NULL ?
+ proc->lockGroupLeader : proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ dlist_push_tail(&lock->procLocks, &proclock->lockLink);
+ dlist_push_tail(&proc->myProcLocks[partition], &proclock->procLink);
+ PROCLOCK_PRINT("LockAcquire: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("LockAcquire: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+#ifdef CHECK_DEADLOCK_RISK
+
+ /*
+ * Issue warning if we already hold a lower-level lock on this object
+ * and do not hold a lock of the requested level or higher. This
+ * indicates a deadlock-prone coding practice (eg, we'd have a
+ * deadlock if another backend were following the same code path at
+ * about the same time).
+ *
+ * This is not enabled by default, because it may generate log entries
+ * about user-level coding practices that are in fact safe in context.
+ * It can be enabled to help find system-level problems.
+ *
+ * XXX Doing numeric comparison on the lockmodes is a hack; it'd be
+ * better to use a table. For now, though, this works.
+ */
+ {
+ int i;
+
+ for (i = lockMethodTable->numLockModes; i > 0; i--)
+ {
+ if (proclock->holdMask & LOCKBIT_ON(i))
+ {
+ if (i >= (int) lockmode)
+ break; /* safe: we have a lock >= req level */
+ elog(LOG, "deadlock risk: raising lock level"
+ " from %s to %s on object %u/%u/%u",
+ lockMethodTable->lockModeNames[i],
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+ break;
+ }
+ }
+ }
+#endif /* CHECK_DEADLOCK_RISK */
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ * The other counts don't increment till we get the lock.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock; else locallock table is
+ * broken.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ return proclock;
+}
+
+/*
+ * Check and set/reset the flag that we hold the relation extension lock.
+ *
+ * It is the caller's responsibility to call this function after
+ * acquiring or releasing the relation extension lock.
+ *
+ * Pass acquired as true if the lock was acquired, false otherwise.
+ */
+static inline void
+CheckAndSetLockHeld(LOCALLOCK *locallock, bool acquired)
+{
+#ifdef USE_ASSERT_CHECKING
+ if (LOCALLOCK_LOCKTAG(*locallock) == LOCKTAG_RELATION_EXTEND)
+ IsRelationExtensionLockHeld = acquired;
+#endif
+}
+
+/*
+ * Subroutine to free a locallock entry
+ */
+static void
+RemoveLocalLock(LOCALLOCK *locallock)
+{
+ int i;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (locallock->lockOwners[i].owner != NULL)
+ ResourceOwnerForgetLock(locallock->lockOwners[i].owner, locallock);
+ }
+ locallock->numLockOwners = 0;
+ if (locallock->lockOwners != NULL)
+ pfree(locallock->lockOwners);
+ locallock->lockOwners = NULL;
+
+ if (locallock->holdsStrongLockCount)
+ {
+ uint32 fasthashcode;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ if (!hash_search(LockMethodLocalHash,
+ &(locallock->tag),
+ HASH_REMOVE, NULL))
+ elog(WARNING, "locallock table corrupted");
+
+ /*
+ * Indicate that the lock is released for certain types of locks
+ */
+ CheckAndSetLockHeld(locallock, false);
+}
+
+/*
+ * LockCheckConflicts -- test whether requested lock conflicts
+ * with those already granted
+ *
+ * Returns true if conflict, false if no conflict.
+ *
+ * NOTES:
+ * Here's what makes this complicated: one process's locks don't
+ * conflict with one another, no matter what purpose they are held for
+ * (eg, session and transaction locks do not conflict). Nor do the locks
+ * of one process in a lock group conflict with those of another process in
+ * the same group. So, we must subtract off these locks when determining
+ * whether the requested new lock conflicts with those already held.
+ */
+bool
+LockCheckConflicts(LockMethod lockMethodTable,
+ LOCKMODE lockmode,
+ LOCK *lock,
+ PROCLOCK *proclock)
+{
+ int numLockModes = lockMethodTable->numLockModes;
+ LOCKMASK myLocks;
+ int conflictMask = lockMethodTable->conflictTab[lockmode];
+ int conflictsRemaining[MAX_LOCKMODES];
+ int totalConflictsRemaining = 0;
+ dlist_iter proclock_iter;
+ int i;
+
+ /*
+ * first check for global conflicts: If no locks conflict with my request,
+ * then I get the lock.
+ *
+ * Checking for conflict: lock->grantMask represents the types of
+ * currently held locks. conflictTable[lockmode] has a bit set for each
+ * type of lock that conflicts with request. Bitwise compare tells if
+ * there is a conflict.
+ */
+ if (!(conflictMask & lock->grantMask))
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: no conflict", proclock);
+ return false;
+ }
+
+ /*
+ * Rats. Something conflicts. But it could still be my own lock, or a
+ * lock held by another member of my locking group. First, figure out how
+ * many conflicts remain after subtracting out any locks I hold myself.
+ */
+ myLocks = proclock->holdMask;
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((conflictMask & LOCKBIT_ON(i)) == 0)
+ {
+ conflictsRemaining[i] = 0;
+ continue;
+ }
+ conflictsRemaining[i] = lock->granted[i];
+ if (myLocks & LOCKBIT_ON(i))
+ --conflictsRemaining[i];
+ totalConflictsRemaining += conflictsRemaining[i];
+ }
+
+ /* If no conflicts remain, we get the lock. */
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (simple)", proclock);
+ return false;
+ }
+
+ /* If no group locking, it's definitely a conflict. */
+ if (proclock->groupLeader == MyProc && MyProc->lockGroupLeader == NULL)
+ {
+ Assert(proclock->tag.myProc == MyProc);
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (simple)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * The relation extension lock conflicts even between members of the same
+ * lock group.
+ */
+ if (LOCK_LOCKTAG(*lock) == LOCKTAG_RELATION_EXTEND)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)",
+ proclock);
+ return true;
+ }
+
+ /*
+ * Locks held in conflicting modes by members of our own lock group are
+ * not real conflicts; we can subtract those out and see if we still have
+ * a conflict. This is O(N) in the number of processes holding or
+ * awaiting locks on this object. We could improve that by making the
+ * shared memory state more complex (and larger) but it doesn't seem worth
+ * it.
+ */
+ dlist_foreach(proclock_iter, &lock->procLocks)
+ {
+ PROCLOCK *otherproclock =
+ dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+
+ if (proclock != otherproclock &&
+ proclock->groupLeader == otherproclock->groupLeader &&
+ (otherproclock->holdMask & conflictMask) != 0)
+ {
+ int intersectMask = otherproclock->holdMask & conflictMask;
+
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if ((intersectMask & LOCKBIT_ON(i)) != 0)
+ {
+ if (conflictsRemaining[i] <= 0)
+ elog(PANIC, "proclocks held do not match lock");
+ conflictsRemaining[i]--;
+ totalConflictsRemaining--;
+ }
+ }
+
+ if (totalConflictsRemaining == 0)
+ {
+ PROCLOCK_PRINT("LockCheckConflicts: resolved (group)",
+ proclock);
+ return false;
+ }
+ }
+ }
+
+ /* Nope, it's a real conflict. */
+ PROCLOCK_PRINT("LockCheckConflicts: conflicting (group)", proclock);
+ return true;
+}
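+
+/*
+ * Worked example of the group subtraction above: suppose we request
+ * ExclusiveLock and lock->granted[] records exactly one conflicting lock,
+ * a RowExclusiveLock. If that RowExclusiveLock belongs to a PROCLOCK whose
+ * groupLeader matches ours, the loop decrements conflictsRemaining for
+ * that mode to zero, totalConflictsRemaining reaches zero, and the request
+ * is treated as conflict-free; if it is held by an unrelated backend, no
+ * subtraction applies and the caller must wait.
+ */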
+
+/*
+ * GrantLock -- update the lock and proclock data structures to show
+ * the lock request has been granted.
+ *
+ * NOTE: if proc was blocked, it also needs to be removed from the wait list
+ * and have its waitLock/waitProcLock fields cleared. That's not done here.
+ *
+ * NOTE: the lock grant also has to be recorded in the associated LOCALLOCK
+ * table entry; but since we may be awaking some other process, we can't do
+ * that here; it's done by GrantLockLocal, instead.
+ */
+void
+GrantLock(LOCK *lock, PROCLOCK *proclock, LOCKMODE lockmode)
+{
+ lock->nGranted++;
+ lock->granted[lockmode]++;
+ lock->grantMask |= LOCKBIT_ON(lockmode);
+ if (lock->granted[lockmode] == lock->requested[lockmode])
+ lock->waitMask &= LOCKBIT_OFF(lockmode);
+ proclock->holdMask |= LOCKBIT_ON(lockmode);
+ LOCK_PRINT("GrantLock", lock, lockmode);
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+}
+
+/*
+ * UnGrantLock -- opposite of GrantLock.
+ *
+ * Updates the lock and proclock data structures to show that the lock
+ * is no longer held nor requested by the current holder.
+ *
+ * Returns true if there were any waiters waiting on the lock that
+ * should now be woken up with ProcLockWakeup.
+ */
+static bool
+UnGrantLock(LOCK *lock, LOCKMODE lockmode,
+ PROCLOCK *proclock, LockMethod lockMethodTable)
+{
+ bool wakeupNeeded = false;
+
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+ Assert((lock->nGranted > 0) && (lock->granted[lockmode] > 0));
+ Assert(lock->nGranted <= lock->nRequested);
+
+ /*
+ * fix the general lock stats
+ */
+ lock->nRequested--;
+ lock->requested[lockmode]--;
+ lock->nGranted--;
+ lock->granted[lockmode]--;
+
+ if (lock->granted[lockmode] == 0)
+ {
+ /* change the conflict mask. No more of this lock type. */
+ lock->grantMask &= LOCKBIT_OFF(lockmode);
+ }
+
+ LOCK_PRINT("UnGrantLock: updated", lock, lockmode);
+
+ /*
+ * We need only run ProcLockWakeup if the released lock conflicts with at
+ * least one of the lock types requested by waiter(s). Otherwise whatever
+ * conflict made them wait must still exist. NOTE: before MVCC, we could
+ * skip wakeup if lock->granted[lockmode] was still positive. But that's
+ * not true anymore, because the remaining granted locks might belong to
+ * some waiter, who could now be awakened because he doesn't conflict with
+ * his own locks.
+ */
+ if (lockMethodTable->conflictTab[lockmode] & lock->waitMask)
+ wakeupNeeded = true;
+
+ /*
+ * Now fix the per-proclock state.
+ */
+ proclock->holdMask &= LOCKBIT_OFF(lockmode);
+ PROCLOCK_PRINT("UnGrantLock: updated", proclock);
+
+ return wakeupNeeded;
+}
+
+/*
+ * CleanUpLock -- clean up after releasing a lock. We garbage-collect the
+ * proclock and lock objects if possible, and call ProcLockWakeup if there
+ * are remaining requests and the caller says it's OK. (Normally, this
+ * should be called after UnGrantLock, and wakeupNeeded is the result from
+ * UnGrantLock.)
+ *
+ * The appropriate partition lock must be held at entry, and will be
+ * held at exit.
+ */
+static void
+CleanUpLock(LOCK *lock, PROCLOCK *proclock,
+ LockMethod lockMethodTable, uint32 hashcode,
+ bool wakeupNeeded)
+{
+ /*
+ * If this was my last hold on this lock, delete my entry in the proclock
+ * table.
+ */
+ if (proclock->holdMask == 0)
+ {
+ uint32 proclock_hashcode;
+
+ PROCLOCK_PRINT("CleanUpLock: deleting", proclock);
+ dlist_delete(&proclock->lockLink);
+ dlist_delete(&proclock->procLink);
+ proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
+ if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ &(proclock->tag),
+ proclock_hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "proclock table corrupted");
+ }
+
+ if (lock->nRequested == 0)
+ {
+ /*
+ * The caller just released the last lock, so garbage-collect the lock
+ * object.
+ */
+ LOCK_PRINT("CleanUpLock: deleting", lock, 0);
+ Assert(dlist_is_empty(&lock->procLocks));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ else if (wakeupNeeded)
+ {
+ /* There are waiters on this lock, so wake them up. */
+ ProcLockWakeup(lockMethodTable, lock);
+ }
+}
+
+/*
+ * GrantLockLocal -- update the locallock data structures to show
+ * the lock request has been granted.
+ *
+ * We expect that LockAcquire made sure there is room to add a new
+ * ResourceOwner entry.
+ */
+static void
+GrantLockLocal(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ int i;
+
+ Assert(locallock->numLockOwners < locallock->maxLockOwners);
+ /* Count the total */
+ locallock->nLocks++;
+ /* Count the per-owner lock */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ lockOwners[i].nLocks++;
+ return;
+ }
+ }
+ lockOwners[i].owner = owner;
+ lockOwners[i].nLocks = 1;
+ locallock->numLockOwners++;
+ if (owner != NULL)
+ ResourceOwnerRememberLock(owner, locallock);
+
+ /* Indicate that the lock is acquired for certain types of locks. */
+ CheckAndSetLockHeld(locallock, true);
+}
+
+/*
+ * BeginStrongLockAcquire - inhibit use of fastpath for a given LOCALLOCK,
+ * and arrange for error cleanup if it fails
+ */
+static void
+BeginStrongLockAcquire(LOCALLOCK *locallock, uint32 fasthashcode)
+{
+ Assert(StrongLockInProgress == NULL);
+ Assert(locallock->holdsStrongLockCount == false);
+
+ /*
+ * Adding to a memory location is not atomic, so we take a spinlock to
+ * ensure we don't collide with someone else trying to bump the count at
+ * the same time.
+ *
+ * XXX: It might be worth considering using an atomic fetch-and-add
+ * instruction here, on architectures where that is supported.
+ */
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ locallock->holdsStrongLockCount = true;
+ StrongLockInProgress = locallock;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * FinishStrongLockAcquire - cancel pending cleanup for a strong lock
+ * acquisition once it's no longer needed
+ */
+static void
+FinishStrongLockAcquire(void)
+{
+ StrongLockInProgress = NULL;
+}
+
+/*
+ * AbortStrongLockAcquire - undo strong lock state changes performed by
+ * BeginStrongLockAcquire.
+ */
+void
+AbortStrongLockAcquire(void)
+{
+ uint32 fasthashcode;
+ LOCALLOCK *locallock = StrongLockInProgress;
+
+ if (locallock == NULL)
+ return;
+
+ fasthashcode = FastPathStrongLockHashPartition(locallock->hashcode);
+ Assert(locallock->holdsStrongLockCount == true);
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ locallock->holdsStrongLockCount = false;
+ StrongLockInProgress = NULL;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+}
+
+/*
+ * GrantAwaitedLock -- call GrantLockLocal for the lock we are doing
+ * WaitOnLock on.
+ *
+ * proc.c needs this for the case where we are booted off the lock by
+ * timeout, but discover that someone granted us the lock anyway.
+ *
+ * We could just export GrantLockLocal, but that would require including
+ * resowner.h in lock.h, which creates circularity.
+ */
+void
+GrantAwaitedLock(void)
+{
+ GrantLockLocal(awaitedLock, awaitedOwner);
+}
+
+/*
+ * MarkLockClear -- mark an acquired lock as "clear"
+ *
+ * This means that we know we have absorbed all sinval messages that other
+ * sessions generated before we acquired this lock, and so we can confidently
+ * assume we know about any catalog changes protected by this lock.
+ */
+void
+MarkLockClear(LOCALLOCK *locallock)
+{
+ Assert(locallock->nLocks > 0);
+ locallock->lockCleared = true;
+}
+
+/*
+ * WaitOnLock -- wait to acquire a lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process.
+ *
+ * The appropriate partition lock must be held at entry.
+ */
+static void
+WaitOnLock(LOCALLOCK *locallock, ResourceOwner owner)
+{
+ LOCKMETHODID lockmethodid = LOCALLOCK_LOCKMETHOD(*locallock);
+ LockMethod lockMethodTable = LockMethods[lockmethodid];
+
+ LOCK_PRINT("WaitOnLock: sleeping on lock",
+ locallock->lock, locallock->tag.mode);
+
+ /* adjust the process title to indicate that it's waiting */
+ set_ps_display_suffix("waiting");
+
+ awaitedLock = locallock;
+ awaitedOwner = owner;
+
+ /*
+ * NOTE: Think not to put any shared-state cleanup after the call to
+ * ProcSleep, in either the normal or failure path. The lock state must
+ * be fully set by the lock grantor, or by CheckDeadLock if we give up
+ * waiting for the lock. This is necessary because of the possibility
+ * that a cancel/die interrupt will interrupt ProcSleep after someone else
+ * grants us the lock, but before we've noticed it. Hence, after granting,
+ * the locktable state must fully reflect the fact that we own the lock;
+ * we can't do additional work on return.
+ *
+ * We can and do use a PG_TRY block to try to clean up after failure, but
+ * this still has a major limitation: elog(FATAL) can occur while waiting
+ * (eg, a "die" interrupt), and then control won't come back here. So all
+ * cleanup of essential state should happen in LockErrorCleanup, not here.
+ * We can use PG_TRY to clear the "waiting" status flags, since doing that
+ * is unimportant if the process exits.
+ */
+ PG_TRY();
+ {
+ if (ProcSleep(locallock, lockMethodTable) != PROC_WAIT_STATUS_OK)
+ {
+ /*
+ * We failed as a result of a deadlock, see CheckDeadLock(). Quit
+ * now.
+ */
+ awaitedLock = NULL;
+ LOCK_PRINT("WaitOnLock: aborting on lock",
+ locallock->lock, locallock->tag.mode);
+ LWLockRelease(LockHashPartitionLock(locallock->hashcode));
+
+ /*
+ * Now that we aren't holding the partition lock, we can give an
+ * error report including details about the detected deadlock.
+ */
+ DeadLockReport();
+ /* not reached */
+ }
+ }
+ PG_CATCH();
+ {
+ /* In this path, awaitedLock remains set until LockErrorCleanup */
+
+ /* reset ps display to remove the suffix */
+ set_ps_display_remove_suffix();
+
+ /* and propagate the error */
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ awaitedLock = NULL;
+
+ /* reset ps display to remove the suffix */
+ set_ps_display_remove_suffix();
+
+ LOCK_PRINT("WaitOnLock: wakeup on lock",
+ locallock->lock, locallock->tag.mode);
+}
+
+/*
+ * Remove a proc from the wait-queue it is on (caller must know it is on one).
+ * This is only used when the proc has failed to get the lock, so we set its
+ * waitStatus to PROC_WAIT_STATUS_ERROR.
+ *
+ * Appropriate partition lock must be held by caller. Also, caller is
+ * responsible for signaling the proc if needed.
+ *
+ * NB: this does not clean up any locallock object that may exist for the lock.
+ */
+void
+RemoveFromWaitQueue(PGPROC *proc, uint32 hashcode)
+{
+ LOCK *waitLock = proc->waitLock;
+ PROCLOCK *proclock = proc->waitProcLock;
+ LOCKMODE lockmode = proc->waitLockMode;
+ LOCKMETHODID lockmethodid = LOCK_LOCKMETHOD(*waitLock);
+
+ /* Make sure proc is waiting */
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+ Assert(proc->links.next != NULL);
+ Assert(waitLock);
+ Assert(!dclist_is_empty(&waitLock->waitProcs));
+ Assert(0 < lockmethodid && lockmethodid < lengthof(LockMethods));
+
+ /* Remove proc from lock's wait queue */
+ dclist_delete_from_thoroughly(&waitLock->waitProcs, &proc->links);
+
+ /* Undo increments of request counts by waiting process */
+ Assert(waitLock->nRequested > 0);
+ Assert(waitLock->nRequested > proc->waitLock->nGranted);
+ waitLock->nRequested--;
+ Assert(waitLock->requested[lockmode] > 0);
+ waitLock->requested[lockmode]--;
+ /* don't forget to clear waitMask bit if appropriate */
+ if (waitLock->granted[lockmode] == waitLock->requested[lockmode])
+ waitLock->waitMask &= LOCKBIT_OFF(lockmode);
+
+ /* Clean up the proc's own state, and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = PROC_WAIT_STATUS_ERROR;
+
+ /*
+ * Delete the proclock immediately if it represents no already-held locks.
+ * (This must happen now because if the owner of the lock decides to
+ * release it, and the requested/granted counts then go to zero,
+ * LockRelease expects there to be no remaining proclocks.) Then see if
+ * any other waiters for the lock can be woken up now.
+ */
+ CleanUpLock(waitLock, proclock,
+ LockMethods[lockmethodid], hashcode,
+ true);
+}
+
+/*
+ * LockRelease -- look up 'locktag' and release one 'lockmode' lock on it.
+ * Release a session lock if 'sessionLock' is true, else release a
+ * regular transaction lock.
+ *
+ * Side Effects: find any waiting processes that are now wakable,
+ * grant them their requested locks and awaken them.
+ * (We have to grant the lock here to avoid a race between the awakened
+ * process and any new process that comes along and requests the lock.)
+ */
+bool
+LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCALLOCKTAG localtag;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+#ifdef LOCK_DEBUG
+ if (LOCK_DEBUG_ENABLED(locktag))
+ elog(LOG, "LockRelease: lock [%u,%u] %s",
+ locktag->locktag_field1, locktag->locktag_field2,
+ lockMethodTable->lockModeNames[lockmode]);
+#endif
+
+ /*
+ * Find the LOCALLOCK entry for this lock and lockmode
+ */
+ MemSet(&localtag, 0, sizeof(localtag)); /* must clear padding */
+ localtag.lock = *locktag;
+ localtag.mode = lockmode;
+
+ locallock = (LOCALLOCK *) hash_search(LockMethodLocalHash,
+ &localtag,
+ HASH_FIND, NULL);
+
+ /*
+ * let the caller print its own error message, too. Do not ereport(ERROR).
+ */
+ if (!locallock || locallock->nLocks <= 0)
+ {
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+
+ /*
+ * Decrease the count for the resource owner.
+ */
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ ResourceOwner owner;
+ int i;
+
+ /* Identify owner for lock */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (--lockOwners[i].nLocks == 0)
+ {
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ break;
+ }
+ }
+ if (i < 0)
+ {
+ /* don't release a lock belonging to another owner */
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return false;
+ }
+ }
+
+ /*
+ * Decrease the total local count. If we're still holding the lock, we're
+ * done.
+ */
+ locallock->nLocks--;
+
+ if (locallock->nLocks > 0)
+ return true;
+
+ /*
+ * At this point we can no longer suppose we are clear of invalidation
+ * messages related to this lock. Although we'll delete the LOCALLOCK
+ * object before any intentional return from this routine, it seems worth
+ * the trouble to explicitly reset lockCleared right now, just in case
+ * some error prevents us from deleting the LOCALLOCK.
+ */
+ locallock->lockCleared = false;
+
+ /* Attempt fast release of any lock eligible for the fast path. */
+ if (EligibleForRelationFastPath(locktag, lockmode) &&
+ FastPathLocalUseCount > 0)
+ {
+ bool released;
+
+ /*
+ * We might not find the lock here, even if we originally entered it
+ * here. Another backend may have moved it to the main table.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ released = FastPathUnGrantRelationLock(locktag->locktag_field2,
+ lockmode);
+ LWLockRelease(&MyProc->fpInfoLock);
+ if (released)
+ {
+ RemoveLocalLock(locallock);
+ return true;
+ }
+ }
+
+ /*
+ * Otherwise we've got to mess with the shared lock table.
+ */
+ partitionLock = LockHashPartitionLock(locallock->hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Normally, we don't need to re-find the lock or proclock, since we kept
+ * their addresses in the locallock table, and they couldn't have been
+ * removed while we were holding a lock on them. But it's possible that
+ * the lock was taken fast-path and has since been moved to the main hash
+ * table by another backend, in which case we will need to look up the
+ * objects here. We assume the lock field is NULL if so.
+ */
+ lock = locallock->lock;
+ if (!lock)
+ {
+ PROCLOCKTAG proclocktag;
+
+ Assert(EligibleForRelationFastPath(locktag, lockmode));
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+ locallock->lock = lock;
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+ locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+ &proclocktag,
+ HASH_FIND,
+ NULL);
+ if (!locallock->proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ }
+ LOCK_PRINT("LockRelease: found", lock, lockmode);
+ proclock = locallock->proclock;
+ PROCLOCK_PRINT("LockRelease: found", proclock);
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("LockRelease: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ RemoveLocalLock(locallock);
+ return false;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, locallock->hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ RemoveLocalLock(locallock);
+ return true;
+}
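+
+/*
+ * For illustration, a lock taken earlier in the transaction is released
+ * with the same tag and mode (a sketch, assuming SET_LOCKTAG_RELATION and
+ * a relation OID 'relid'):
+ *
+ * SET_LOCKTAG_RELATION(tag, MyDatabaseId, relid);
+ * if (!LockRelease(&tag, ShareLock, false))
+ * elog(LOG, "ShareLock on relation %u was not held", relid);
+ *
+ * Session locks (sessionLock = true) must also be released with
+ * sessionLock = true, since they are tracked under a NULL resource owner.
+ */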
+
+/*
+ * LockReleaseAll -- Release all locks of the specified lock method that
+ * are held by the current process.
+ *
+ * Well, not necessarily *all* locks. The available behaviors are:
+ * allLocks == true: release all locks including session locks.
+ * allLocks == false: release all non-session locks.
+ */
+void
+LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks)
+{
+ HASH_SEQ_STATUS status;
+ LockMethod lockMethodTable;
+ int i,
+ numLockModes;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ int partition;
+ bool have_fast_path_lwlock = false;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll: lockmethod=%d", lockmethodid);
+#endif
+
+ /*
+ * Get rid of our fast-path VXID lock, if appropriate. Note that this is
+ * the only way that the lock we hold on our own VXID can ever get
+ * released: it is always and only released when a toplevel transaction
+ * ends.
+ */
+ if (lockmethodid == DEFAULT_LOCKMETHOD)
+ VirtualXactLockTableCleanup();
+
+ numLockModes = lockMethodTable->numLockModes;
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and get rid of those. We
+ * do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers. Fast-path locks are cleaned up during the locallock table
+ * scan, though.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /*
+ * If the LOCALLOCK entry is unused, we must've run out of shared
+ * memory while trying to set up this lock. Just forget the local
+ * entry.
+ */
+ if (locallock->nLocks == 0)
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ /*
+ * If we are asked to release all locks, we can just zap the entry.
+ * Otherwise, must scan to see if there are session locks. We assume
+ * there is at most one lockOwners entry for session locks.
+ */
+ if (!allLocks)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+
+ /* If session lock is above array position 0, move it down to 0 */
+ for (i = 0; i < locallock->numLockOwners; i++)
+ {
+ if (lockOwners[i].owner == NULL)
+ lockOwners[0] = lockOwners[i];
+ else
+ ResourceOwnerForgetLock(lockOwners[i].owner, locallock);
+ }
+
+ if (locallock->numLockOwners > 0 &&
+ lockOwners[0].owner == NULL &&
+ lockOwners[0].nLocks > 0)
+ {
+ /* Fix the locallock to show just the session locks */
+ locallock->nLocks = lockOwners[0].nLocks;
+ locallock->numLockOwners = 1;
+ /* We aren't deleting this locallock, so done */
+ continue;
+ }
+ else
+ locallock->numLockOwners = 0;
+ }
+
+ /*
+ * If the lock or proclock pointers are NULL, this lock was taken via
+ * the relation fast-path (and is not known to have been transferred).
+ */
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ LOCKMODE lockmode = locallock->tag.mode;
+ Oid relid;
+
+ /* Verify that a fast-path lock is what we've got. */
+ if (!EligibleForRelationFastPath(&locallock->tag.lock, lockmode))
+ elog(PANIC, "locallock table corrupted");
+
+ /*
+ * If we don't currently hold the LWLock that protects our
+ * fast-path data structures, we must acquire it before attempting
+ * to release the lock via the fast-path. We will continue to
+ * hold the LWLock until we're done scanning the locallock table,
+ * unless we hit a transferred fast-path lock. (XXX is this
+ * really such a good idea? There could be a lot of entries ...)
+ */
+ if (!have_fast_path_lwlock)
+ {
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+ have_fast_path_lwlock = true;
+ }
+
+ /* Attempt fast-path release. */
+ relid = locallock->tag.lock.locktag_field2;
+ if (FastPathUnGrantRelationLock(relid, lockmode))
+ {
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /*
+ * Our lock, originally taken via the fast path, has been
+ * transferred to the main lock table. That's going to require
+ * some extra work, so release our fast-path lock before starting.
+ */
+ LWLockRelease(&MyProc->fpInfoLock);
+ have_fast_path_lwlock = false;
+
+ /*
+ * Now dump the lock. We haven't got a pointer to the LOCK or
+ * PROCLOCK in this case, so we have to handle this a bit
+ * differently than a normal lock release. Unfortunately, this
+ * requires an extra LWLock acquire-and-release cycle on the
+ * partitionLock, but hopefully it shouldn't happen often.
+ */
+ LockRefindAndRelease(lockMethodTable, MyProc,
+ &locallock->tag.lock, lockmode, false);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /* Done with the fast-path data structures */
+ if (have_fast_path_lwlock)
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ dlist_head *procLocks = &MyProc->myProcLocks[partition];
+ dlist_mutable_iter proclock_iter;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is trickier than
+ * it looks, because another backend could be in process of adding
+ * something to our proclock list due to promoting one of our
+ * fast-path locks. However, any such lock must be one that we
+ * decided not to delete above, so it's okay to skip it again now;
+ * we'd just decide not to delete it again. We must, however, be
+ * careful to re-fetch the list header once we've acquired the
+ * partition lock, to be sure we have a valid, up-to-date pointer.
+ * (There is probably no significant risk if pointer fetch/store is
+ * atomic, but we don't wish to assume that.)
+ *
+ * XXX This argument assumes that the locallock table correctly
+ * represents all of our fast-path locks. While allLocks mode
+ * guarantees to clean up all of our normal locks regardless of the
+ * locallock situation, we lose that guarantee for fast-path locks.
+ * This is not ideal.
+ */
+ if (dlist_is_empty(procLocks))
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(proclock_iter, procLocks)
+ {
+ PROCLOCK *proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur);
+ bool wakeupNeeded = false;
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore items that are not of the lockmethod to be removed */
+ if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
+ continue;
+
+ /*
+ * In allLocks mode, force release of all locks even if locallock
+ * table had problems
+ */
+ if (allLocks)
+ proclock->releaseMask = proclock->holdMask;
+ else
+ Assert((proclock->releaseMask & ~proclock->holdMask) == 0);
+
+ /*
+ * Ignore items that have nothing to be released, unless they have
+ * holdMask == 0 and are therefore recyclable
+ */
+ if (proclock->releaseMask == 0 && proclock->holdMask != 0)
+ continue;
+
+ PROCLOCK_PRINT("LockReleaseAll", proclock);
+ LOCK_PRINT("LockReleaseAll", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /*
+ * Release the previously-marked lock modes
+ */
+ for (i = 1; i <= numLockModes; i++)
+ {
+ if (proclock->releaseMask & LOCKBIT_ON(i))
+ wakeupNeeded |= UnGrantLock(lock, i, proclock,
+ lockMethodTable);
+ }
+ Assert((lock->nRequested >= 0) && (lock->nGranted >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ LOCK_PRINT("LockReleaseAll: updated", lock, 0);
+
+ proclock->releaseMask = 0;
+
+ /* CleanUpLock will wake up waiters if needed. */
+ CleanUpLock(lock, proclock,
+ lockMethodTable,
+ LockTagHashCode(&lock->tag),
+ wakeupNeeded);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+#ifdef LOCK_DEBUG
+ if (*(lockMethodTable->trace_flag))
+ elog(LOG, "LockReleaseAll done");
+#endif
+}
+
+/*
+ * LockReleaseSession -- Release all session locks of the specified lock method
+ * that are held by the current process.
+ */
+void
+LockReleaseSession(LOCKMETHODID lockmethodid)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ /* Ignore items that are not of the specified lock method */
+ if (LOCALLOCK_LOCKMETHOD(*locallock) != lockmethodid)
+ continue;
+
+ ReleaseLockIfHeld(locallock, true);
+ }
+}
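+
+/*
+ * Usage sketch: the advisory-lock support releases every session-level
+ * advisory lock with a single call, roughly
+ *
+ *		LockReleaseSession(USER_LOCKMETHOD);
+ *
+ * (this is essentially what pg_advisory_unlock_all() does).  Only locallocks
+ * of the given lock method are examined; locks of other methods held by the
+ * session are untouched.
+ */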
+
+/*
+ * LockReleaseCurrentOwner
+ * Release all locks belonging to CurrentResourceOwner
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly when a lot of locks are held.
+ * Otherwise, pass NULL for locallocks, and we'll traverse through our hash
+ * table to find them.
+ */
+void
+LockReleaseCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ ReleaseLockIfHeld(locallock, false);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ ReleaseLockIfHeld(locallocks[i], false);
+ }
+}
+
+/*
+ * ReleaseLockIfHeld
+ * Release any session-level locks on this lockable object if sessionLock
+ * is true; else, release any locks held by CurrentResourceOwner.
+ *
+ * It is tempting to pass this a ResourceOwner pointer (or NULL for session
+ * locks), but without refactoring LockRelease() we cannot support releasing
+ * locks belonging to resource owners other than CurrentResourceOwner.
+ * If we were to refactor, it'd be a good idea to fix it so we don't have to
+ * do a hashtable lookup of the locallock, too. However, currently this
+ * function isn't used heavily enough to justify refactoring for its
+ * convenience.
+ */
+static void
+ReleaseLockIfHeld(LOCALLOCK *locallock, bool sessionLock)
+{
+ ResourceOwner owner;
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+
+ /* Identify owner for lock (must match LockRelease!) */
+ if (sessionLock)
+ owner = NULL;
+ else
+ owner = CurrentResourceOwner;
+
+ /* Scan to see if there are any locks belonging to the target owner */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == owner)
+ {
+ Assert(lockOwners[i].nLocks > 0);
+ if (lockOwners[i].nLocks < locallock->nLocks)
+ {
+ /*
+ * We will still hold this lock after forgetting this
+ * ResourceOwner.
+ */
+ locallock->nLocks -= lockOwners[i].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (owner != NULL)
+ ResourceOwnerForgetLock(owner, locallock);
+ if (i < locallock->numLockOwners)
+ lockOwners[i] = lockOwners[locallock->numLockOwners];
+ }
+ else
+ {
+ Assert(lockOwners[i].nLocks == locallock->nLocks);
+ /* We want to call LockRelease just once */
+ lockOwners[i].nLocks = 1;
+ locallock->nLocks = 1;
+ if (!LockRelease(&locallock->tag.lock,
+ locallock->tag.mode,
+ sessionLock))
+ elog(WARNING, "ReleaseLockIfHeld: failed??");
+ }
+ break;
+ }
+ }
+}
+
+/*
+ * LockReassignCurrentOwner
+ * Reassign all locks belonging to CurrentResourceOwner to belong
+ * to its parent resource owner.
+ *
+ * If the caller knows what those locks are, it can pass them as an array.
+ * That speeds up the call significantly when a lot of locks are held
+ * (e.g. pg_dump with a large schema). Otherwise, pass NULL for locallocks,
+ * and we'll traverse through our hash table to find them.
+ */
+void
+LockReassignCurrentOwner(LOCALLOCK **locallocks, int nlocks)
+{
+ ResourceOwner parent = ResourceOwnerGetParent(CurrentResourceOwner);
+
+ Assert(parent != NULL);
+
+ if (locallocks == NULL)
+ {
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ LockReassignOwner(locallock, parent);
+ }
+ else
+ {
+ int i;
+
+ for (i = nlocks - 1; i >= 0; i--)
+ LockReassignOwner(locallocks[i], parent);
+ }
+}
+
+/*
+ * Subroutine of LockReassignCurrentOwner. Reassigns a given lock belonging to
+ * CurrentResourceOwner to its parent.
+ */
+static void
+LockReassignOwner(LOCALLOCK *locallock, ResourceOwner parent)
+{
+ LOCALLOCKOWNER *lockOwners;
+ int i;
+ int ic = -1;
+ int ip = -1;
+
+ /*
+ * Scan to see if there are any locks belonging to current owner or its
+ * parent
+ */
+ lockOwners = locallock->lockOwners;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == CurrentResourceOwner)
+ ic = i;
+ else if (lockOwners[i].owner == parent)
+ ip = i;
+ }
+
+ if (ic < 0)
+ return; /* no current locks */
+
+ if (ip < 0)
+ {
+ /* Parent has no slot, so just give it the child's slot */
+ lockOwners[ic].owner = parent;
+ ResourceOwnerRememberLock(parent, locallock);
+ }
+ else
+ {
+ /* Merge child's count with parent's */
+ lockOwners[ip].nLocks += lockOwners[ic].nLocks;
+ /* compact out unused slot */
+ locallock->numLockOwners--;
+ if (ic < locallock->numLockOwners)
+ lockOwners[ic] = lockOwners[locallock->numLockOwners];
+ }
+ ResourceOwnerForgetLock(CurrentResourceOwner, locallock);
+}
+
+/*
+ * FastPathGrantRelationLock
+ * Grant lock using per-backend fast-path array, if there is space.
+ */
+static bool
+FastPathGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ uint32 unused_slot = FP_LOCK_SLOTS_PER_BACKEND;
+
+ /* Scan for existing entry for this relid, remembering empty slot. */
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (FAST_PATH_GET_BITS(MyProc, f) == 0)
+ unused_slot = f;
+ else if (MyProc->fpRelId[f] == relid)
+ {
+ Assert(!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode));
+ FAST_PATH_SET_LOCKMODE(MyProc, f, lockmode);
+ return true;
+ }
+ }
+
+ /* If no existing entry, use any empty slot. */
+ if (unused_slot < FP_LOCK_SLOTS_PER_BACKEND)
+ {
+ MyProc->fpRelId[unused_slot] = relid;
+ FAST_PATH_SET_LOCKMODE(MyProc, unused_slot, lockmode);
+ ++FastPathLocalUseCount;
+ return true;
+ }
+
+ /* No existing entry, and no empty slot. */
+ return false;
+}
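+
+/*
+ * Worked example (a sketch, assuming the FAST_PATH_* macros defined earlier
+ * in this file: three bits per slot, lock numbers offset by AccessShareLock):
+ *
+ *		FastPathGrantRelationLock(relid, AccessShareLock);
+ *		FastPathGrantRelationLock(relid, RowExclusiveLock);
+ *
+ * leaves MyProc->fpRelId[f] == relid for some slot f, with that slot's bits
+ * in fpLockBits reading 0b101 (bit 0 = AccessShareLock, bit 2 =
+ * RowExclusiveLock), so FAST_PATH_GET_BITS(MyProc, f) returns 5.
+ */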
+
+/*
+ * FastPathUnGrantRelationLock
+ * Release fast-path lock, if present. Update backend-private local
+ * use count, while we're at it.
+ */
+static bool
+FastPathUnGrantRelationLock(Oid relid, LOCKMODE lockmode)
+{
+ uint32 f;
+ bool result = false;
+
+ FastPathLocalUseCount = 0;
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ if (MyProc->fpRelId[f] == relid
+ && FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ {
+ Assert(!result);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+ result = true;
+ /* we continue iterating so as to update FastPathLocalUseCount */
+ }
+ if (FAST_PATH_GET_BITS(MyProc, f) != 0)
+ ++FastPathLocalUseCount;
+ }
+ return result;
+}
+
+/*
+ * FastPathTransferRelationLocks
+ * Transfer locks matching the given lock tag from per-backend fast-path
+ * arrays to the shared hash table.
+ *
+ * Returns true if successful, false if ran out of shared memory.
+ */
+static bool
+FastPathTransferRelationLocks(LockMethod lockMethodTable, const LOCKTAG *locktag,
+ uint32 hashcode)
+{
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 i;
+
+ /*
+ * Every PGPROC that can potentially hold a fast-path lock is present in
+ * ProcGlobal->allProcs. Prepared transactions are not, but any
+ * outstanding fast-path locks held by prepared transactions are
+ * transferred to the main lock table.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ /*
+ * If the target backend isn't referencing the same database as the
+ * lock, then we needn't examine the individual relation IDs at all;
+ * none of them can be relevant.
+ *
+ * proc->databaseId is set at backend startup time and never changes
+ * thereafter, so it might be safe to perform this test before
+ * acquiring &proc->fpInfoLock. In particular, it's certainly safe to
+ * assume that if the target backend holds any fast-path locks, it
+ * must have performed a memory-fencing operation (in particular, an
+ * LWLock acquisition) since setting proc->databaseId. However, it's
+ * less clear that our backend is certain to have performed a memory
+ * fencing operation since the other backend set proc->databaseId. So
+ * for now, we test it after acquiring the LWLock just to be safe.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f] || FAST_PATH_GET_BITS(proc, f) == 0)
+ continue;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ for (lockmode = FAST_PATH_LOCKNUMBER_OFFSET;
+ lockmode < FAST_PATH_LOCKNUMBER_OFFSET + FAST_PATH_BITS_PER_SLOT;
+ ++lockmode)
+ {
+ PROCLOCK *proclock;
+
+ if (!FAST_PATH_CHECK_LOCKMODE(proc, f, lockmode))
+ continue;
+ proclock = SetupLockInTable(lockMethodTable, proc, locktag,
+ hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(proc, f, lockmode);
+ }
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ return true;
+}
+
+/*
+ * FastPathGetRelationLockEntry
+ * Return the PROCLOCK for a lock originally taken via the fast-path,
+ * transferring it to the primary lock table if necessary.
+ *
+ * Note: caller takes care of updating the locallock object.
+ */
+static PROCLOCK *
+FastPathGetRelationLockEntry(LOCALLOCK *locallock)
+{
+ LockMethod lockMethodTable = LockMethods[DEFAULT_LOCKMETHOD];
+ LOCKTAG *locktag = &locallock->tag.lock;
+ PROCLOCK *proclock = NULL;
+ LWLock *partitionLock = LockHashPartitionLock(locallock->hashcode);
+ Oid relid = locktag->locktag_field2;
+ uint32 f;
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmode;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != MyProc->fpRelId[f] || FAST_PATH_GET_BITS(MyProc, f) == 0)
+ continue;
+
+ /* If we don't have a lock of the given mode, forget it! */
+ lockmode = locallock->tag.mode;
+ if (!FAST_PATH_CHECK_LOCKMODE(MyProc, f, lockmode))
+ break;
+
+ /* Find or create lock object. */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(lockMethodTable, MyProc, locktag,
+ locallock->hashcode, lockmode);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&MyProc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, lockmode);
+ FAST_PATH_CLEAR_LOCKMODE(MyProc, f, lockmode);
+
+ LWLockRelease(partitionLock);
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /* Lock may have already been transferred by some other backend. */
+ if (proclock == NULL)
+ {
+ LOCK *lock;
+ PROCLOCKTAG proclocktag;
+ uint32 proclock_hashcode;
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ locallock->hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(ERROR, "failed to re-find shared lock object");
+
+ proclocktag.myLock = lock;
+ proclocktag.myProc = MyProc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode);
+ proclock = (PROCLOCK *)
+ hash_search_with_hash_value(LockMethodProcLockHash,
+ &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(ERROR, "failed to re-find shared proclock object");
+ LWLockRelease(partitionLock);
+ }
+
+ return proclock;
+}
+
+/*
+ * GetLockConflicts
+ * Get an array of VirtualTransactionIds of xacts currently holding locks
+ * that would conflict with the specified lock/lockmode.
+ * xacts merely awaiting such a lock are NOT reported.
+ *
+ * The result array is palloc'd and is terminated with an invalid VXID.
+ * *countp, if not null, is updated to the number of items set.
+ *
+ * Of course, the result could be out of date by the time it's returned, so
+ * use of this function has to be thought about carefully. Similarly, a
+ * PGPROC with no "lxid" will be considered non-conflicting regardless of any
+ * lock it holds. Existing callers don't care about a locker after that
+ * locker's pg_xact updates complete. CommitTransaction() clears "lxid" after
+ * pg_xact updates and before releasing locks.
+ *
+ * Note we never include the current xact's vxid in the result array,
+ * since an xact never blocks itself.
+ */
+VirtualTransactionId *
+GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode, int *countp)
+{
+ static VirtualTransactionId *vxids;
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LockMethod lockMethodTable;
+ LOCK *lock;
+ LOCKMASK conflictMask;
+ dlist_iter proclock_iter;
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int count = 0;
+ int fast_count = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+ if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
+ elog(ERROR, "unrecognized lock mode: %d", lockmode);
+
+ /*
+ * Allocate memory to store results, and fill with InvalidVXID. We only
+ * need enough space for MaxBackends + max_prepared_xacts + a terminator.
+ * In hot standby, we allocate the array just once, in TopMemoryContext, and
+ * reuse it across calls.
+ */
+ if (InHotStandby)
+ {
+ if (vxids == NULL)
+ vxids = (VirtualTransactionId *)
+ MemoryContextAlloc(TopMemoryContext,
+ sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+ }
+ else
+ vxids = (VirtualTransactionId *)
+ palloc0(sizeof(VirtualTransactionId) *
+ (MaxBackends + max_prepared_xacts + 1));
+
+ /* Compute hash code and partition lock, and look up conflicting modes. */
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ conflictMask = lockMethodTable->conflictTab[lockmode];
+
+ /*
+ * Fast path locks might not have been entered in the primary lock table.
+ * If the lock we're dealing with could conflict with such a lock, we must
+ * examine each backend's fast-path array for conflicts.
+ */
+ if (ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ int i;
+ Oid relid = locktag->locktag_field2;
+ VirtualTransactionId vxid;
+
+ /*
+ * Iterate over relevant PGPROCs. Anything held by a prepared
+ * transaction will have been transferred to the primary lock table,
+ * so we need not worry about those. This is all a bit fuzzy, because
+ * new locks could be taken after we've visited a particular
+ * partition, but the callers had better be prepared to deal with that
+ * anyway, since the locks could equally well be taken between the
+ * time we return the value and the time the caller does something
+ * with it.
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; i++)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ /* A backend never blocks itself */
+ if (proc == MyProc)
+ continue;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ /*
+ * If the target backend isn't referencing the same database as
+ * the lock, then we needn't examine the individual relation IDs
+ * at all; none of them can be relevant.
+ *
+ * See FastPathTransferRelationLocks() for discussion of why we do
+ * this test after acquiring the lock.
+ */
+ if (proc->databaseId != locktag->locktag_field1)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ continue;
+ }
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; f++)
+ {
+ uint32 lockmask;
+
+ /* Look for an allocated slot matching the given relid. */
+ if (relid != proc->fpRelId[f])
+ continue;
+ lockmask = FAST_PATH_GET_BITS(proc, f);
+ if (!lockmask)
+ continue;
+ lockmask <<= FAST_PATH_LOCKNUMBER_OFFSET;
+
+ /*
+ * There can only be one entry per relation, so if we found it
+ * and it doesn't conflict, we can skip the rest of the slots.
+ */
+ if ((lockmask & conflictMask) == 0)
+ break;
+
+ /* Conflict! */
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ /* else, xact already committed or aborted */
+
+ /* No need to examine remaining slots. */
+ break;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+ }
+
+ /* Remember how many fast-path conflicts we found. */
+ fast_count = count;
+
+ /*
+ * Look up the lock object matching the tag.
+ */
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ {
+ /*
+ * If the lock object doesn't exist, there is nothing holding a lock
+ * on this lockable object.
+ */
+ LWLockRelease(partitionLock);
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+ }
+
+ /*
+ * Examine each existing holder (or awaiter) of the lock.
+ */
+ dlist_foreach(proclock_iter, &lock->procLocks)
+ {
+ proclock = dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+
+ if (conflictMask & proclock->holdMask)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+
+ /* A backend never blocks itself */
+ if (proc != MyProc)
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ if (VirtualTransactionIdIsValid(vxid))
+ {
+ int i;
+
+ /* Avoid duplicate entries. */
+ for (i = 0; i < fast_count; ++i)
+ if (VirtualTransactionIdEquals(vxids[i], vxid))
+ break;
+ if (i >= fast_count)
+ vxids[count++] = vxid;
+ }
+ /* else, xact already committed or aborted */
+ }
+ }
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (count > MaxBackends + max_prepared_xacts) /* should never happen */
+ elog(PANIC, "too many conflicting locks found");
+
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+ if (countp)
+ *countp = count;
+ return vxids;
+}
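+
+/*
+ * Typical use (sketch): callers such as WaitForLockers() in lmgr.c fetch the
+ * conflicting VXIDs and then wait for each one to finish, relying on the
+ * invalid-VXID terminator rather than *countp:
+ *
+ *		VirtualTransactionId *old_lockholders;
+ *
+ *		old_lockholders = GetLockConflicts(&tag, AccessExclusiveLock, NULL);
+ *		while (VirtualTransactionIdIsValid(*old_lockholders))
+ *		{
+ *			VirtualXactLock(*old_lockholders, true);
+ *			old_lockholders++;
+ *		}
+ */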
+
+/*
+ * Find a lock in the shared lock table and release it. It is the caller's
+ * responsibility to verify that this is a sane thing to do. (For example, it
+ * would be bad to release a lock here if there might still be a LOCALLOCK
+ * object with pointers to it.)
+ *
+ * We currently use this in two situations: first, to release locks held by
+ * prepared transactions on commit (see lock_twophase_postcommit); and second,
+ * to release locks taken via the fast-path, transferred to the main hash
+ * table, and then released (see LockReleaseAll).
+ */
+static void
+LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
+ LOCKTAG *locktag, LOCKMODE lockmode,
+ bool decrement_strong_lock_count)
+{
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ LWLock *partitionLock;
+ bool wakeupNeeded;
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-find the lock object (it had better be there).
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ hashcode,
+ HASH_FIND,
+ NULL);
+ if (!lock)
+ elog(PANIC, "failed to re-find shared lock object");
+
+ /*
+ * Re-find the proclock object (ditto).
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ &proclocktag,
+ proclock_hashcode,
+ HASH_FIND,
+ NULL);
+ if (!proclock)
+ elog(PANIC, "failed to re-find shared proclock object");
+
+ /*
+ * Double-check that we are actually holding a lock of the type we want to
+ * release.
+ */
+ if (!(proclock->holdMask & LOCKBIT_ON(lockmode)))
+ {
+ PROCLOCK_PRINT("lock_twophase_postcommit: WRONGTYPE", proclock);
+ LWLockRelease(partitionLock);
+ elog(WARNING, "you don't own a lock of type %s",
+ lockMethodTable->lockModeNames[lockmode]);
+ return;
+ }
+
+ /*
+ * Do the releasing. CleanUpLock will waken any now-wakable waiters.
+ */
+ wakeupNeeded = UnGrantLock(lock, lockmode, proclock, lockMethodTable);
+
+ CleanUpLock(lock, proclock,
+ lockMethodTable, hashcode,
+ wakeupNeeded);
+
+ LWLockRelease(partitionLock);
+
+ /*
+ * Decrement strong lock count. This logic is needed only for 2PC.
+ */
+ if (decrement_strong_lock_count
+ && ConflictsWithRelationFastPath(locktag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ Assert(FastPathStrongRelationLocks->count[fasthashcode] > 0);
+ FastPathStrongRelationLocks->count[fasthashcode]--;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+}
+
+/*
+ * CheckForSessionAndXactLocks
+ * Check to see if transaction holds both session-level and xact-level
+ * locks on the same object; if so, throw an error.
+ *
+ * If we have both session- and transaction-level locks on the same object,
+ * PREPARE TRANSACTION must fail. This should never happen with regular
+ * locks, since we only take those at session level in some special operations
+ * like VACUUM. It's possible to hit this with advisory locks, though.
+ *
+ * It would be nice if we could keep the session hold and give away the
+ * transactional hold to the prepared xact. However, that would require two
+ * PROCLOCK objects, and we cannot be sure that another PROCLOCK will be
+ * available when it comes time for PostPrepare_Locks to do the deed.
+ * So for now, we error out while we can still do so safely.
+ *
+ * Since the LOCALLOCK table stores a separate entry for each lockmode,
+ * we can't implement this check by examining LOCALLOCK entries in isolation.
+ * We must build a transient hashtable that is indexed by locktag only.
+ */
+static void
+CheckForSessionAndXactLocks(void)
+{
+ typedef struct
+ {
+ LOCKTAG lock; /* identifies the lockable object */
+ bool sessLock; /* is any lockmode held at session level? */
+ bool xactLock; /* is any lockmode held at xact level? */
+ } PerLockTagEntry;
+
+ HASHCTL hash_ctl;
+ HTAB *lockhtab;
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* Create a local hash table keyed by LOCKTAG only */
+ hash_ctl.keysize = sizeof(LOCKTAG);
+ hash_ctl.entrysize = sizeof(PerLockTagEntry);
+ hash_ctl.hcxt = CurrentMemoryContext;
+
+ lockhtab = hash_create("CheckForSessionAndXactLocks table",
+ 256, /* arbitrary initial size */
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+
+ /* Scan local lock table to find entries for each LOCKTAG */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ PerLockTagEntry *hentry;
+ bool found;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Otherwise, find or make an entry in lockhtab */
+ hentry = (PerLockTagEntry *) hash_search(lockhtab,
+ &locallock->tag.lock,
+ HASH_ENTER, &found);
+ if (!found) /* initialize, if newly created */
+ hentry->sessLock = hentry->xactLock = false;
+
+ /* Scan to see if we hold lock at session or xact level or both */
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ hentry->sessLock = true;
+ else
+ hentry->xactLock = true;
+ }
+
+ /*
+ * We can throw error immediately when we see both types of locks; no
+ * need to wait around to see if there are more violations.
+ */
+ if (hentry->sessLock && hentry->xactLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+ }
+
+ /* Success, so clean up */
+ hash_destroy(lockhtab);
+}
+
+/*
+ * AtPrepare_Locks
+ * Do the preparatory work for a PREPARE: make 2PC state file records
+ * for all locks currently held.
+ *
+ * Session-level locks are ignored, as are VXID locks.
+ *
+ * For the most part, we don't need to touch shared memory for this ---
+ * all the necessary state information is in the locallock table.
+ * Fast-path locks are an exception, however: we move any such locks to
+ * the main table before allowing PREPARE TRANSACTION to succeed.
+ */
+void
+AtPrepare_Locks(void)
+{
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+
+ /* First, verify there aren't locks of both xact and session level */
+ CheckForSessionAndXactLocks();
+
+ /* Now do the per-locallock cleanup work */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ TwoPhaseLockRecord record;
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ /*
+ * Ignore VXID locks. We don't want those to be held by prepared
+ * transactions, since they aren't meaningful after a restart.
+ */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Ignore it if we don't actually hold the lock */
+ if (locallock->nLocks <= 0)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /*
+ * If the local lock was taken via the fast-path, we need to move it
+ * to the primary lock table, or just get a pointer to the existing
+ * primary lock table entry if by chance it's already been
+ * transferred.
+ */
+ if (locallock->proclock == NULL)
+ {
+ locallock->proclock = FastPathGetRelationLockEntry(locallock);
+ locallock->lock = locallock->proclock->tag.myLock;
+ }
+
+ /*
+ * Arrange to not release any strong lock count held by this lock
+ * entry. We must retain the count until the prepared transaction is
+ * committed or rolled back.
+ */
+ locallock->holdsStrongLockCount = false;
+
+ /*
+ * Create a 2PC record.
+ */
+ memcpy(&(record.locktag), &(locallock->tag.lock), sizeof(LOCKTAG));
+ record.lockmode = locallock->tag.mode;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+ &record, sizeof(TwoPhaseLockRecord));
+ }
+}
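+
+/*
+ * For reference, the 2PC record registered above is minimal (sketch of
+ * TwoPhaseLockRecord, defined near the top of this file):
+ *
+ *		typedef struct TwoPhaseLockRecord
+ *		{
+ *			LOCKTAG		locktag;
+ *			LOCKMODE	lockmode;
+ *		} TwoPhaseLockRecord;
+ *
+ * lock_twophase_recover() re-acquires the lock from exactly this information
+ * at restart.
+ */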
+
+/*
+ * PostPrepare_Locks
+ * Clean up after successful PREPARE
+ *
+ * Here, we want to transfer ownership of our locks to a dummy PGPROC
+ * that's now associated with the prepared transaction, and we want to
+ * clean out the corresponding entries in the LOCALLOCK table.
+ *
+ * Note: by removing the LOCALLOCK entries, we are leaving dangling
+ * pointers in the transaction's resource owner. This is OK at the
+ * moment since resowner.c doesn't try to free locks retail at a toplevel
+ * transaction commit or abort. We could alternatively zero out nLocks
+ * and leave the LOCALLOCK entries to be garbage-collected by LockReleaseAll,
+ * but that probably costs more cycles.
+ */
+void
+PostPrepare_Locks(TransactionId xid)
+{
+ PGPROC *newproc = TwoPhaseGetDummyProc(xid, false);
+ HASH_SEQ_STATUS status;
+ LOCALLOCK *locallock;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ int partition;
+
+ /* Can't prepare a lock group follower. */
+ Assert(MyProc->lockGroupLeader == NULL ||
+ MyProc->lockGroupLeader == MyProc);
+
+ /* This is a critical section: any error means big trouble */
+ START_CRIT_SECTION();
+
+ /*
+ * First we run through the locallock table and get rid of unwanted
+ * entries, then we scan the process's proclocks and transfer them to the
+ * target proc.
+ *
+ * We do this separately because we may have multiple locallock entries
+ * pointing to the same proclock, and we daren't end up with any dangling
+ * pointers.
+ */
+ hash_seq_init(&status, LockMethodLocalHash);
+
+ while ((locallock = (LOCALLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ LOCALLOCKOWNER *lockOwners = locallock->lockOwners;
+ bool haveSessionLock;
+ bool haveXactLock;
+ int i;
+
+ if (locallock->proclock == NULL || locallock->lock == NULL)
+ {
+ /*
+ * We must've run out of shared memory while trying to set up this
+ * lock. Just forget the local entry.
+ */
+ Assert(locallock->nLocks == 0);
+ RemoveLocalLock(locallock);
+ continue;
+ }
+
+ /* Ignore VXID locks */
+ if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ /* Scan to see whether we hold it at session or transaction level */
+ haveSessionLock = haveXactLock = false;
+ for (i = locallock->numLockOwners - 1; i >= 0; i--)
+ {
+ if (lockOwners[i].owner == NULL)
+ haveSessionLock = true;
+ else
+ haveXactLock = true;
+ }
+
+ /* Ignore it if we have only session lock */
+ if (!haveXactLock)
+ continue;
+
+ /* This can't happen, because we already checked it */
+ if (haveSessionLock)
+ ereport(PANIC,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot PREPARE while holding both session-level and transaction-level locks on the same object")));
+
+ /* Mark the proclock to show we need to release this lockmode */
+ if (locallock->nLocks > 0)
+ locallock->proclock->releaseMask |= LOCKBIT_ON(locallock->tag.mode);
+
+ /* And remove the locallock hashtable entry */
+ RemoveLocalLock(locallock);
+ }
+
+ /*
+ * Now, scan each lock partition separately.
+ */
+ for (partition = 0; partition < NUM_LOCK_PARTITIONS; partition++)
+ {
+ LWLock *partitionLock;
+ dlist_head *procLocks = &(MyProc->myProcLocks[partition]);
+ dlist_mutable_iter proclock_iter;
+
+ partitionLock = LockHashPartitionLockByIndex(partition);
+
+ /*
+ * If the proclock list for this partition is empty, we can skip
+ * acquiring the partition lock. This optimization is safer here than in
+ * LockReleaseAll, because we got rid of any fast-path
+ * locks during AtPrepare_Locks, so there cannot be any case where
+ * another backend is adding something to our lists now. For safety,
+ * though, we code this the same way as in LockReleaseAll.
+ */
+ if (dlist_is_empty(procLocks))
+ continue; /* needn't examine this partition */
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(proclock_iter, procLocks)
+ {
+ proclock = dlist_container(PROCLOCK, procLink, proclock_iter.cur);
+
+ Assert(proclock->tag.myProc == MyProc);
+
+ lock = proclock->tag.myLock;
+
+ /* Ignore VXID locks */
+ if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+ continue;
+
+ PROCLOCK_PRINT("PostPrepare_Locks", proclock);
+ LOCK_PRINT("PostPrepare_Locks", lock, 0);
+ Assert(lock->nRequested >= 0);
+ Assert(lock->nGranted >= 0);
+ Assert(lock->nGranted <= lock->nRequested);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+
+ /* Ignore it if nothing to release (must be a session lock) */
+ if (proclock->releaseMask == 0)
+ continue;
+
+ /* Else we should be releasing all locks */
+ if (proclock->releaseMask != proclock->holdMask)
+ elog(PANIC, "we seem to have dropped a bit somewhere");
+
+ /*
+ * We cannot simply modify proclock->tag.myProc to reassign
+ * ownership of the lock, because that's part of the hash key and
+ * the proclock would then be in the wrong hash chain. Instead
+ * use hash_update_hash_key. (We used to create a new hash entry,
+ * but that risks out-of-memory failure if other processes are
+ * busy making proclocks too.) We must unlink the proclock from
+ * our procLink chain and put it into the new proc's chain, too.
+ *
+ * Note: the updated proclock hash key will still belong to the
+ * same hash partition, cf proclock_hash(). So the partition lock
+ * we already hold is sufficient for this.
+ */
+ dlist_delete(&proclock->procLink);
+
+ /*
+ * Create the new hash key for the proclock.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = newproc;
+
+ /*
+ * Update groupLeader pointer to point to the new proc. (We'd
+ * better not be a member of somebody else's lock group!)
+ */
+ Assert(proclock->groupLeader == proclock->tag.myProc);
+ proclock->groupLeader = newproc;
+
+ /*
+ * Update the proclock. We should not find any existing entry for
+ * the same hash key, since there can be only one entry for any
+ * given lock with my own proc.
+ */
+ if (!hash_update_hash_key(LockMethodProcLockHash,
+ proclock,
+ &proclocktag))
+ elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks");
+
+ /* Re-link into the new proc's proclock list */
+ dlist_push_tail(&newproc->myProcLocks[partition], &proclock->procLink);
+
+ PROCLOCK_PRINT("PostPrepare_Locks: updated", proclock);
+ } /* loop over PROCLOCKs within this partition */
+
+ LWLockRelease(partitionLock);
+ } /* loop over partitions */
+
+ END_CRIT_SECTION();
+}
+
+
+/*
+ * Estimate shared-memory space used for lock tables
+ */
+Size
+LockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* lock hash table */
+ max_table_size = NLOCKENTS();
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK)));
+
+ /* proclock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size, sizeof(PROCLOCK)));
+
+ /*
+ * Since NLOCKENTS is only an estimate, add 10% safety margin.
+ */
+ size = add_size(size, size / 10);
+
+ return size;
+}
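+
+/*
+ * Rough sizing example (assuming NLOCKENTS() expands, as defined earlier in
+ * this file, to max_locks_per_transaction * (MaxBackends + max_prepared_xacts)):
+ * with max_locks_per_transaction = 64, MaxBackends = 100 and
+ * max_prepared_xacts = 0, we size the LOCK hash for 6400 entries and the
+ * PROCLOCK hash for 12800 entries, then add 10% slop to the combined
+ * estimate.
+ */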
+
+/*
+ * GetLockStatusData - Return a summary of the lock manager's internal
+ * status, for use in a user-level reporting function.
+ *
+ * The return data consists of an array of LockInstanceData objects,
+ * which are a lightly abstracted version of the PROCLOCK data structures,
+ * i.e. there is one entry for each unique lock and interested PGPROC.
+ * It is the caller's responsibility to match up related items (such as
+ * references to the same lockable object or PGPROC) if wanted.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+LockData *
+GetLockStatusData(void)
+{
+ LockData *data;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int els;
+ int el;
+ int i;
+
+ data = (LockData *) palloc(sizeof(LockData));
+
+ /* Guess how much space we'll need. */
+ els = MaxBackends;
+ el = 0;
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * els);
+
+ /*
+ * First, we iterate through the per-backend fast-path arrays, locking
+ * them one at a time. This might produce an inconsistent picture of the
+ * system state, but taking all of those LWLocks at the same time seems
+ * impractical (in particular, note MAX_SIMUL_LWLOCKS). It shouldn't
+ * matter too much, because none of these locks can be involved in lock
+ * conflicts anyway - anything that might conflict must be present in the
+ * main lock table. (For the same reason, we don't sweat about making leaderPid
+ * completely valid. We cannot safely dereference another backend's
+ * lockGroupLeader field without holding all lock partition locks, and
+ * it's not worth that.)
+ */
+ for (i = 0; i < ProcGlobal->allProcCount; ++i)
+ {
+ PGPROC *proc = &ProcGlobal->allProcs[i];
+ uint32 f;
+
+ LWLockAcquire(&proc->fpInfoLock, LW_SHARED);
+
+ for (f = 0; f < FP_LOCK_SLOTS_PER_BACKEND; ++f)
+ {
+ LockInstanceData *instance;
+ uint32 lockbits = FAST_PATH_GET_BITS(proc, f);
+
+ /* Skip unallocated slots. */
+ if (!lockbits)
+ continue;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_RELATION(instance->locktag, proc->databaseId,
+ proc->fpRelId[f]);
+ instance->holdMask = lockbits << FAST_PATH_LOCKNUMBER_OFFSET;
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+
+ /*
+ * Successfully taking fast path lock means there were no
+ * conflicting locks.
+ */
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ if (proc->fpVXIDLock)
+ {
+ VirtualTransactionId vxid;
+ LockInstanceData *instance;
+
+ if (el >= els)
+ {
+ els += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ vxid.backendId = proc->backendId;
+ vxid.localTransactionId = proc->fpLocalTransactionId;
+
+ instance = &data->locks[el];
+ SET_LOCKTAG_VIRTUALTRANSACTION(instance->locktag, vxid);
+ instance->holdMask = LOCKBIT_ON(ExclusiveLock);
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proc->pid;
+ instance->fastpath = true;
+ instance->waitStart = 0;
+
+ el++;
+ }
+
+ LWLockRelease(&proc->fpInfoLock);
+ }
+
+ /*
+ * Next, acquire lock on the entire shared lock data structure. We do
+ * this so that, at least for locks in the primary lock table, the state
+ * will be self-consistent.
+ *
+ * Since this is a read-only operation, we take shared instead of
+ * exclusive lock. There's not a whole lot of point to this, because all
+ * the normal operations require exclusive lock, but it doesn't hurt
+ * anything either. It will at least allow two backends to do
+ * GetLockStatusData in parallel.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ data->nelements = el + hash_get_num_entries(LockMethodProcLockHash);
+ if (data->nelements > els)
+ {
+ els = data->nelements;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * els);
+ }
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance = &data->locks[el];
+
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == proclock->tag.myLock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ instance->waitStart = (TimestampTz) pg_atomic_read_u64(&proc->waitStart);
+
+ el++;
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(el == data->nelements);
+
+ return data;
+}
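+
+/*
+ * Consumption sketch: pg_lock_status(), which backs the pg_locks view, walks
+ * the returned array roughly like this:
+ *
+ *		LockData   *lockData = GetLockStatusData();
+ *		int			i;
+ *
+ *		for (i = 0; i < lockData->nelements; i++)
+ *		{
+ *			LockInstanceData *instance = &lockData->locks[i];
+ *
+ *			... emit one row; instance->fastpath distinguishes entries copied
+ *			from per-backend arrays from those copied from the shared table ...
+ *		}
+ */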
+
+/*
+ * GetBlockerStatusData - Return a summary of the lock manager's state
+ * concerning locks that are blocking the specified PID or any member of
+ * the PID's lock group, for use in a user-level reporting function.
+ *
+ * For each PID within the lock group that is awaiting some heavyweight lock,
+ * the return data includes an array of LockInstanceData objects, which are
+ * the same data structure used by GetLockStatusData; but unlike that function,
+ * this one reports only the PROCLOCKs associated with the lock that that PID
+ * is blocked on. (Hence, all the locktags should be the same for any one
+ * blocked PID.) In addition, we return an array of the PIDs of those backends
+ * that are ahead of the blocked PID in the lock's wait queue. These can be
+ * compared with the PIDs in the LockInstanceData objects to determine which
+ * waiters are ahead of or behind the blocked PID in the queue.
+ *
+ * If blocked_pid isn't a valid backend PID or nothing in its lock group is
+ * waiting on any heavyweight lock, return empty arrays.
+ *
+ * The design goal is to hold the LWLocks for as short a time as possible;
+ * thus, this function simply makes a copy of the necessary data and releases
+ * the locks, allowing the caller to contemplate and format the data for as
+ * long as it pleases.
+ */
+BlockedProcsData *
+GetBlockerStatusData(int blocked_pid)
+{
+ BlockedProcsData *data;
+ PGPROC *proc;
+ int i;
+
+ data = (BlockedProcsData *) palloc(sizeof(BlockedProcsData));
+
+ /*
+ * Guess how much space we'll need, and preallocate. Most of the time
+ * this will avoid needing to do repalloc while holding the LWLocks. (We
+ * assume, but check with an Assert, that MaxBackends is enough entries
+ * for the procs[] array; the other two could need enlargement, though.)
+ */
+ data->nprocs = data->nlocks = data->npids = 0;
+ data->maxprocs = data->maxlocks = data->maxpids = MaxBackends;
+ data->procs = (BlockedProcData *) palloc(sizeof(BlockedProcData) * data->maxprocs);
+ data->locks = (LockInstanceData *) palloc(sizeof(LockInstanceData) * data->maxlocks);
+ data->waiter_pids = (int *) palloc(sizeof(int) * data->maxpids);
+
+ /*
+ * In order to search the ProcArray for blocked_pid and assume that that
+ * entry won't immediately disappear under us, we must hold ProcArrayLock.
+ * In addition, to examine the lock grouping fields of any other backend,
+ * we must hold all the hash partition locks. (Only one of those locks is
+ * actually relevant for any one lock group, but we can't know which one
+ * ahead of time.) It's fairly annoying to hold all those locks
+ * throughout this, but it's no worse than GetLockStatusData(), and it
+ * does have the advantage that we're guaranteed to return a
+ * self-consistent instantaneous state.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ proc = BackendPidGetProcWithLock(blocked_pid);
+
+ /* Nothing to do if it's gone */
+ if (proc != NULL)
+ {
+ /*
+ * Acquire lock on the entire shared lock data structure. See notes
+ * in GetLockStatusData().
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Easy case, proc is not a lock group member */
+ GetSingleProcBlockerStatusData(proc, data);
+ }
+ else
+ {
+ /* Examine all procs in proc's lock group */
+ dlist_iter iter;
+
+ dlist_foreach(iter, &proc->lockGroupLeader->lockGroupMembers)
+ {
+ PGPROC *memberProc;
+
+ memberProc = dlist_container(PGPROC, lockGroupLink, iter.cur);
+ GetSingleProcBlockerStatusData(memberProc, data);
+ }
+ }
+
+ /*
+ * And release locks. See notes in GetLockStatusData().
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ Assert(data->nprocs <= data->maxprocs);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return data;
+}
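+
+/*
+ * Reading the result (sketch): each procs[] element carries index ranges into
+ * the flat locks[] and waiter_pids[] arrays, so a caller such as
+ * pg_blocking_pids() can iterate per blocked process:
+ *
+ *		for (i = 0; i < data->nprocs; i++)
+ *		{
+ *			BlockedProcData *bproc = &data->procs[i];
+ *			LockInstanceData *instances = &data->locks[bproc->first_lock];
+ *			int		   *preceding_pids = &data->waiter_pids[bproc->first_waiter];
+ *
+ *			... examine bproc->num_locks lock instances and bproc->num_waiters
+ *			queue-preceding PIDs ...
+ *		}
+ */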
+
+/* Accumulate data about one possibly-blocked proc for GetBlockerStatusData */
+static void
+GetSingleProcBlockerStatusData(PGPROC *blocked_proc, BlockedProcsData *data)
+{
+ LOCK *theLock = blocked_proc->waitLock;
+ BlockedProcData *bproc;
+ dlist_iter proclock_iter;
+ dlist_iter proc_iter;
+ dclist_head *waitQueue;
+ int queue_size;
+
+ /* Nothing to do if this proc is not blocked */
+ if (theLock == NULL)
+ return;
+
+ /* Set up a procs[] element */
+ bproc = &data->procs[data->nprocs++];
+ bproc->pid = blocked_proc->pid;
+ bproc->first_lock = data->nlocks;
+ bproc->first_waiter = data->npids;
+
+ /*
+ * We may ignore the proc's fast-path arrays, since nothing in those could
+ * be related to a contended lock.
+ */
+
+ /* Collect all PROCLOCKs associated with theLock */
+ dlist_foreach(proclock_iter, &theLock->procLocks)
+ {
+ PROCLOCK *proclock =
+ dlist_container(PROCLOCK, lockLink, proclock_iter.cur);
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance;
+
+ if (data->nlocks >= data->maxlocks)
+ {
+ data->maxlocks += MaxBackends;
+ data->locks = (LockInstanceData *)
+ repalloc(data->locks, sizeof(LockInstanceData) * data->maxlocks);
+ }
+
+ instance = &data->locks[data->nlocks];
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == lock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->leaderPid = proclock->groupLeader->pid;
+ instance->fastpath = false;
+ data->nlocks++;
+ }
+
+ /* Enlarge waiter_pids[] if it's too small to hold all wait queue PIDs */
+ waitQueue = &(theLock->waitProcs);
+ queue_size = dclist_count(waitQueue);
+
+ if (queue_size > data->maxpids - data->npids)
+ {
+ data->maxpids = Max(data->maxpids + MaxBackends,
+ data->npids + queue_size);
+ data->waiter_pids = (int *) repalloc(data->waiter_pids,
+ sizeof(int) * data->maxpids);
+ }
+
+ /* Collect PIDs from the lock's wait queue, stopping at blocked_proc */
+ dclist_foreach(proc_iter, waitQueue)
+ {
+ PGPROC *queued_proc = dlist_container(PGPROC, links, proc_iter.cur);
+
+ if (queued_proc == blocked_proc)
+ break;
+ data->waiter_pids[data->npids++] = queued_proc->pid;
+ }
+
+ bproc->num_locks = data->nlocks - bproc->first_lock;
+ bproc->num_waiters = data->npids - bproc->first_waiter;
+}
+
+/*
+ * Returns a list of currently held AccessExclusiveLocks, for use by
+ * LogStandbySnapshot(). The result is a palloc'd array,
+ * with the number of elements returned into *nlocks.
+ *
+ * XXX This currently takes a lock on all partitions of the lock table,
+ * but it's possible to do better. By reference counting locks and storing
+ * the value in the ProcArray entry for each backend we could tell if any
+ * locks need recording without having to acquire the partition locks and
+ * scan the lock table. Whether that's worth the additional overhead
+ * is pretty dubious though.
+ */
+xl_standby_lock *
+GetRunningTransactionLocks(int *nlocks)
+{
+ xl_standby_lock *accessExclusiveLocks;
+ PROCLOCK *proclock;
+ HASH_SEQ_STATUS seqstat;
+ int i;
+ int index;
+ int els;
+
+ /*
+ * Acquire lock on the entire shared lock data structure.
+ *
+ * Must grab LWLocks in partition-number order to avoid LWLock deadlock.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
+
+ /* Now we can safely count the number of proclocks */
+ els = hash_get_num_entries(LockMethodProcLockHash);
+
+ /*
+ * Allocating enough space for all locks in the lock table is overkill,
+ * but it's more convenient and faster than having to enlarge the array.
+ */
+ accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock));
+
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHash);
+
+ /*
+ * If lock is a currently granted AccessExclusiveLock then it will have
+ * just one proclock holder, so locks are never accessed twice in this
+ * particular case. Don't copy this code for use elsewhere because in the
+ * general case this will give you duplicate locks when looking at
+ * non-exclusive lock types.
+ */
+ index = 0;
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ /* make sure this definition matches the one used in LockAcquire */
+ if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) &&
+ proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ TransactionId xid = proc->xid;
+
+ /*
+ * Don't record locks for transactions if we know they have
+ * already issued their WAL record for commit but have not yet released
+ * the lock. It is still possible that we see locks held by already
+ * complete transactions, if they haven't yet zeroed their xids.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ accessExclusiveLocks[index].xid = xid;
+ accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1;
+ accessExclusiveLocks[index].relOid = lock->tag.locktag_field2;
+
+ index++;
+ }
+ }
+
+ Assert(index <= els);
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+
+ *nlocks = index;
+ return accessExclusiveLocks;
+}
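+
+/*
+ * Usage sketch: LogStandbySnapshot() in standby.c does approximately
+ *
+ *		locks = GetRunningTransactionLocks(&nlocks);
+ *		if (nlocks > 0)
+ *			LogAccessExclusiveLocks(nlocks, locks);
+ *		pfree(locks);
+ *
+ * so that a hot-standby server can re-acquire equivalent AccessExclusiveLocks
+ * during WAL replay.
+ */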
+
+/* Provide the textual name of any lock mode */
+const char *
+GetLockmodeName(LOCKMETHODID lockmethodid, LOCKMODE mode)
+{
+ Assert(lockmethodid > 0 && lockmethodid < lengthof(LockMethods));
+ Assert(mode > 0 && mode <= LockMethods[lockmethodid]->numLockModes);
+ return LockMethods[lockmethodid]->lockModeNames[mode];
+}
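+
+/*
+ * For example (sketch), deadlock reports and log messages typically call
+ *
+ *		GetLockmodeName(lock->tag.locktag_lockmethodid, lockmode)
+ *
+ * to obtain user-facing strings such as "AccessShareLock".
+ */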
+
+#ifdef LOCK_DEBUG
+/*
+ * Dump all locks in the given proc's myProcLocks lists.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpLocks(PGPROC *proc)
+{
+ int i;
+
+ if (proc == NULL)
+ return;
+
+ if (proc->waitLock)
+ LOCK_PRINT("DumpLocks: waiting on", proc->waitLock, 0);
+
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ {
+ dlist_head *procLocks = &proc->myProcLocks[i];
+ dlist_iter iter;
+
+ dlist_foreach(iter, procLocks)
+ {
+ PROCLOCK *proclock = dlist_container(PROCLOCK, procLink, iter.cur);
+ LOCK *lock = proclock->tag.myLock;
+
+ Assert(proclock->tag.myProc == proc);
+ PROCLOCK_PRINT("DumpLocks", proclock);
+ LOCK_PRINT("DumpLocks", lock, 0);
+ }
+ }
+}
+
+/*
+ * Dump all lmgr locks.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+void
+DumpAllLocks(void)
+{
+ PGPROC *proc;
+ PROCLOCK *proclock;
+ LOCK *lock;
+ HASH_SEQ_STATUS status;
+
+ proc = MyProc;
+
+ if (proc && proc->waitLock)
+ LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
+
+ hash_seq_init(&status, LockMethodProcLockHash);
+
+ while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+ lock = proclock->tag.myLock;
+ if (lock)
+ LOCK_PRINT("DumpAllLocks", lock, 0);
+ else
+ elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+ }
+}
+#endif /* LOCK_DEBUG */
+
+/*
+ * LOCK 2PC resource manager's routines
+ */
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared.
+ *
+ * Because this function is run at db startup, re-acquiring the locks should
+ * never conflict with running transactions because there are none. We
+ * assume that the lock state represented by the stored 2PC files is legal.
+ *
+ * When switching from Hot Standby mode to normal operation, the locks will
+ * be already held by the startup process. The locks are acquired for the new
+ * procs without checking for conflicts, so we don't get a conflict between the
+ * startup process and the dummy procs, even though we will momentarily have
+ * a situation where two procs are holding the same AccessExclusiveLock,
+ * which isn't normally possible because of the lock conflict. If we're in standby
+ * mode, but a recovery snapshot hasn't been established yet, it's possible
+ * that some but not all of the locks are already held by the startup process.
+ *
+ * This approach is simple, but also a bit dangerous, because if there isn't
+ * enough shared memory to acquire the locks, an error will be thrown, which
+ * is promoted to FATAL and recovery will abort, bringing down the postmaster.
+ * A safer approach would be to transfer the locks like we do in
+ * AtPrepare_Locks, but then again, in hot standby mode it's possible for
+ * read-only backends to use up all the shared lock memory anyway, so that
+ * replaying the WAL record that needs to acquire a lock will throw an error
+ * and PANIC anyway.
+ */
+void
+lock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, false);
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+ LOCK *lock;
+ PROCLOCK *proclock;
+ PROCLOCKTAG proclocktag;
+ bool found;
+ uint32 hashcode;
+ uint32 proclock_hashcode;
+ int partition;
+ LWLock *partitionLock;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ hashcode = LockTagHashCode(locktag);
+ partition = LockHashPartition(hashcode);
+ partitionLock = LockHashPartitionLock(hashcode);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Find or create a lock with this tag.
+ */
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!lock)
+ {
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ }
+
+ /*
+ * if it's a new lock object, initialize it
+ */
+ if (!found)
+ {
+ lock->grantMask = 0;
+ lock->waitMask = 0;
+ dlist_init(&lock->procLocks);
+ dclist_init(&lock->waitProcs);
+ lock->nRequested = 0;
+ lock->nGranted = 0;
+ MemSet(lock->requested, 0, sizeof(int) * MAX_LOCKMODES);
+ MemSet(lock->granted, 0, sizeof(int) * MAX_LOCKMODES);
+ LOCK_PRINT("lock_twophase_recover: new", lock, lockmode);
+ }
+ else
+ {
+ LOCK_PRINT("lock_twophase_recover: found", lock, lockmode);
+ Assert((lock->nRequested >= 0) && (lock->requested[lockmode] >= 0));
+ Assert((lock->nGranted >= 0) && (lock->granted[lockmode] >= 0));
+ Assert(lock->nGranted <= lock->nRequested);
+ }
+
+ /*
+ * Create the hash key for the proclock table.
+ */
+ proclocktag.myLock = lock;
+ proclocktag.myProc = proc;
+
+ proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
+
+ /*
+ * Find or create a proclock entry with this tag
+ */
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ &proclocktag,
+ proclock_hashcode,
+ HASH_ENTER_NULL,
+ &found);
+ if (!proclock)
+ {
+ /* Oops, not enough shmem for the proclock */
+ if (lock->nRequested == 0)
+ {
+ /*
+ * There are no other requestors of this lock, so garbage-collect
+ * the lock object. We *must* do this to avoid a permanent leak
+ * of shared memory, because there won't be anything to cause
+ * anyone to release the lock object later.
+ */
+ Assert(dlist_is_empty(&lock->procLocks));
+ if (!hash_search_with_hash_value(LockMethodLockHash,
+ &(lock->tag),
+ hashcode,
+ HASH_REMOVE,
+ NULL))
+ elog(PANIC, "lock table corrupted");
+ }
+ LWLockRelease(partitionLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ }
+
+ /*
+ * If new, initialize the new entry
+ */
+ if (!found)
+ {
+ Assert(proc->lockGroupLeader == NULL);
+ proclock->groupLeader = proc;
+ proclock->holdMask = 0;
+ proclock->releaseMask = 0;
+ /* Add proclock to appropriate lists */
+ dlist_push_tail(&lock->procLocks, &proclock->lockLink);
+ dlist_push_tail(&proc->myProcLocks[partition],
+ &proclock->procLink);
+ PROCLOCK_PRINT("lock_twophase_recover: new", proclock);
+ }
+ else
+ {
+ PROCLOCK_PRINT("lock_twophase_recover: found", proclock);
+ Assert((proclock->holdMask & ~lock->grantMask) == 0);
+ }
+
+ /*
+ * lock->nRequested and lock->requested[] count the total number of
+ * requests, whether granted or waiting, so increment those immediately.
+ */
+ lock->nRequested++;
+ lock->requested[lockmode]++;
+ Assert((lock->nRequested > 0) && (lock->requested[lockmode] > 0));
+
+ /*
+ * We shouldn't already hold the desired lock.
+ */
+ if (proclock->holdMask & LOCKBIT_ON(lockmode))
+ elog(ERROR, "lock %s on object %u/%u/%u is already held",
+ lockMethodTable->lockModeNames[lockmode],
+ lock->tag.locktag_field1, lock->tag.locktag_field2,
+ lock->tag.locktag_field3);
+
+ /*
+ * We ignore any possible conflicts and just grant ourselves the lock. Not
+ * only because we don't bother, but also to avoid deadlocks when
+ * switching from standby to normal mode. See function comment.
+ */
+ GrantLock(lock, proclock, lockmode);
+
+ /*
+ * Bump strong lock count, to make sure any fast-path lock requests won't
+ * be granted without consulting the primary lock table.
+ */
+ if (ConflictsWithRelationFastPath(&lock->tag, lockmode))
+ {
+ uint32 fasthashcode = FastPathStrongLockHashPartition(hashcode);
+
+ SpinLockAcquire(&FastPathStrongRelationLocks->mutex);
+ FastPathStrongRelationLocks->count[fasthashcode]++;
+ SpinLockRelease(&FastPathStrongRelationLocks->mutex);
+ }
+
+ LWLockRelease(partitionLock);
+}
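+
+/*
+ * For illustration only: the TwoPhaseLockRecord consumed above is written at
+ * PREPARE time, roughly as in this simplified sketch (not the exact
+ * AtPrepare_Locks code; "tag" and "mode" are placeholder locals):
+ *
+ *		TwoPhaseLockRecord record;
+ *
+ *		record.locktag = tag;
+ *		record.lockmode = mode;
+ *		RegisterTwoPhaseRecord(TWOPHASE_RM_LOCK_ID, 0,
+ *							   &record, sizeof(TwoPhaseLockRecord));
+ */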
+
+/*
+ * Re-acquire a lock belonging to a transaction that was prepared, when
+ * starting up into hot standby mode.
+ */
+void
+lock_twophase_standby_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ LOCKTAG *locktag;
+ LOCKMODE lockmode;
+ LOCKMETHODID lockmethodid;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmode = rec->lockmode;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ if (lockmode == AccessExclusiveLock &&
+ locktag->locktag_type == LOCKTAG_RELATION)
+ {
+ StandbyAcquireAccessExclusiveLock(xid,
+ locktag->locktag_field1 /* dboid */ ,
+ locktag->locktag_field2 /* reloid */ );
+ }
+}
+
+
+/*
+ * 2PC processing routine for COMMIT PREPARED case.
+ *
+ * Find and release the lock indicated by the 2PC record.
+ */
+void
+lock_twophase_postcommit(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhaseLockRecord *rec = (TwoPhaseLockRecord *) recdata;
+ PGPROC *proc = TwoPhaseGetDummyProc(xid, true);
+ LOCKTAG *locktag;
+ LOCKMETHODID lockmethodid;
+ LockMethod lockMethodTable;
+
+ Assert(len == sizeof(TwoPhaseLockRecord));
+ locktag = &rec->locktag;
+ lockmethodid = locktag->locktag_lockmethodid;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+ lockMethodTable = LockMethods[lockmethodid];
+
+ LockRefindAndRelease(lockMethodTable, proc, locktag, rec->lockmode, true);
+}
+
+/*
+ * 2PC processing routine for ROLLBACK PREPARED case.
+ *
+ * This is actually just the same as the COMMIT case.
+ */
+void
+lock_twophase_postabort(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ lock_twophase_postcommit(xid, info, recdata, len);
+}
+
+/*
+ * VirtualXactLockTableInsert
+ *
+ * Take vxid lock via the fast-path. There can't be any pre-existing
+ * lockers, as we haven't advertised this vxid via the ProcArray yet.
+ *
+ * Since MyProc->fpLocalTransactionId will normally contain the same data
+ * as MyProc->lxid, you might wonder if we really need both. The
+ * difference is that MyProc->lxid is set and cleared unlocked, and
+ * examined by procarray.c, while fpLocalTransactionId is protected by
+ * fpInfoLock and is used only by the locking subsystem. Doing it this
+ * way makes it easier to verify that there are no funny race conditions.
+ *
+ * We don't bother recording this lock in the local lock table, since it's
+ * only ever released at the end of a transaction. Instead,
+ * LockReleaseAll() calls VirtualXactLockTableCleanup().
+ */
+void
+VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ Assert(MyProc->backendId == vxid.backendId);
+ Assert(MyProc->fpLocalTransactionId == InvalidLocalTransactionId);
+ Assert(MyProc->fpVXIDLock == false);
+
+ MyProc->fpVXIDLock = true;
+ MyProc->fpLocalTransactionId = vxid.localTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+}
+
+/*
+ * VirtualXactLockTableCleanup
+ *
+ * Check whether a VXID lock has been materialized; if so, release it,
+ * unblocking waiters.
+ */
+void
+VirtualXactLockTableCleanup(void)
+{
+ bool fastpath;
+ LocalTransactionId lxid;
+
+ Assert(MyProc->backendId != InvalidBackendId);
+
+ /*
+ * Clean up shared memory state.
+ */
+ LWLockAcquire(&MyProc->fpInfoLock, LW_EXCLUSIVE);
+
+ fastpath = MyProc->fpVXIDLock;
+ lxid = MyProc->fpLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+
+ LWLockRelease(&MyProc->fpInfoLock);
+
+ /*
+ * If fpVXIDLock has been cleared without touching fpLocalTransactionId,
+ * that means someone transferred the lock to the main lock table.
+ */
+ if (!fastpath && LocalTransactionIdIsValid(lxid))
+ {
+ VirtualTransactionId vxid;
+ LOCKTAG locktag;
+
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = lxid;
+ SET_LOCKTAG_VIRTUALTRANSACTION(locktag, vxid);
+
+ LockRefindAndRelease(LockMethods[DEFAULT_LOCKMETHOD], MyProc,
+ &locktag, ExclusiveLock, false);
+ }
+}
+
+/*
+ * XactLockForVirtualXact
+ *
+ * If TransactionIdIsValid(xid), this is essentially XactLockTableWait(xid,
+ * NULL, NULL, XLTW_None) or ConditionalXactLockTableWait(xid). Unlike those
+ * functions, it assumes "xid" is never a subtransaction and that "xid" is
+ * prepared, committed, or aborted.
+ *
+ * If !TransactionIdIsValid(xid), this locks every prepared XID that was
+ * known as "vxid" before its PREPARE TRANSACTION.
+ */
+static bool
+XactLockForVirtualXact(VirtualTransactionId vxid,
+ TransactionId xid, bool wait)
+{
+ bool more = false;
+
+ /* There is no point in waiting for 2PCs if there are no 2PCs. */
+ if (max_prepared_xacts == 0)
+ return true;
+
+ do
+ {
+ LockAcquireResult lar;
+ LOCKTAG tag;
+
+ /* Clear state from previous iterations. */
+ if (more)
+ {
+ xid = InvalidTransactionId;
+ more = false;
+ }
+
+ /* If we have no xid, try to find one. */
+ if (!TransactionIdIsValid(xid))
+ xid = TwoPhaseGetXidByVirtualXID(vxid, &more);
+ if (!TransactionIdIsValid(xid))
+ {
+ Assert(!more);
+ return true;
+ }
+
+ /* Check or wait for XID completion. */
+ SET_LOCKTAG_TRANSACTION(tag, xid);
+ lar = LockAcquire(&tag, ShareLock, false, !wait);
+ if (lar == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+ LockRelease(&tag, ShareLock, false);
+ } while (more);
+
+ return true;
+}
+
+/*
+ * VirtualXactLock
+ *
+ * If wait = true, wait as long as the given VXID or any XID acquired by the
+ * same transaction is still running. Then, return true.
+ *
+ * If wait = false, just check whether that VXID or one of those XIDs is still
+ * running, and return true or false.
+ */
+bool
+VirtualXactLock(VirtualTransactionId vxid, bool wait)
+{
+ LOCKTAG tag;
+ PGPROC *proc;
+ TransactionId xid = InvalidTransactionId;
+
+ Assert(VirtualTransactionIdIsValid(vxid));
+
+ if (VirtualTransactionIdIsRecoveredPreparedXact(vxid))
+ /* no vxid lock; localTransactionId is a normal, locked XID */
+ return XactLockForVirtualXact(vxid, vxid.localTransactionId, wait);
+
+ SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+ /*
+ * If a lock table entry must be made, this is the PGPROC on whose behalf
+ * it must be done. Note that the transaction might end or the PGPROC
+ * might be reassigned to a new backend before we get around to examining
+ * it, but it doesn't matter. If we find upon examination that the
+ * relevant lxid is no longer running here, that's enough to prove that
+ * it's no longer running anywhere.
+ */
+ proc = BackendIdGetProc(vxid.backendId);
+ if (proc == NULL)
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+
+ /*
+ * We must acquire this lock before checking the backendId and lxid
+ * against the ones we're waiting for. The target backend will only set
+ * or clear lxid while holding this lock.
+ */
+ LWLockAcquire(&proc->fpInfoLock, LW_EXCLUSIVE);
+
+ if (proc->backendId != vxid.backendId
+ || proc->fpLocalTransactionId != vxid.localTransactionId)
+ {
+ /* VXID ended */
+ LWLockRelease(&proc->fpInfoLock);
+ return XactLockForVirtualXact(vxid, InvalidTransactionId, wait);
+ }
+
+ /*
+ * If we aren't asked to wait, there's no need to set up a lock table
+ * entry. The transaction is still in progress, so just return false.
+ */
+ if (!wait)
+ {
+ LWLockRelease(&proc->fpInfoLock);
+ return false;
+ }
+
+ /*
+ * OK, we're going to need to sleep on the VXID. But first, we must set
+ * up the primary lock table entry, if needed (ie, convert the proc's
+ * fast-path lock on its VXID to a regular lock).
+ */
+ if (proc->fpVXIDLock)
+ {
+ PROCLOCK *proclock;
+ uint32 hashcode;
+ LWLock *partitionLock;
+
+ hashcode = LockTagHashCode(&tag);
+
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ proclock = SetupLockInTable(LockMethods[DEFAULT_LOCKMETHOD], proc,
+ &tag, hashcode, ExclusiveLock);
+ if (!proclock)
+ {
+ LWLockRelease(partitionLock);
+ LWLockRelease(&proc->fpInfoLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_locks_per_transaction")));
+ }
+ GrantLock(proclock->tag.myLock, proclock, ExclusiveLock);
+
+ LWLockRelease(partitionLock);
+
+ proc->fpVXIDLock = false;
+ }
+
+ /*
+ * If the proc has an XID now, we'll avoid a TwoPhaseGetXidByVirtualXID()
+ * search. The proc might have assigned this XID but not yet locked it,
+ * in which case the proc will lock this XID before releasing the VXID.
+ * The fpInfoLock critical section excludes VirtualXactLockTableCleanup(),
+ * so we won't save an XID of a different VXID. It doesn't matter whether
+ * we save this before or after setting up the primary lock table entry.
+ */
+ xid = proc->xid;
+
+ /* Done with proc->fpLockBits */
+ LWLockRelease(&proc->fpInfoLock);
+
+ /* Time to wait. */
+ (void) LockAcquire(&tag, ShareLock, false, false);
+
+ LockRelease(&tag, ShareLock, false);
+ return XactLockForVirtualXact(vxid, xid, wait);
+}
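+
+/*
+ * Typical caller pattern, sketched for illustration (loosely the shape of
+ * WaitForLockers-style code; "locktag" is a placeholder): collect the
+ * conflicting VXIDs for a lock tag and wait each of them out.
+ *
+ *		VirtualTransactionId *vxids;
+ *		int			nvxids;
+ *
+ *		vxids = GetLockConflicts(&locktag, AccessExclusiveLock, &nvxids);
+ *		while (VirtualTransactionIdIsValid(*vxids))
+ *			(void) VirtualXactLock(*vxids++, true);
+ */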
+
+/*
+ * LockWaiterCount
+ *
+ * Find the number of lock requesters on this locktag
+ */
+int
+LockWaiterCount(const LOCKTAG *locktag)
+{
+ LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
+ LOCK *lock;
+ bool found;
+ uint32 hashcode;
+ LWLock *partitionLock;
+ int waiters = 0;
+
+ if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
+ elog(ERROR, "unrecognized lock method: %d", lockmethodid);
+
+ hashcode = LockTagHashCode(locktag);
+ partitionLock = LockHashPartitionLock(hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ lock = (LOCK *) hash_search_with_hash_value(LockMethodLockHash,
+ locktag,
+ hashcode,
+ HASH_FIND,
+ &found);
+ if (found)
+ {
+ Assert(lock != NULL);
+ waiters = lock->nRequested;
+ }
+ LWLockRelease(partitionLock);
+
+ return waiters;
+}
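+
+/*
+ * Illustrative use of LockWaiterCount (a sketch; "reloid" is a placeholder
+ * relation OID): check how many backends currently have granted or pending
+ * requests on a relation's heavyweight lock before doing optional work.
+ *
+ *		LOCKTAG		tag;
+ *
+ *		SET_LOCKTAG_RELATION(tag, MyDatabaseId, reloid);
+ *		if (LockWaiterCount(&tag) > 1)
+ *			... back off, other backends are interested ...
+ */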
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..01d738f
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1973 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ * Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures. Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object). There are few other frammishes. User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value. The variable is initially not set
+ * when the lock is acquired with LWLockAcquire, i.e. it remains set to the
+ * value it was set to when the lock was released last, and can be updated
+ * without releasing the lock by calling LWLockUpdateVar. LWLockWaitForVar
+ * waits for the variable to be updated, or until the lock is free. When
+ * releasing the lock with LWLockReleaseClearVar() the value can be set to an
+ * appropriate value for a free lock. The meaning of the variable is up to
+ * the caller, the lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently. Often we were spinning in the (obviously exclusive) spinlock,
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use atomic
+ * operations to acquire the lock. That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable. For exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement to release the lock. If the
+ * new value is zero (we get that atomically), we know we can/have to release
+ * waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately, by the time we have finished
+ * queuing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * To mitigate those races we use a two-phase attempt at locking:
+ * Phase 1: Try to do it atomically, if we succeed, nice
+ * Phase 2: Add ourselves to the waitqueue of the lock
+ * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem from above as nobody can release too
+ * quickly before we're queued, since after Phase 2 we're already queued.
+ * -------------------------------------------------------------------------
+ */
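+
+/*
+ * Basic usage pattern, sketched for illustration ("MyControlLock" and
+ * "shared_struct" are placeholders, not real objects):
+ *
+ *		LWLockAcquire(MyControlLock, LW_EXCLUSIVE);
+ *		shared_struct->counter++;
+ *		LWLockRelease(MyControlLock);
+ *
+ * Readers that only inspect the structure would acquire LW_SHARED instead.
+ */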
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "port/pg_bitutils.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30)
+#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29)
+#define LW_FLAG_LOCKED ((uint32) 1 << 28)
+
+#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24)
+#define LW_VAL_SHARED 1
+
+#define LW_LOCK_MASK ((uint32) ((1 << 25)-1))
+/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */
+#define LW_SHARED_MASK ((uint32) ((1 << 24)-1))
+
+StaticAssertDecl(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
+ "MAX_BACKENDS too big for lwlock.c");
+
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche. The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche. The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
+ */
+extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */
+
+static const char *const BuiltinTrancheNames[] = {
+ /* LWTRANCHE_XACT_BUFFER: */
+ "XactBuffer",
+ /* LWTRANCHE_COMMITTS_BUFFER: */
+ "CommitTsBuffer",
+ /* LWTRANCHE_SUBTRANS_BUFFER: */
+ "SubtransBuffer",
+ /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
+ "MultiXactOffsetBuffer",
+ /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
+ "MultiXactMemberBuffer",
+ /* LWTRANCHE_NOTIFY_BUFFER: */
+ "NotifyBuffer",
+ /* LWTRANCHE_SERIAL_BUFFER: */
+ "SerialBuffer",
+ /* LWTRANCHE_WAL_INSERT: */
+ "WALInsert",
+ /* LWTRANCHE_BUFFER_CONTENT: */
+ "BufferContent",
+ /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
+ "ReplicationOriginState",
+ /* LWTRANCHE_REPLICATION_SLOT_IO: */
+ "ReplicationSlotIO",
+ /* LWTRANCHE_LOCK_FASTPATH: */
+ "LockFastPath",
+ /* LWTRANCHE_BUFFER_MAPPING: */
+ "BufferMapping",
+ /* LWTRANCHE_LOCK_MANAGER: */
+ "LockManager",
+ /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */
+ "PredicateLockManager",
+ /* LWTRANCHE_PARALLEL_HASH_JOIN: */
+ "ParallelHashJoin",
+ /* LWTRANCHE_PARALLEL_QUERY_DSA: */
+ "ParallelQueryDSA",
+ /* LWTRANCHE_PER_SESSION_DSA: */
+ "PerSessionDSA",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */
+ "PerSessionRecordType",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
+ "PerSessionRecordTypmod",
+ /* LWTRANCHE_SHARED_TUPLESTORE: */
+ "SharedTupleStore",
+ /* LWTRANCHE_SHARED_TIDBITMAP: */
+ "SharedTidBitmap",
+ /* LWTRANCHE_PARALLEL_APPEND: */
+ "ParallelAppend",
+ /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
+ "PerXactPredicateList",
+ /* LWTRANCHE_PGSTATS_DSA: */
+ "PgStatsDSA",
+ /* LWTRANCHE_PGSTATS_HASH: */
+ "PgStatsHash",
+ /* LWTRANCHE_PGSTATS_DATA: */
+ "PgStatsData",
+ /* LWTRANCHE_LAUNCHER_DSA: */
+ "LogicalRepLauncherDSA",
+ /* LWTRANCHE_LAUNCHER_HASH: */
+ "LogicalRepLauncherHash",
+};
+
+StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
+ LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS,
+ "missing entries in BuiltinTrancheNames[]");
+
+/*
+ * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and
+ * stores the names of all dynamically-created tranches known to the current
+ * process. Any unused entries in the array will contain NULL.
+ */
+static const char **LWLockTrancheNames = NULL;
+static int LWLockTrancheNamesAllocated = 0;
+
+/*
+ * This points to the main array of LWLocks in shared memory. Backends inherit
+ * the pointer by fork from the postmaster (except in the EXEC_BACKEND case,
+ * where we have special measures to pass it down).
+ */
+LWLockPadded *MainLWLockArray = NULL;
+
+/*
+ * We use this structure to keep track of locked LWLocks for release
+ * during error recovery. Normally, only a few will be held at once, but
+ * occasionally the number can be much higher; for example, the pg_buffercache
+ * extension locks all buffer partitions simultaneously.
+ */
+#define MAX_SIMUL_LWLOCKS 200
+
+/* struct representing the LWLocks we're holding */
+typedef struct LWLockHandle
+{
+ LWLock *lock;
+ LWLockMode mode;
+} LWLockHandle;
+
+static int num_held_lwlocks = 0;
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
+
+/* struct representing the LWLock tranche request for named tranche */
+typedef struct NamedLWLockTrancheRequest
+{
+ char tranche_name[NAMEDATALEN];
+ int num_lwlocks;
+} NamedLWLockTrancheRequest;
+
+static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL;
+static int NamedLWLockTrancheRequestsAllocated = 0;
+
+/*
+ * NamedLWLockTrancheRequests is both the valid length of the request array,
+ * and the length of the shared-memory NamedLWLockTrancheArray later on.
+ * This variable and NamedLWLockTrancheArray are non-static so that
+ * postmaster.c can copy them to child processes in EXEC_BACKEND builds.
+ */
+int NamedLWLockTrancheRequests = 0;
+
+/* points to data in shared memory: */
+NamedLWLockTranche *NamedLWLockTrancheArray = NULL;
+
+static void InitializeLWLocks(void);
+static inline void LWLockReportWaitStart(LWLock *lock);
+static inline void LWLockReportWaitEnd(void);
+static const char *GetLWTrancheName(uint16 trancheId);
+
+#define T_NAME(lock) \
+ GetLWTrancheName((lock)->tranche)
+
+#ifdef LWLOCK_STATS
+typedef struct lwlock_stats_key
+{
+ int tranche;
+ void *instance;
+} lwlock_stats_key;
+
+typedef struct lwlock_stats
+{
+ lwlock_stats_key key;
+ int sh_acquire_count;
+ int ex_acquire_count;
+ int block_count;
+ int dequeue_self_count;
+ int spin_delay_count;
+} lwlock_stats;
+
+static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
+#endif
+
+#ifdef LOCK_DEBUG
+bool Trace_lwlocks = false;
+
+inline static void
+PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ uint32 state = pg_atomic_read_u32(&lock->state);
+
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
+ MyProcPid,
+ where, T_NAME(lock), lock,
+ (state & LW_VAL_EXCLUSIVE) != 0,
+ state & LW_SHARED_MASK,
+ (state & LW_FLAG_HAS_WAITERS) != 0,
+ pg_atomic_read_u32(&lock->nwaiters),
+ (state & LW_FLAG_RELEASE_OK) != 0)));
+ }
+}
+
+inline static void
+LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%s(%s %p): %s", where,
+ T_NAME(lock), lock, msg)));
+ }
+}
+
+#else /* not LOCK_DEBUG */
+#define PRINT_LWDEBUG(a,b,c) ((void)0)
+#define LOG_LWDEBUG(a,b,c) ((void)0)
+#endif /* LOCK_DEBUG */
+
+#ifdef LWLOCK_STATS
+
+static void init_lwlock_stats(void);
+static void print_lwlock_stats(int code, Datum arg);
+static lwlock_stats * get_lwlock_stats_entry(LWLock *lock);
+
+static void
+init_lwlock_stats(void)
+{
+ HASHCTL ctl;
+ static MemoryContext lwlock_stats_cxt = NULL;
+ static bool exit_registered = false;
+
+ if (lwlock_stats_cxt != NULL)
+ MemoryContextDelete(lwlock_stats_cxt);
+
+ /*
+ * The LWLock stats will be updated within a critical section, which
+ * requires allocating new hash entries. Allocations within a critical
+ * section are normally not allowed because running out of memory would
+ * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally
+ * turned on in production, so that's an acceptable risk. The hash entries
+ * are small, so the risk of running out of memory is minimal in practice.
+ */
+ lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext,
+ "LWLock stats",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true);
+
+ ctl.keysize = sizeof(lwlock_stats_key);
+ ctl.entrysize = sizeof(lwlock_stats);
+ ctl.hcxt = lwlock_stats_cxt;
+ lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ if (!exit_registered)
+ {
+ on_shmem_exit(print_lwlock_stats, 0);
+ exit_registered = true;
+ }
+}
+
+static void
+print_lwlock_stats(int code, Datum arg)
+{
+ HASH_SEQ_STATUS scan;
+ lwlock_stats *lwstats;
+
+ hash_seq_init(&scan, lwlock_stats_htab);
+
+ /* Grab an LWLock to keep different backends from mixing reports */
+ LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE);
+
+ while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
+ {
+ fprintf(stderr,
+ "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+ MyProcPid, GetLWTrancheName(lwstats->key.tranche),
+ lwstats->key.instance, lwstats->sh_acquire_count,
+ lwstats->ex_acquire_count, lwstats->block_count,
+ lwstats->spin_delay_count, lwstats->dequeue_self_count);
+ }
+
+ LWLockRelease(&MainLWLockArray[0].lock);
+}
+
+static lwlock_stats *
+get_lwlock_stats_entry(LWLock *lock)
+{
+ lwlock_stats_key key;
+ lwlock_stats *lwstats;
+ bool found;
+
+ /*
+ * During shared memory initialization, the hash table doesn't exist yet.
+ * Stats of that phase aren't very interesting, so just collect operations
+ * on all locks in a single dummy entry.
+ */
+ if (lwlock_stats_htab == NULL)
+ return &lwlock_stats_dummy;
+
+ /* Fetch or create the entry. */
+ MemSet(&key, 0, sizeof(key));
+ key.tranche = lock->tranche;
+ key.instance = lock;
+ lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ lwstats->sh_acquire_count = 0;
+ lwstats->ex_acquire_count = 0;
+ lwstats->block_count = 0;
+ lwstats->dequeue_self_count = 0;
+ lwstats->spin_delay_count = 0;
+ }
+ return lwstats;
+}
+#endif /* LWLOCK_STATS */
+
+
+/*
+ * Compute number of LWLocks required by named tranches. These will be
+ * allocated in the main array.
+ */
+static int
+NumLWLocksForNamedTranches(void)
+{
+ int numLocks = 0;
+ int i;
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+
+ return numLocks;
+}
+
+/*
+ * Compute shmem space needed for LWLocks and named tranches.
+ */
+Size
+LWLockShmemSize(void)
+{
+ Size size;
+ int i;
+ int numLocks = NUM_FIXED_LWLOCKS;
+
+ /* Calculate total number of locks needed in the main array. */
+ numLocks += NumLWLocksForNamedTranches();
+
+ /* Space for the LWLock array. */
+ size = mul_size(numLocks, sizeof(LWLockPadded));
+
+ /* Space for dynamic allocation counter, plus room for alignment. */
+ size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE);
+
+ /* space for named tranches. */
+ size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche)));
+
+ /* space for name of each tranche. */
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1);
+
+ return size;
+}
+
+/*
+ * Allocate shmem space for the main LWLock array and all tranches and
+ * initialize it. We also register extension LWLock tranches here.
+ */
+void
+CreateLWLocks(void)
+{
+ if (!IsUnderPostmaster)
+ {
+ Size spaceLocks = LWLockShmemSize();
+ int *LWLockCounter;
+ char *ptr;
+
+ /* Allocate space */
+ ptr = (char *) ShmemAlloc(spaceLocks);
+
+ /* Leave room for dynamic allocation of tranches */
+ ptr += sizeof(int);
+
+ /* Ensure desired alignment of LWLock array */
+ ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE;
+
+ MainLWLockArray = (LWLockPadded *) ptr;
+
+ /*
+ * Initialize the dynamic-allocation counter for tranches, which is
+ * stored just before the first LWLock.
+ */
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* Initialize all LWLocks */
+ InitializeLWLocks();
+ }
+
+ /* Register named extension LWLock tranches in the current process. */
+ for (int i = 0; i < NamedLWLockTrancheRequests; i++)
+ LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId,
+ NamedLWLockTrancheArray[i].trancheName);
+}
+
+/*
+ * Initialize LWLocks that are fixed and those belonging to named tranches.
+ */
+static void
+InitializeLWLocks(void)
+{
+ int numNamedLocks = NumLWLocksForNamedTranches();
+ int id;
+ int i;
+ int j;
+ LWLockPadded *lock;
+
+ /* Initialize all individual LWLocks in main array */
+ for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++)
+ LWLockInitialize(&lock->lock, id);
+
+ /* Initialize buffer mapping LWLocks in main array */
+ lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING);
+
+ /* Initialize lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER);
+
+ /* Initialize predicate lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER);
+
+ /*
+ * Copy the info about any named tranches into shared memory (so that
+ * other processes can see it), and initialize the requested LWLocks.
+ */
+ if (NamedLWLockTrancheRequests > 0)
+ {
+ char *trancheNames;
+
+ NamedLWLockTrancheArray = (NamedLWLockTranche *)
+ &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks];
+
+ trancheNames = (char *) NamedLWLockTrancheArray +
+ (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche));
+ lock = &MainLWLockArray[NUM_FIXED_LWLOCKS];
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ NamedLWLockTrancheRequest *request;
+ NamedLWLockTranche *tranche;
+ char *name;
+
+ request = &NamedLWLockTrancheRequestArray[i];
+ tranche = &NamedLWLockTrancheArray[i];
+
+ name = trancheNames;
+ trancheNames += strlen(request->tranche_name) + 1;
+ strcpy(name, request->tranche_name);
+ tranche->trancheId = LWLockNewTrancheId();
+ tranche->trancheName = name;
+
+ for (j = 0; j < request->num_lwlocks; j++, lock++)
+ LWLockInitialize(&lock->lock, tranche->trancheId);
+ }
+ }
+}
+
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+ init_lwlock_stats();
+#endif
+}
+
+/*
+ * GetNamedLWLockTranche - returns the base address of the LWLocks belonging
+ * to the specified tranche.
+ *
+ * Caller needs to retrieve the requested number of LWLocks starting from
+ * the base lock address returned by this API. This can be used for
+ * tranches that are requested by using RequestNamedLWLockTranche() API.
+ */
+LWLockPadded *
+GetNamedLWLockTranche(const char *tranche_name)
+{
+ int lock_pos;
+ int i;
+
+ /*
+ * Obtain the position of the base address of the LWLocks belonging to the
+ * requested tranche_name in MainLWLockArray. LWLocks for named tranches are
+ * placed in MainLWLockArray after the fixed locks.
+ */
+ lock_pos = NUM_FIXED_LWLOCKS;
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name,
+ tranche_name) == 0)
+ return &MainLWLockArray[lock_pos];
+
+ lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+ }
+
+ elog(ERROR, "requested tranche is not registered");
+
+ /* just to keep compiler quiet */
+ return NULL;
+}
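+
+/*
+ * Illustrative extension usage (a sketch; the tranche name "my_ext" and the
+ * hook placement are placeholders).  In the extension's shmem_request_hook:
+ *
+ *		RequestNamedLWLockTranche("my_ext", 4);
+ *
+ * Later, once shared memory exists (e.g. in its shmem_startup_hook):
+ *
+ *		LWLockPadded *locks = GetNamedLWLockTranche("my_ext");
+ *
+ *		LWLockAcquire(&locks[0].lock, LW_EXCLUSIVE);
+ *		...
+ *		LWLockRelease(&locks[0].lock);
+ */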
+
+/*
+ * Allocate a new tranche ID.
+ */
+int
+LWLockNewTrancheId(void)
+{
+ int result;
+ int *LWLockCounter;
+
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ SpinLockAcquire(ShmemLock);
+ result = (*LWLockCounter)++;
+ SpinLockRelease(ShmemLock);
+
+ return result;
+}
+
+/*
+ * Register a dynamic tranche name in the lookup table of the current process.
+ *
+ * This routine will save a pointer to the tranche name passed as an argument,
+ * so the name should be allocated in a backend-lifetime context
+ * (shared memory, TopMemoryContext, static constant, or similar).
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+LWLockRegisterTranche(int tranche_id, const char *tranche_name)
+{
+ /* This should only be called for user-defined tranches. */
+ if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED)
+ return;
+
+ /* Convert to array index. */
+ tranche_id -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* If necessary, create or enlarge array. */
+ if (tranche_id >= LWLockTrancheNamesAllocated)
+ {
+ int newalloc;
+
+ newalloc = pg_nextpower2_32(Max(8, tranche_id + 1));
+
+ if (LWLockTrancheNames == NULL)
+ LWLockTrancheNames = (const char **)
+ MemoryContextAllocZero(TopMemoryContext,
+ newalloc * sizeof(char *));
+ else
+ LWLockTrancheNames =
+ repalloc0_array(LWLockTrancheNames, const char *, LWLockTrancheNamesAllocated, newalloc);
+ LWLockTrancheNamesAllocated = newalloc;
+ }
+
+ LWLockTrancheNames[tranche_id] = tranche_name;
+}
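+
+/*
+ * Illustrative use of the dynamic-tranche path (a sketch; "shared_area" and
+ * "my_tranche" are placeholders).  The lock itself must live in shared
+ * memory that the caller has already arranged, and every process touching
+ * it should register the tranche name to get a meaningful wait event:
+ *
+ *		static int	my_tranche_id;
+ *
+ *		my_tranche_id = LWLockNewTrancheId();
+ *		LWLockRegisterTranche(my_tranche_id, "my_tranche");
+ *		LWLockInitialize(&shared_area->lock, my_tranche_id);
+ */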
+
+/*
+ * RequestNamedLWLockTranche
+ * Request that extra LWLocks be allocated during postmaster
+ * startup.
+ *
+ * This may only be called via the shmem_request_hook of a library that is
+ * loaded into the postmaster via shared_preload_libraries. Calls from
+ * elsewhere will fail.
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
+{
+ NamedLWLockTrancheRequest *request;
+
+ if (!process_shmem_requests_in_progress)
+ elog(FATAL, "cannot request additional LWLocks outside shmem_request_hook");
+
+ if (NamedLWLockTrancheRequestArray == NULL)
+ {
+ NamedLWLockTrancheRequestsAllocated = 16;
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ MemoryContextAlloc(TopMemoryContext,
+ NamedLWLockTrancheRequestsAllocated
+ * sizeof(NamedLWLockTrancheRequest));
+ }
+
+ if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+ {
+ int i = pg_nextpower2_32(NamedLWLockTrancheRequests + 1);
+
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ repalloc(NamedLWLockTrancheRequestArray,
+ i * sizeof(NamedLWLockTrancheRequest));
+ NamedLWLockTrancheRequestsAllocated = i;
+ }
+
+ request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
+ Assert(strlen(tranche_name) + 1 <= NAMEDATALEN);
+ strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);
+ request->num_lwlocks = num_lwlocks;
+ NamedLWLockTrancheRequests++;
+}
+
+/*
+ * LWLockInitialize - initialize a new lwlock; it's initially unlocked
+ */
+void
+LWLockInitialize(LWLock *lock, int tranche_id)
+{
+ pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
+#ifdef LOCK_DEBUG
+ pg_atomic_init_u32(&lock->nwaiters, 0);
+#endif
+ lock->tranche = tranche_id;
+ proclist_init(&lock->waiters);
+}
+
+/*
+ * Report start of wait event for light-weight locks.
+ *
+ * This function is used by all the light-weight lock calls that need
+ * to wait to acquire the lock. It distinguishes the wait event based
+ * on the tranche and lock id.
+ */
+static inline void
+LWLockReportWaitStart(LWLock *lock)
+{
+ pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche);
+}
+
+/*
+ * Report end of wait event for light-weight locks.
+ */
+static inline void
+LWLockReportWaitEnd(void)
+{
+ pgstat_report_wait_end();
+}
+
+/*
+ * Return the name of an LWLock tranche.
+ */
+static const char *
+GetLWTrancheName(uint16 trancheId)
+{
+ /* Individual LWLock? */
+ if (trancheId < NUM_INDIVIDUAL_LWLOCKS)
+ return IndividualLWLockNames[trancheId];
+
+ /* Built-in tranche? */
+ if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+ return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+ /*
+ * It's an extension tranche, so look in LWLockTrancheNames[]. However,
+ * it's possible that the tranche has never been registered in the current
+ * process, in which case give up and return "extension".
+ */
+ trancheId -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ if (trancheId >= LWLockTrancheNamesAllocated ||
+ LWLockTrancheNames[trancheId] == NULL)
+ return "extension";
+
+ return LWLockTrancheNames[trancheId];
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+ Assert(classId == PG_WAIT_LWLOCK);
+ /* The event IDs are just tranche numbers. */
+ return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * callers job.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+ uint32 old_state;
+
+ Assert(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+ /*
+ * Read once outside the loop, later iterations will get the newer value
+ * via compare & exchange.
+ */
+ old_state = pg_atomic_read_u32(&lock->state);
+
+ /* loop until we've determined whether we could acquire the lock or not */
+ while (true)
+ {
+ uint32 desired_state;
+ bool lock_free;
+
+ desired_state = old_state;
+
+ if (mode == LW_EXCLUSIVE)
+ {
+ lock_free = (old_state & LW_LOCK_MASK) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_EXCLUSIVE;
+ }
+ else
+ {
+ lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_SHARED;
+ }
+
+ /*
+ * Attempt to swap in the state we are expecting. If we didn't see the
+ * lock as free, that's just the old value. If we saw it as free,
+ * we'll attempt to mark it acquired. The reason that we always swap
+ * in the value is that this doubles as a memory barrier. We could try
+ * to be smarter and only swap in values if we saw the lock as free,
+ * but benchmarks haven't shown that to be beneficial so far.
+ *
+ * Retry if the value changed since we last looked at it.
+ */
+ if (pg_atomic_compare_exchange_u32(&lock->state,
+ &old_state, desired_state))
+ {
+ if (lock_free)
+ {
+ /* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+ if (mode == LW_EXCLUSIVE)
+ lock->owner = MyProc;
+#endif
+ return false;
+ }
+ else
+ return true; /* somebody else has the lock */
+ }
+ }
+ pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * Time spent holding the mutex should be short!
+ */
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+ uint32 old_state;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+ uint32 delays = 0;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ while (true)
+ {
+ /* always try once to acquire lock directly */
+ old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+ if (!(old_state & LW_FLAG_LOCKED))
+ break; /* got lock */
+
+ /* and then spin without atomic operations until lock is released */
+ {
+ SpinDelayStatus delayStatus;
+
+ init_local_spin_delay(&delayStatus);
+
+ while (old_state & LW_FLAG_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ old_state = pg_atomic_read_u32(&lock->state);
+ }
+#ifdef LWLOCK_STATS
+ delays += delayStatus.delays;
+#endif
+ finish_spin_delay(&delayStatus);
+ }
+
+ /*
+ * Retry. The lock might obviously already be re-acquired by the time
+ * we're attempting to get it again.
+ */
+ }
+
+#ifdef LWLOCK_STATS
+ lwstats->spin_delay_count += delays;
+#endif
+}
+
+/*
+ * Unlock the LWLock's wait list.
+ *
+ * Note that it can be more efficient to manipulate flags and release the
+ * locks in a single atomic operation.
+ */
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+ uint32 old_state PG_USED_FOR_ASSERTS_ONLY;
+
+ old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+ Assert(old_state & LW_FLAG_LOCKED);
+}
+
+/*
+ * Wakeup all the lockers that currently have a chance to acquire the lock.
+ */
+static void
+LWLockWakeup(LWLock *lock)
+{
+ bool new_release_ok;
+ bool wokeup_somebody = false;
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ proclist_init(&wakeup);
+
+ new_release_ok = true;
+
+ /* lock wait list while collecting backends to wake up */
+ LWLockWaitListLock(lock);
+
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+ continue;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ {
+ /*
+ * Prevent additional wakeups until retryer gets to run. Backends
+ * that are just waiting for the lock to become free don't retry
+ * automatically.
+ */
+ new_release_ok = false;
+
+ /*
+ * Don't wakeup (further) exclusive locks.
+ */
+ wokeup_somebody = true;
+ }
+
+ /*
+ * Signal that the process isn't on the wait list anymore. This allows
+ * LWLockDequeueSelf() to remove itself from the waitlist with a
+ * proclist_delete(), rather than having to check if it has been
+ * removed from the list.
+ */
+ Assert(waiter->lwWaiting == LW_WS_WAITING);
+ waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+
+ /*
+ * Once we've woken up an exclusive lock, there's no point in waking
+ * up anybody else.
+ */
+ if (waiter->lwWaitMode == LW_EXCLUSIVE)
+ break;
+ }
+
+ Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
+
+ /* unset required flags, and release lock, in one fell swoop */
+ {
+ uint32 old_state;
+ uint32 desired_state;
+
+ old_state = pg_atomic_read_u32(&lock->state);
+ while (true)
+ {
+ desired_state = old_state;
+
+ /* compute desired flags */
+
+ if (new_release_ok)
+ desired_state |= LW_FLAG_RELEASE_OK;
+ else
+ desired_state &= ~LW_FLAG_RELEASE_OK;
+
+ if (proclist_is_empty(&wakeup))
+ desired_state &= ~LW_FLAG_HAS_WAITERS;
+
+ desired_state &= ~LW_FLAG_LOCKED; /* release lock */
+
+ if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+ desired_state))
+ break;
+ }
+ }
+
+ /* Awaken any waiters I removed from the queue. */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+ * Guarantee that lwWaiting being unset only becomes visible once the
+ * unlink from the list has completed. Otherwise the target backend
+ * could be woken up for some other reason and enqueue for a new lock - if
+ * that happens before the list unlink happens, the list would end up
+ * being corrupted.
+ *
+ * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+ * another lock.
+ */
+ pg_write_barrier();
+ waiter->lwWaiting = LW_WS_NOT_WAITING;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+ /*
+ * If we don't have a PGPROC structure, there's no way to wait. This
+ * should never occur, since MyProc should only be null during shared
+ * memory initialization.
+ */
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ if (MyProc->lwWaiting != LW_WS_NOT_WAITING)
+ elog(PANIC, "queueing for lock while waiting on another one");
+
+ LWLockWaitListLock(lock);
+
+ /* setting the flag is protected by the spinlock */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+ MyProc->lwWaiting = LW_WS_WAITING;
+ MyProc->lwWaitMode = mode;
+
+ /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+ if (mode == LW_WAIT_UNTIL_FREE)
+ proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+ else
+ proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+ /* Can release the mutex now */
+ LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+ pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+ bool on_waitlist;
+
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+
+ lwstats->dequeue_self_count++;
+#endif
+
+ LWLockWaitListLock(lock);
+
+ /*
+ * Remove ourselves from the waitlist, unless we've already been removed.
+ * The removal happens with the wait list lock held, so there's no race in
+ * this check.
+ */
+ on_waitlist = MyProc->lwWaiting == LW_WS_WAITING;
+ if (on_waitlist)
+ proclist_delete(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+ if (proclist_is_empty(&lock->waiters) &&
+ (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+ {
+ pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+ }
+
+ /* XXX: combine with fetch_and above? */
+ LWLockWaitListUnlock(lock);
+
+ /* clear waiting state again, nice for debugging */
+ if (on_waitlist)
+ MyProc->lwWaiting = LW_WS_NOT_WAITING;
+ else
+ {
+ int extraWaits = 0;
+
+ /*
+ * Somebody else dequeued us and has or will wake us up. Deal with the
+ * superfluous absorption of a wakeup.
+ */
+
+ /*
+ * Reset RELEASE_OK flag if somebody woke us before we removed
+ * ourselves - they'll have set it to false.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+ * get reset at some inconvenient point later. Most of the time this
+ * will immediately return.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (MyProc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(MyProc->sem);
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+}
+
+/*
+ * LWLockAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is. Returns true if the lock
+ * was available immediately, false if we had to sleep.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockAcquire(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool result = true;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquire", lock, mode);
+
+#ifdef LWLOCK_STATS
+ /* Count lock acquisition attempts */
+ if (mode == LW_EXCLUSIVE)
+ lwstats->ex_acquire_count++;
+ else
+ lwstats->sh_acquire_count++;
+#endif /* LWLOCK_STATS */
+
+ /*
+ * We can't wait if we haven't got a PGPROC. This should only occur
+ * during bootstrap or shared memory initialization. Put an Assert here
+ * to catch unsafe coding practices.
+ */
+ Assert(!(proc == NULL && IsUnderPostmaster));
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to try to acquire lock after each time we are signaled by
+ * LWLockRelease.
+ *
+ * NOTE: it might seem better to have LWLockRelease actually grant us the
+ * lock, rather than retrying and possibly having to go back to sleep. But
+ * in practice that is no good because it means a process swap for every
+ * lock acquisition when two or more processes are contending for the same
+ * lock. Since LWLocks are normally used to protect not-very-long
+ * sections of computation, a process needs to be able to acquire and
+ * release the same lock many times during a single CPU time slice, even
+ * in the presence of contention. The efficiency of being able to do that
+ * outweighs the inefficiency of sometimes wasting a process dispatch
+ * cycle because the lock is not free when a released waiter finally gets
+ * to run. See pgsql-hackers archives for 29-Dec-01.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ /*
+ * Try to grab the lock the first time; we're not in the waitqueue
+ * yet/anymore.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
+ break; /* got the lock */
+ }
+
+ /*
+ * Ok, at this point we couldn't grab the lock on the first try. We
+ * cannot simply queue ourselves to the end of the list and wait to be
+ * woken up because by now the lock could long since have been released.
+ * Instead add us to the queue and try to grab the lock again. If we
+ * succeed we need to revert the queuing and be happy, otherwise we
+ * recheck the lock. If we still couldn't grab it, we know that the
+ * other locker will see our queue entries when releasing since they
+ * existed before we checked for the lock.
+ */
+
+ /* add to the queue */
+ LWLockQueueSelf(lock, mode);
+
+ /* we're now guaranteed to be woken up if necessary */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ /* ok, grabbed the lock the second time round, need to undo queueing */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (proc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+ /* Retrying, allow LWLockRelease to release waiters again. */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquire", lock, "awakened");
+
+ /* Now loop back and try to acquire lock again. */
+ result = false;
+ }
+
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);
+
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ return result;
+}
+
+/*
+ * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, return false with no side-effects.
+ *
+ * If successful, cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
+{
+ bool mustwait;
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /* Check for the lock */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+
+ LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode);
+ }
+ return !mustwait;
+}
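+
+/*
+ * Typical try-lock pattern, sketched for illustration ("lock" and the work
+ * being skipped are placeholders):
+ *
+ *		if (LWLockConditionalAcquire(lock, LW_EXCLUSIVE))
+ *		{
+ *			... do the optional work ...
+ *			LWLockRelease(lock);
+ *		}
+ *		else
+ *			... skip it, or fall back to another strategy ...
+ */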
+
+/*
+ * LWLockAcquireOrWait - Acquire lock, or wait until it's free
+ *
+ * The semantics of this function are a bit funky. If the lock is currently
+ * free, it is acquired in the given mode, and the function returns true. If
+ * the lock isn't immediately free, the function waits until it is released
+ * and returns false, but does not acquire the lock.
+ *
+ * This is currently used for WALWriteLock: when a backend flushes the WAL,
+ * holding WALWriteLock, it can flush the commit records of many other
+ * backends as a side-effect. Those other backends need to wait until the
+ * flush finishes, but don't need to acquire the lock anymore. They can just
+ * wake up, observe that their records have already been flushed, and return.
+ */
+bool
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool mustwait;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * NB: We're using nearly the same twice-in-a-row lock acquisition
+ * protocol as LWLockAcquire(). Check its comments for details.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /*
+ * Wait until awakened. Like in LWLockAcquire, be prepared for
+ * bogus wakeups.
+ */
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (proc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+
+ /*
+ * Got lock in the second attempt, undo queueing. We need to treat
+ * this as having successfully acquired the lock, otherwise we'd
+ * not necessarily wake up people we've prevented from acquiring
+ * the lock.
+ */
+ LWLockDequeueSelf(lock);
+ }
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+ }
+
+ return !mustwait;
+}
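+
+/*
+ * Illustrative sketch (not part of this patch): a WAL-flushing caller, as
+ * described in the comment above LWLockAcquireOrWait(), might use the result
+ * roughly like this, where XLogFlushRecords() and RecordsAlreadyFlushed()
+ * are hypothetical names for "do the flush" and "our records are on disk":
+ *
+ *		if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+ *		{
+ *			XLogFlushRecords();
+ *			LWLockRelease(WALWriteLock);
+ *		}
+ *		else
+ *			Assert(RecordsAlreadyFlushed());
+ */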
+
+/*
+ * Does the caller need to wait, given the lwlock's current state, for the
+ * variable's value to change?
+ *
+ * If we don't need to wait, and it's because the value of the variable has
+ * changed, store the current value in newval.
+ *
+ * *result is set to true if the lock was free, and false otherwise.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+ uint64 *valptr, uint64 oldval, uint64 *newval,
+ bool *result)
+{
+ bool mustwait;
+ uint64 value;
+
+ /*
+ * Test first to see if the lock is free right now.
+ *
+ * XXX: the caller uses a spinlock before this, so we don't need a memory
+ * barrier here as far as the current usage is concerned. But that might
+ * not be safe in general.
+ */
+ mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
+
+ if (!mustwait)
+ {
+ *result = true;
+ return false;
+ }
+
+ *result = false;
+
+ /*
+ * Read value using the lwlock's wait list lock, as we can't generally
+ * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to
+ * do atomic 64 bit reads/writes the spinlock should be optimized away.
+ */
+ LWLockWaitListLock(lock);
+ value = *valptr;
+ LWLockWaitListUnlock(lock);
+
+ if (value != oldval)
+ {
+ mustwait = false;
+ *newval = value;
+ }
+ else
+ {
+ mustwait = true;
+ }
+
+ return mustwait;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar. If the lock is free on exit (immediately or after
+ * waiting), returns true. If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+ PGPROC *proc = MyProc;
+ int extraWaits = 0;
+ bool result = false;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Lock out cancel/die interrupts while we sleep on the lock. There is no
+ * cleanup mechanism to remove us from the wait queue if we got
+ * interrupted.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to check the lock's status after each time we are signaled.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ if (!mustwait)
+ break; /* the lock was free or value didn't match */
+
+ /*
+ * Add myself to wait queue. Note that this is racy, somebody else
+ * could wakeup before we're finished queuing. NB: We're using nearly
+ * the same twice-in-a-row lock acquisition protocol as
+ * LWLockAcquire(). Check its comments for details. The only
+ * difference is that we also have to check the variable's value when
+ * checking the state of the lock.
+ */
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Set RELEASE_OK flag, to make sure we get woken up as soon as the
+ * lock is released.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * We're now guaranteed to be woken up if necessary. Recheck the lock
+ * and variables state.
+ */
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ /* Ok, no conflict after we queued ourselves. Undo queueing. */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (proc->lwWaiting == LW_WS_NOT_WAITING)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
+
+ /* Now loop back and check the status of the lock again. */
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+
+ return result;
+}
+
+
+/*
+ * LWLockUpdateVar - Update a variable and wake up waiters atomically
+ *
+ * Sets *valptr to 'val', and wakes up all processes waiting for us with
+ * LWLockWaitForVar(). Setting the value and waking up the processes happen
+ * atomically so that any process calling LWLockWaitForVar() on the same lock
+ * is guaranteed to see the new value, and act accordingly.
+ *
+ * The caller must be holding the lock in exclusive mode.
+ */
+void
+LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
+
+ proclist_init(&wakeup);
+
+ LWLockWaitListLock(lock);
+
+ Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
+
+ /* Update the lock's value */
+ *valptr = val;
+
+ /*
+ * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
+ * up. They are always in the front of the queue.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ break;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ /* see LWLockWakeup() */
+ Assert(waiter->lwWaiting == LW_WS_WAITING);
+ waiter->lwWaiting = LW_WS_PENDING_WAKEUP;
+ }
+
+ /* We are done updating shared state of the lock itself. */
+ LWLockWaitListUnlock(lock);
+
+ /*
+ * Awaken any waiters I removed from the queue.
+ */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+ /* check comment in LWLockWakeup() about this barrier */
+ pg_write_barrier();
+ waiter->lwWaiting = LW_WS_NOT_WAITING;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
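+
+/*
+ * Illustrative sketch (not part of this patch) of the variable-wait protocol
+ * formed by LWLockWaitForVar() and LWLockUpdateVar(): the exclusive holder
+ * publishes its progress through the variable, and other backends sleep until
+ * either the lock is released or the value moves past what they last saw.
+ * Here 'progressptr' is a hypothetical uint64 in shared memory, read and
+ * written under the lock's wait-list lock as described above.
+ *
+ *		holder:  LWLockUpdateVar(lock, progressptr, newposition);
+ *
+ *		waiter:  if (!LWLockWaitForVar(lock, progressptr, seenvalue, &newvalue))
+ *					 ... lock still held, but progress advanced to newvalue ...
+ *				 else
+ *					 ... lock was (or became) free ...
+ */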
+
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+ LWLockMode mode;
+ uint32 oldstate;
+ bool check_waiters;
+ int i;
+
+ /*
+ * Remove lock from list of locks held. Usually, but not always, it will
+ * be the latest-acquired lock; so search array backwards.
+ */
+ for (i = num_held_lwlocks; --i >= 0;)
+ if (lock == held_lwlocks[i].lock)
+ break;
+
+ if (i < 0)
+ elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+ mode = held_lwlocks[i].mode;
+
+ num_held_lwlocks--;
+ for (; i < num_held_lwlocks; i++)
+ held_lwlocks[i] = held_lwlocks[i + 1];
+
+ PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+ /*
+ * Release my hold on lock, after that it can immediately be acquired by
+ * others, even if we still have to wakeup other waiters.
+ */
+ if (mode == LW_EXCLUSIVE)
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+ else
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+ /* nobody else can have that kind of lock */
+ Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+ if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+ /*
+	 * Wake waiters only if there are any, the lock is now free, and
+	 * RELEASE_OK is set.  If RELEASE_OK is clear, previously-awakened waiters
+	 * haven't been scheduled yet, so don't wake them up again.
+ */
+ if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+ (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+ (oldstate & LW_LOCK_MASK) == 0)
+ check_waiters = true;
+ else
+ check_waiters = false;
+
+ /*
+ * As waking up waiters requires the spinlock to be acquired, only do so
+ * if necessary.
+ */
+ if (check_waiters)
+ {
+ /* XXX: remove before commit? */
+ LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+ LWLockWakeup(lock);
+ }
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+}
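+
+/*
+ * Illustrative sketch (not part of this patch): the canonical pairing with
+ * LWLockAcquire().  Since cancel/die interrupts stay held off for the whole
+ * critical section, no code path may skip the release.
+ *
+ *		LWLockAcquire(SomeSharedStateLock, LW_SHARED);
+ *		... examine the shared structure the (hypothetical) lock protects ...
+ *		LWLockRelease(SomeSharedStateLock);
+ */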
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, reset variable
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ LWLockWaitListLock(lock);
+
+ /*
+	 * Set the variable's value before releasing the lock.  That prevents a
+	 * race condition wherein a new locker acquires the lock, but hasn't yet
+	 * set the variable's value.
+ */
+ *valptr = val;
+ LWLockWaitListUnlock(lock);
+
+ LWLockRelease(lock);
+}
+
+
+/*
+ * LWLockReleaseAll - release all currently-held locks
+ *
+ * Used to clean up after ereport(ERROR). An important difference between this
+ * function and retail LWLockRelease calls is that InterruptHoldoffCount is
+ * unchanged by this operation. This is necessary since InterruptHoldoffCount
+ * has been set to an appropriate level earlier in error recovery. We could
+ * decrement it below zero if we allow it to drop for each released lock!
+ */
+void
+LWLockReleaseAll(void)
+{
+ while (num_held_lwlocks > 0)
+ {
+ HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
+
+ LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
+ }
+}
+
+
+/*
+ * LWLockHeldByMe - test whether my process holds a lock in any mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *lock)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == lock)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *lock, int nlocks, size_t stride)
+{
+ char *held_lock_addr;
+ char *begin;
+ char *end;
+ int i;
+
+ begin = (char *) lock;
+ end = begin + nlocks * stride;
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ held_lock_addr = (char *) held_lwlocks[i].lock;
+ if (held_lock_addr >= begin &&
+ held_lock_addr < end &&
+ (held_lock_addr - begin) % stride == 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockHeldByMeInMode - test whether my process holds a lock in given mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == lock && held_lwlocks[i].mode == mode)
+ return true;
+ }
+ return false;
+}
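+
+/*
+ * Illustrative sketch (not part of this patch): these predicates are intended
+ * for assertions; for instance, a routine requiring exclusive ProcArrayLock
+ * could verify its caller with
+ *
+ *		Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
+ */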
diff --git a/src/backend/storage/lmgr/lwlocknames.c b/src/backend/storage/lmgr/lwlocknames.c
new file mode 100644
index 0000000..65f7c5b
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.c
@@ -0,0 +1,52 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+
+const char *const IndividualLWLockNames[] = {
+ "<unassigned:0>",
+ "ShmemIndex",
+ "OidGen",
+ "XidGen",
+ "ProcArray",
+ "SInvalRead",
+ "SInvalWrite",
+ "WALBufMapping",
+ "WALWrite",
+ "ControlFile",
+ "<unassigned:10>",
+ "XactSLRU",
+ "SubtransSLRU",
+ "MultiXactGen",
+ "MultiXactOffsetSLRU",
+ "MultiXactMemberSLRU",
+ "RelCacheInit",
+ "CheckpointerComm",
+ "TwoPhaseState",
+ "TablespaceCreate",
+ "BtreeVacuum",
+ "AddinShmemInit",
+ "Autovacuum",
+ "AutovacuumSchedule",
+ "SyncScan",
+ "RelationMapping",
+ "NotifySLRU",
+ "NotifyQueue",
+ "SerializableXactHash",
+ "SerializableFinishedList",
+ "SerializablePredicateList",
+ "SerialSLRU",
+ "SyncRep",
+ "BackgroundWorker",
+ "DynamicSharedMemoryControl",
+ "AutoFile",
+ "ReplicationSlotAllocation",
+ "ReplicationSlotControl",
+ "CommitTsSLRU",
+ "CommitTs",
+ "ReplicationOrigin",
+ "MultiXactTruncation",
+ "OldSnapshotTimeMap",
+ "LogicalRepWorker",
+ "XactTruncation",
+ "<unassigned:45>",
+ "WrapLimitsVacuum",
+ "NotifyQueueTail"
+};
diff --git a/src/backend/storage/lmgr/lwlocknames.h b/src/backend/storage/lmgr/lwlocknames.h
new file mode 100644
index 0000000..e279f72
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.h
@@ -0,0 +1,50 @@
+/* autogenerated from src/backend/storage/lmgr/lwlocknames.txt, do not edit */
+/* there is deliberately not an #ifndef LWLOCKNAMES_H here */
+
+#define ShmemIndexLock (&MainLWLockArray[1].lock)
+#define OidGenLock (&MainLWLockArray[2].lock)
+#define XidGenLock (&MainLWLockArray[3].lock)
+#define ProcArrayLock (&MainLWLockArray[4].lock)
+#define SInvalReadLock (&MainLWLockArray[5].lock)
+#define SInvalWriteLock (&MainLWLockArray[6].lock)
+#define WALBufMappingLock (&MainLWLockArray[7].lock)
+#define WALWriteLock (&MainLWLockArray[8].lock)
+#define ControlFileLock (&MainLWLockArray[9].lock)
+#define XactSLRULock (&MainLWLockArray[11].lock)
+#define SubtransSLRULock (&MainLWLockArray[12].lock)
+#define MultiXactGenLock (&MainLWLockArray[13].lock)
+#define MultiXactOffsetSLRULock (&MainLWLockArray[14].lock)
+#define MultiXactMemberSLRULock (&MainLWLockArray[15].lock)
+#define RelCacheInitLock (&MainLWLockArray[16].lock)
+#define CheckpointerCommLock (&MainLWLockArray[17].lock)
+#define TwoPhaseStateLock (&MainLWLockArray[18].lock)
+#define TablespaceCreateLock (&MainLWLockArray[19].lock)
+#define BtreeVacuumLock (&MainLWLockArray[20].lock)
+#define AddinShmemInitLock (&MainLWLockArray[21].lock)
+#define AutovacuumLock (&MainLWLockArray[22].lock)
+#define AutovacuumScheduleLock (&MainLWLockArray[23].lock)
+#define SyncScanLock (&MainLWLockArray[24].lock)
+#define RelationMappingLock (&MainLWLockArray[25].lock)
+#define NotifySLRULock (&MainLWLockArray[26].lock)
+#define NotifyQueueLock (&MainLWLockArray[27].lock)
+#define SerializableXactHashLock (&MainLWLockArray[28].lock)
+#define SerializableFinishedListLock (&MainLWLockArray[29].lock)
+#define SerializablePredicateListLock (&MainLWLockArray[30].lock)
+#define SerialSLRULock (&MainLWLockArray[31].lock)
+#define SyncRepLock (&MainLWLockArray[32].lock)
+#define BackgroundWorkerLock (&MainLWLockArray[33].lock)
+#define DynamicSharedMemoryControlLock (&MainLWLockArray[34].lock)
+#define AutoFileLock (&MainLWLockArray[35].lock)
+#define ReplicationSlotAllocationLock (&MainLWLockArray[36].lock)
+#define ReplicationSlotControlLock (&MainLWLockArray[37].lock)
+#define CommitTsSLRULock (&MainLWLockArray[38].lock)
+#define CommitTsLock (&MainLWLockArray[39].lock)
+#define ReplicationOriginLock (&MainLWLockArray[40].lock)
+#define MultiXactTruncationLock (&MainLWLockArray[41].lock)
+#define OldSnapshotTimeMapLock (&MainLWLockArray[42].lock)
+#define LogicalRepWorkerLock (&MainLWLockArray[43].lock)
+#define XactTruncationLock (&MainLWLockArray[44].lock)
+#define WrapLimitsVacuumLock (&MainLWLockArray[46].lock)
+#define NotifyQueueTailLock (&MainLWLockArray[47].lock)
+
+#define NUM_INDIVIDUAL_LWLOCKS 48
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
new file mode 100644
index 0000000..6c7cf6c
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -0,0 +1,55 @@
+# Some commonly-used locks have predefined positions within MainLWLockArray;
+# these are defined here. If you add a lock, add it to the end to avoid
+# renumbering the existing locks; if you remove a lock, consider leaving a gap
+# in the numbering sequence for the benefit of DTrace and other external
+# debugging scripts. Also, do not forget to update the list of wait events
+# in the user documentation.
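+#
+# For example (illustrative only), the entry "ShmemIndexLock 1" below is what
+# yields "#define ShmemIndexLock (&MainLWLockArray[1].lock)" in the generated
+# lwlocknames.h and the display name "ShmemIndex" in lwlocknames.c.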
+
+# 0 is available; was formerly BufFreelistLock
+ShmemIndexLock 1
+OidGenLock 2
+XidGenLock 3
+ProcArrayLock 4
+SInvalReadLock 5
+SInvalWriteLock 6
+WALBufMappingLock 7
+WALWriteLock 8
+ControlFileLock 9
+# 10 was CheckpointLock
+XactSLRULock 11
+SubtransSLRULock 12
+MultiXactGenLock 13
+MultiXactOffsetSLRULock 14
+MultiXactMemberSLRULock 15
+RelCacheInitLock 16
+CheckpointerCommLock 17
+TwoPhaseStateLock 18
+TablespaceCreateLock 19
+BtreeVacuumLock 20
+AddinShmemInitLock 21
+AutovacuumLock 22
+AutovacuumScheduleLock 23
+SyncScanLock 24
+RelationMappingLock 25
+NotifySLRULock 26
+NotifyQueueLock 27
+SerializableXactHashLock 28
+SerializableFinishedListLock 29
+SerializablePredicateListLock 30
+SerialSLRULock 31
+SyncRepLock 32
+BackgroundWorkerLock 33
+DynamicSharedMemoryControlLock 34
+AutoFileLock 35
+ReplicationSlotAllocationLock 36
+ReplicationSlotControlLock 37
+CommitTsSLRULock 38
+CommitTsLock 39
+ReplicationOriginLock 40
+MultiXactTruncationLock 41
+OldSnapshotTimeMapLock 42
+LogicalRepWorkerLock 43
+XactTruncationLock 44
+# 45 was XactTruncationLock until removal of BackendRandomLock
+WrapLimitsVacuumLock 46
+NotifyQueueTailLock 47
diff --git a/src/backend/storage/lmgr/meson.build b/src/backend/storage/lmgr/meson.build
new file mode 100644
index 0000000..0b2c93d
--- /dev/null
+++ b/src/backend/storage/lmgr/meson.build
@@ -0,0 +1,15 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'condition_variable.c',
+ 'deadlock.c',
+ 'lmgr.c',
+ 'lock.c',
+ 'lwlock.c',
+ 'predicate.c',
+ 'proc.c',
+ 's_lock.c',
+ 'spin.c',
+)
+
+generated_backend_sources += lwlocknames[1]
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
new file mode 100644
index 0000000..1af4121
--- /dev/null
+++ b/src/backend/storage/lmgr/predicate.c
@@ -0,0 +1,4997 @@
+/*-------------------------------------------------------------------------
+ *
+ * predicate.c
+ * POSTGRES predicate locking
+ * to support full serializable transaction isolation
+ *
+ *
+ * The approach taken is to implement Serializable Snapshot Isolation (SSI)
+ * as initially described in this paper:
+ *
+ * Michael J. Cahill, Uwe Röhm, and Alan D. Fekete. 2008.
+ * Serializable isolation for snapshot databases.
+ * In SIGMOD '08: Proceedings of the 2008 ACM SIGMOD
+ * international conference on Management of data,
+ * pages 729-738, New York, NY, USA. ACM.
+ * http://doi.acm.org/10.1145/1376616.1376690
+ *
+ * and further elaborated in Cahill's doctoral thesis:
+ *
+ * Michael James Cahill. 2009.
+ * Serializable Isolation for Snapshot Databases.
+ * Sydney Digital Theses.
+ * University of Sydney, School of Information Technologies.
+ * http://hdl.handle.net/2123/5353
+ *
+ *
+ * Predicate locks for Serializable Snapshot Isolation (SSI) are SIREAD
+ * locks, which are so different from normal locks that a distinct set of
+ * structures is required to handle them. They are needed to detect
+ * rw-conflicts when the read happens before the write. (When the write
+ * occurs first, the reading transaction can check for a conflict by
+ * examining the MVCC data.)
+ *
+ * (1) Besides tuples actually read, they must cover ranges of tuples
+ * which would have been read based on the predicate. This will
+ * require modelling the predicates through locks against database
+ * objects such as pages, index ranges, or entire tables.
+ *
+ * (2) They must be kept in RAM for quick access. Because of this, it
+ * isn't possible to always maintain tuple-level granularity -- when
+ * the space allocated to store these approaches exhaustion, a
+ * request for a lock may need to scan for situations where a single
+ * transaction holds many fine-grained locks which can be coalesced
+ * into a single coarser-grained lock.
+ *
+ * (3) They never block anything; they are more like flags than locks
+ * in that regard; although they refer to database objects and are
+ * used to identify rw-conflicts with normal write locks.
+ *
+ * (4) While they are associated with a transaction, they must survive
+ * a successful COMMIT of that transaction, and remain until all
+ * overlapping transactions complete. This even means that they
+ * must survive termination of the transaction's process. If a
+ * top level transaction is rolled back, however, it is immediately
+ * flagged so that it can be ignored, and its SIREAD locks can be
+ * released any time after that.
+ *
+ * (5) The only transactions which create SIREAD locks or check for
+ * conflicts with them are serializable transactions.
+ *
+ * (6) When a write lock for a top level transaction is found to cover
+ * an existing SIREAD lock for the same transaction, the SIREAD lock
+ * can be deleted.
+ *
+ * (7) A write from a serializable transaction must ensure that an xact
+ * record exists for the transaction, with the same lifespan (until
+ *		all concurrent transactions complete or the transaction is rolled
+ * back) so that rw-dependencies to that transaction can be
+ * detected.
+ *
+ * We use an optimization for read-only transactions. Under certain
+ * circumstances, a read-only transaction's snapshot can be shown to
+ * never have conflicts with other transactions. This is referred to
+ * as a "safe" snapshot (and one known not to be is "unsafe").
+ * However, it can't be determined whether a snapshot is safe until
+ * all concurrent read/write transactions complete.
+ *
+ * Once a read-only transaction is known to have a safe snapshot, it
+ * can release its predicate locks and exempt itself from further
+ * predicate lock tracking. READ ONLY DEFERRABLE transactions run only
+ * on safe snapshots, waiting as necessary for one to be available.
+ *
+ *
+ * Lightweight locks to manage access to the predicate locking shared
+ * memory objects must be taken in this order, and should be released in
+ * reverse order:
+ *
+ * SerializableFinishedListLock
+ * - Protects the list of transactions which have completed but which
+ * may yet matter because they overlap still-active transactions.
+ *
+ * SerializablePredicateListLock
+ * - Protects the linked list of locks held by a transaction. Note
+ * that the locks themselves are also covered by the partition
+ * locks of their respective lock targets; this lock only affects
+ * the linked list connecting the locks related to a transaction.
+ * - All transactions share this single lock (with no partitioning).
+ * - There is never a need for a process other than the one running
+ * an active transaction to walk the list of locks held by that
+ * transaction, except parallel query workers sharing the leader's
+ * transaction. In the parallel case, an extra per-sxact lock is
+ * taken; see below.
+ * - It is relatively infrequent that another process needs to
+ * modify the list for a transaction, but it does happen for such
+ * things as index page splits for pages with predicate locks and
+ * freeing of predicate locked pages by a vacuum process. When
+ * removing a lock in such cases, the lock itself contains the
+ * pointers needed to remove it from the list. When adding a
+ * lock in such cases, the lock can be added using the anchor in
+ * the transaction structure. Neither requires walking the list.
+ * - Cleaning up the list for a terminated transaction is sometimes
+ * not done on a retail basis, in which case no lock is required.
+ * - Due to the above, a process accessing its active transaction's
+ * list always uses a shared lock, regardless of whether it is
+ * walking or maintaining the list. This improves concurrency
+ * for the common access patterns.
+ * - A process which needs to alter the list of a transaction other
+ * than its own active transaction must acquire an exclusive
+ * lock.
+ *
+ * SERIALIZABLEXACT's member 'perXactPredicateListLock'
+ * - Protects the linked list of predicate locks held by a transaction.
+ * Only needed for parallel mode, where multiple backends share the
+ * same SERIALIZABLEXACT object. Not needed if
+ * SerializablePredicateListLock is held exclusively.
+ *
+ * PredicateLockHashPartitionLock(hashcode)
+ * - The same lock protects a target, all locks on that target, and
+ * the linked list of locks on the target.
+ * - When more than one is needed, acquire in ascending address order.
+ * - When all are needed (rare), acquire in ascending index order with
+ * PredicateLockHashPartitionLockByIndex(index).
+ *
+ * SerializableXactHashLock
+ * - Protects both PredXact and SerializableXidHash.
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/predicate.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * INTERFACE ROUTINES
+ *
+ * housekeeping for setting up shared memory predicate lock structures
+ * InitPredicateLocks(void)
+ * PredicateLockShmemSize(void)
+ *
+ * predicate lock reporting
+ * GetPredicateLockStatusData(void)
+ * PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+ *
+ * predicate lock maintenance
+ * GetSerializableTransactionSnapshot(Snapshot snapshot)
+ * SetSerializableTransactionSnapshot(Snapshot snapshot,
+ * VirtualTransactionId *sourcevxid)
+ * RegisterPredicateLockingXid(void)
+ * PredicateLockRelation(Relation relation, Snapshot snapshot)
+ * PredicateLockPage(Relation relation, BlockNumber blkno,
+ * Snapshot snapshot)
+ * PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ * TransactionId tuple_xid)
+ * PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ * BlockNumber newblkno)
+ * TransferPredicateLocksToHeapRelation(Relation relation)
+ * ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+ *
+ * conflict detection (may also trigger rollback)
+ * CheckForSerializableConflictOut(Relation relation, TransactionId xid,
+ * Snapshot snapshot)
+ * CheckForSerializableConflictIn(Relation relation, ItemPointer tid,
+ * BlockNumber blkno)
+ * CheckTableForSerializableConflictIn(Relation relation)
+ *
+ * final rollback checking
+ * PreCommit_CheckForSerializationFailure(void)
+ *
+ * two-phase commit support
+ * AtPrepare_PredicateLocks(void);
+ * PostPrepare_PredicateLocks(TransactionId xid);
+ * PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit);
+ * predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ * void *recdata, uint32 len);
+ */
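+
+/*
+ * Illustrative call sequence (not part of this patch): for each tuple it
+ * reads, a serializable heap scan roughly does
+ *
+ *		CheckForSerializableConflictOut(relation, xid, snapshot);
+ *		PredicateLockTID(relation, tid, snapshot, tuple_xid);
+ *
+ * i.e. it checks for a rw-conflict against the tuple's writer and then
+ * records an SIREAD lock on the tuple.  The real call sites live in the table
+ * access methods; this sketch only shows how the routines listed above fit
+ * together.
+ */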
+
+#include "postgres.h"
+
+#include "access/parallel.h"
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/twophase_rmgr.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/pg_lfind.h"
+#include "storage/bufmgr.h"
+#include "storage/predicate.h"
+#include "storage/predicate_internals.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+/* Uncomment the next line to test the graceful degradation code. */
+/* #define TEST_SUMMARIZE_SERIAL */
+
+/*
+ * Test the most selective fields first, for performance.
+ *
+ * a is covered by b if all of the following hold:
+ * 1) a.database = b.database
+ * 2) a.relation = b.relation
+ * 3) b.offset is invalid (b is page-granularity or higher)
+ * 4) either of the following:
+ * 4a) a.offset is valid (a is tuple-granularity) and a.page = b.page
+ * or 4b) a.offset is invalid and b.page is invalid (a is
+ *		   page-granularity and b is relation-granularity)
+ */
+#define TargetTagIsCoveredBy(covered_target, covering_target) \
+ ((GET_PREDICATELOCKTARGETTAG_RELATION(covered_target) == /* (2) */ \
+ GET_PREDICATELOCKTARGETTAG_RELATION(covering_target)) \
+ && (GET_PREDICATELOCKTARGETTAG_OFFSET(covering_target) == \
+ InvalidOffsetNumber) /* (3) */ \
+ && (((GET_PREDICATELOCKTARGETTAG_OFFSET(covered_target) != \
+ InvalidOffsetNumber) /* (4a) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ GET_PREDICATELOCKTARGETTAG_PAGE(covered_target))) \
+ || ((GET_PREDICATELOCKTARGETTAG_PAGE(covering_target) == \
+ InvalidBlockNumber) /* (4b) */ \
+ && (GET_PREDICATELOCKTARGETTAG_PAGE(covered_target) \
+ != InvalidBlockNumber))) \
+ && (GET_PREDICATELOCKTARGETTAG_DB(covered_target) == /* (1) */ \
+ GET_PREDICATELOCKTARGETTAG_DB(covering_target)))
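+
+/*
+ * Example (illustrative only) of the coverage rules above, all within one
+ * database and relation: the tuple-level tag (page 10, offset 3) is covered
+ * by the page-level tag for page 10 (rule 4a), and any page-level tag is
+ * covered by the relation-level tag (rule 4b).
+ */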
+
+/*
+ * The predicate locking target and lock shared hash tables are partitioned to
+ * reduce contention. To determine which partition a given target belongs to,
+ * compute the tag's hash code with PredicateLockTargetTagHashCode(), then
+ * apply one of these macros.
+ * NB: NUM_PREDICATELOCK_PARTITIONS must be a power of 2!
+ */
+#define PredicateLockHashPartition(hashcode) \
+ ((hashcode) % NUM_PREDICATELOCK_PARTITIONS)
+#define PredicateLockHashPartitionLock(hashcode) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + \
+ PredicateLockHashPartition(hashcode)].lock)
+#define PredicateLockHashPartitionLockByIndex(i) \
+ (&MainLWLockArray[PREDICATELOCK_MANAGER_LWLOCK_OFFSET + (i)].lock)
+
+#define NPREDICATELOCKTARGETENTS() \
+ mul_size(max_predicate_locks_per_xact, add_size(MaxBackends, max_prepared_xacts))
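+
+/*
+ * For example (illustrative arithmetic only): with max_predicate_locks_per_xact
+ * at its usual default of 64, 100 backends, and no prepared transactions,
+ * NPREDICATELOCKTARGETENTS() reserves room for 64 * 100 = 6400 targets.
+ */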
+
+#define SxactIsOnFinishedList(sxact) (!dlist_node_is_detached(&(sxact)->finishedLink))
+
+/*
+ * Note that a sxact is marked "prepared" once it has passed
+ * PreCommit_CheckForSerializationFailure, even if it isn't using
+ * 2PC. This is the point at which it can no longer be aborted.
+ *
+ * The PREPARED flag remains set after commit, so SxactIsCommitted
+ * implies SxactIsPrepared.
+ */
+#define SxactIsCommitted(sxact) (((sxact)->flags & SXACT_FLAG_COMMITTED) != 0)
+#define SxactIsPrepared(sxact) (((sxact)->flags & SXACT_FLAG_PREPARED) != 0)
+#define SxactIsRolledBack(sxact) (((sxact)->flags & SXACT_FLAG_ROLLED_BACK) != 0)
+#define SxactIsDoomed(sxact) (((sxact)->flags & SXACT_FLAG_DOOMED) != 0)
+#define SxactIsReadOnly(sxact) (((sxact)->flags & SXACT_FLAG_READ_ONLY) != 0)
+#define SxactHasSummaryConflictIn(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_IN) != 0)
+#define SxactHasSummaryConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_SUMMARY_CONFLICT_OUT) != 0)
+/*
+ * The following macro actually means that the specified transaction has a
+ * conflict out *to a transaction which committed ahead of it*. It's hard
+ * to get that into a name of a reasonable length.
+ */
+#define SxactHasConflictOut(sxact) (((sxact)->flags & SXACT_FLAG_CONFLICT_OUT) != 0)
+#define SxactIsDeferrableWaiting(sxact) (((sxact)->flags & SXACT_FLAG_DEFERRABLE_WAITING) != 0)
+#define SxactIsROSafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_SAFE) != 0)
+#define SxactIsROUnsafe(sxact) (((sxact)->flags & SXACT_FLAG_RO_UNSAFE) != 0)
+#define SxactIsPartiallyReleased(sxact) (((sxact)->flags & SXACT_FLAG_PARTIALLY_RELEASED) != 0)
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTARGETTAG.
+ *
+ * To avoid unnecessary recomputations of the hash code, we try to do this
+ * just once per function, and then pass it around as needed. Aside from
+ * passing the hashcode to hash_search_with_hash_value(), we can extract
+ * the lock partition number from the hashcode.
+ */
+#define PredicateLockTargetTagHashCode(predicatelocktargettag) \
+ get_hash_value(PredicateLockTargetHash, predicatelocktargettag)
+
+/*
+ * Given a predicate lock tag, and the hash for its target,
+ * compute the lock hash.
+ *
+ * To make the hash code also depend on the transaction, we xor the sxid
+ * struct's address into the hash code, left-shifted so that the
+ * partition-number bits don't change. Since this is only a hash, we
+ * don't care if we lose high-order bits of the address; use an
+ * intermediate variable to suppress cast-pointer-to-int warnings.
+ */
+#define PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash) \
+ ((targethash) ^ ((uint32) PointerGetDatum((predicatelocktag)->myXact)) \
+ << LOG2_NUM_PREDICATELOCK_PARTITIONS)
+
+
+/*
+ * The SLRU buffer area through which we access the old xids.
+ */
+static SlruCtlData SerialSlruCtlData;
+
+#define SerialSlruCtl (&SerialSlruCtlData)
+
+#define SERIAL_PAGESIZE BLCKSZ
+#define SERIAL_ENTRYSIZE sizeof(SerCommitSeqNo)
+#define SERIAL_ENTRIESPERPAGE (SERIAL_PAGESIZE / SERIAL_ENTRYSIZE)
+
+/*
+ * Set maximum pages based on the number needed to track all transactions.
+ */
+#define SERIAL_MAX_PAGE (MaxTransactionId / SERIAL_ENTRIESPERPAGE)
+
+#define SerialNextPage(page) (((page) >= SERIAL_MAX_PAGE) ? 0 : (page) + 1)
+
+#define SerialValue(slotno, xid) (*((SerCommitSeqNo *) \
+ (SerialSlruCtl->shared->page_buffer[slotno] + \
+ ((((uint32) (xid)) % SERIAL_ENTRIESPERPAGE) * SERIAL_ENTRYSIZE))))
+
+#define SerialPage(xid) (((uint32) (xid)) / SERIAL_ENTRIESPERPAGE)
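+
+/*
+ * For example (illustrative arithmetic, assuming the default BLCKSZ of 8192
+ * and an 8-byte SerCommitSeqNo): SERIAL_ENTRIESPERPAGE is 1024, so xid 5000
+ * maps to page SerialPage(5000) = 4 and to entry 5000 % 1024 = 904 within
+ * that page's buffer.
+ */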
+
+typedef struct SerialControlData
+{
+ int headPage; /* newest initialized page */
+ TransactionId headXid; /* newest valid Xid in the SLRU */
+ TransactionId tailXid; /* oldest xmin we might be interested in */
+} SerialControlData;
+
+typedef struct SerialControlData *SerialControl;
+
+static SerialControl serialControl;
+
+/*
+ * When the oldest committed transaction on the "finished" list is moved to
+ * SLRU, its predicate locks will be moved to this "dummy" transaction,
+ * collapsing duplicate targets. When a duplicate is found, the later
+ * commitSeqNo is used.
+ */
+static SERIALIZABLEXACT *OldCommittedSxact;
+
+
+/*
+ * These configuration variables are used to set the predicate lock table size
+ * and to control promotion of predicate locks to coarser granularity, so
+ * that the system degrades gracefully (mostly as false-positive
+ * serialization failures) in the face of memory pressure.
+ */
+int max_predicate_locks_per_xact; /* in guc_tables.c */
+int max_predicate_locks_per_relation; /* in guc_tables.c */
+int max_predicate_locks_per_page; /* in guc_tables.c */
+
+/*
+ * This provides a list of objects in order to track transactions
+ * participating in predicate locking. Entries in the list are fixed size,
+ * and reside in shared memory. The memory address of an entry must remain
+ * fixed during its lifetime. The list will be protected from concurrent
+ * update externally; no provision is made in this code to manage that. The
+ * number of entries in the list, and the size allowed for each entry is
+ * fixed upon creation.
+ */
+static PredXactList PredXact;
+
+/*
+ * This provides a pool of RWConflict data elements to use in conflict lists
+ * between transactions.
+ */
+static RWConflictPoolHeader RWConflictPool;
+
+/*
+ * The predicate locking hash tables are in shared memory.
+ * Each backend keeps pointers to them.
+ */
+static HTAB *SerializableXidHash;
+static HTAB *PredicateLockTargetHash;
+static HTAB *PredicateLockHash;
+static dlist_head *FinishedSerializableTransactions;
+
+/*
+ * Tag for a dummy entry in PredicateLockTargetHash. By temporarily removing
+ * this entry, you can ensure that there's enough scratch space available for
+ * inserting one entry in the hash table. This is an otherwise-invalid tag.
+ */
+static const PREDICATELOCKTARGETTAG ScratchTargetTag = {0, 0, 0, 0};
+static uint32 ScratchTargetTagHash;
+static LWLock *ScratchPartitionLock;
+
+/*
+ * The local hash table used to determine when to combine multiple fine-
+ * grained locks into a single coarser-grained lock.
+ */
+static HTAB *LocalPredicateLockHash = NULL;
+
+/*
+ * Keep a pointer to the currently-running serializable transaction (if any)
+ * for quick reference. Also, remember if we have written anything that could
+ * cause a rw-conflict.
+ */
+static SERIALIZABLEXACT *MySerializableXact = InvalidSerializableXact;
+static bool MyXactDidWrite = false;
+
+/*
+ * The SXACT_FLAG_RO_UNSAFE optimization might lead us to release
+ * MySerializableXact early. If that happens in a parallel query, the leader
+ * needs to defer the destruction of the SERIALIZABLEXACT until end of
+ * transaction, because the workers still have a reference to it. In that
+ * case, the leader stores it here.
+ */
+static SERIALIZABLEXACT *SavedSerializableXact = InvalidSerializableXact;
+
+/* local functions */
+
+static SERIALIZABLEXACT *CreatePredXact(void);
+static void ReleasePredXact(SERIALIZABLEXACT *sxact);
+
+static bool RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer);
+static void SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact, SERIALIZABLEXACT *activeXact);
+static void ReleaseRWConflict(RWConflict conflict);
+static void FlagSxactUnsafe(SERIALIZABLEXACT *sxact);
+
+static bool SerialPagePrecedesLogically(int page1, int page2);
+static void SerialInit(void);
+static void SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo);
+static SerCommitSeqNo SerialGetMinConflictCommitSeqNo(TransactionId xid);
+static void SerialSetActiveSerXmin(TransactionId xid);
+
+static uint32 predicatelock_hash(const void *key, Size keysize);
+static void SummarizeOldestCommittedSxact(void);
+static Snapshot GetSafeSnapshot(Snapshot origSnapshot);
+static Snapshot GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid);
+static bool PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag);
+static bool GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent);
+static bool CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag);
+static void RemoveScratchTarget(bool lockheld);
+static void RestoreScratchTarget(bool lockheld);
+static void RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target,
+ uint32 targettaghash);
+static void DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag);
+static int MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag);
+static bool CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag);
+static void DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag);
+static void CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact);
+static void DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash);
+static bool TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld);
+static void PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag);
+static void DropAllPredicateLocksFromTable(Relation relation,
+ bool transfer);
+static void SetNewSxactGlobalXmin(void);
+static void ClearOldPredicateLocks(void);
+static void ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize);
+static bool XidIsConcurrent(TransactionId xid);
+static void CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag);
+static void FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer);
+static void OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer);
+static void CreateLocalPredicateLockHash(void);
+static void ReleasePredicateLocksLocal(void);
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Does this relation participate in predicate locking? Temporary and system
+ * relations are exempt.
+ */
+static inline bool
+PredicateLockingNeededForRelation(Relation relation)
+{
+ return !(relation->rd_id < FirstUnpinnedObjectId ||
+ RelationUsesLocalBuffers(relation));
+}
+
+/*
+ * When a public interface method is called for a read, this is the test to
+ * see if we should do a quick return.
+ *
+ * Note: this function has side-effects! If this transaction has been flagged
+ * as RO-safe since the last call, we release all predicate locks and reset
+ * MySerializableXact. That makes subsequent calls to return quickly.
+ *
+ * This is marked as 'inline' to eliminate the function call overhead in the
+ * common case that serialization is not needed.
+ */
+static inline bool
+SerializationNeededForRead(Relation relation, Snapshot snapshot)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /*
+	 * Don't acquire predicate locks or check for conflicts when scanning with
+	 * a special snapshot.  This excludes things like CLUSTER and REINDEX.
+	 * They use the wholesale
+ * functions TransferPredicateLocksToHeapRelation() and
+ * CheckTableForSerializableConflictIn() to participate in serialization,
+ * but the scans involved don't need serialization.
+ */
+ if (!IsMVCCSnapshot(snapshot))
+ return false;
+
+ /*
+ * Check if we have just become "RO-safe". If we have, immediately release
+ * all locks as they're not needed anymore. This also resets
+ * MySerializableXact, so that subsequent calls to this function can exit
+ * quickly.
+ *
+ * A transaction is flagged as RO_SAFE if all concurrent R/W transactions
+ * commit without having conflicts out to an earlier snapshot, thus
+ * ensuring that no conflicts are possible for this transaction.
+ */
+ if (SxactIsROSafe(MySerializableXact))
+ {
+ ReleasePredicateLocks(false, true);
+ return false;
+ }
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+/*
+ * Like SerializationNeededForRead(), but called on writes.
+ * The logic is the same, but there is no snapshot and we can't be RO-safe.
+ */
+static inline bool
+SerializationNeededForWrite(Relation relation)
+{
+ /* Nothing to do if this is not a serializable transaction */
+ if (MySerializableXact == InvalidSerializableXact)
+ return false;
+
+ /* Check if the relation doesn't participate in predicate locking */
+ if (!PredicateLockingNeededForRelation(relation))
+ return false;
+
+ return true; /* no excuse to skip predicate locking */
+}
+
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions are a simple implementation of a list for this specific
+ * type of struct. If there is ever a generalized shared memory list, we
+ * should probably switch to that.
+ */
+static SERIALIZABLEXACT *
+CreatePredXact(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ if (dlist_is_empty(&PredXact->availableList))
+ return NULL;
+
+ sxact = dlist_container(SERIALIZABLEXACT, xactLink,
+ dlist_pop_head_node(&PredXact->availableList));
+ dlist_push_tail(&PredXact->activeList, &sxact->xactLink);
+ return sxact;
+}
+
+static void
+ReleasePredXact(SERIALIZABLEXACT *sxact)
+{
+ Assert(ShmemAddrIsValid(sxact));
+
+ dlist_delete(&sxact->xactLink);
+ dlist_push_tail(&PredXact->availableList, &sxact->xactLink);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * These functions manage primitive access to the RWConflict pool and lists.
+ */
+static bool
+RWConflictExists(const SERIALIZABLEXACT *reader, const SERIALIZABLEXACT *writer)
+{
+ dlist_iter iter;
+
+ Assert(reader != writer);
+
+ /* Check the ends of the purported conflict first. */
+ if (SxactIsDoomed(reader)
+ || SxactIsDoomed(writer)
+ || dlist_is_empty(&reader->outConflicts)
+ || dlist_is_empty(&writer->inConflicts))
+ return false;
+
+ /*
+ * A conflict is possible; walk the list to find out.
+ *
+ * The unconstify is needed as we have no const version of
+ * dlist_foreach().
+ */
+ dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->outConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, outLink, iter.cur);
+
+ if (conflict->sxactIn == writer)
+ return true;
+ }
+
+ /* No conflict found. */
+ return false;
+}
+
+static void
+SetRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ RWConflict conflict;
+
+ Assert(reader != writer);
+ Assert(!RWConflictExists(reader, writer));
+
+ if (dlist_is_empty(&RWConflictPool->availableList))
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList);
+ dlist_delete(&conflict->outLink);
+
+ conflict->sxactOut = reader;
+ conflict->sxactIn = writer;
+ dlist_push_tail(&reader->outConflicts, &conflict->outLink);
+ dlist_push_tail(&writer->inConflicts, &conflict->inLink);
+}
+
+static void
+SetPossibleUnsafeConflict(SERIALIZABLEXACT *roXact,
+ SERIALIZABLEXACT *activeXact)
+{
+ RWConflict conflict;
+
+ Assert(roXact != activeXact);
+ Assert(SxactIsReadOnly(roXact));
+ Assert(!SxactIsReadOnly(activeXact));
+
+ if (dlist_is_empty(&RWConflictPool->availableList))
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough elements in RWConflictPool to record a potential read/write conflict"),
+ errhint("You might need to run fewer transactions at a time or increase max_connections.")));
+
+ conflict = dlist_head_element(RWConflictData, outLink, &RWConflictPool->availableList);
+ dlist_delete(&conflict->outLink);
+
+ conflict->sxactOut = activeXact;
+ conflict->sxactIn = roXact;
+ dlist_push_tail(&activeXact->possibleUnsafeConflicts, &conflict->outLink);
+ dlist_push_tail(&roXact->possibleUnsafeConflicts, &conflict->inLink);
+}
+
+static void
+ReleaseRWConflict(RWConflict conflict)
+{
+ dlist_delete(&conflict->inLink);
+ dlist_delete(&conflict->outLink);
+ dlist_push_tail(&RWConflictPool->availableList, &conflict->outLink);
+}
+
+static void
+FlagSxactUnsafe(SERIALIZABLEXACT *sxact)
+{
+ dlist_mutable_iter iter;
+
+ Assert(SxactIsReadOnly(sxact));
+ Assert(!SxactIsROSafe(sxact));
+
+ sxact->flags |= SXACT_FLAG_RO_UNSAFE;
+
+ /*
+ * We know this isn't a safe snapshot, so we can stop looking for other
+ * potential conflicts.
+ */
+ dlist_foreach_modify(iter, &sxact->possibleUnsafeConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+
+ Assert(!SxactIsReadOnly(conflict->sxactOut));
+ Assert(sxact == conflict->sxactIn);
+
+ ReleaseRWConflict(conflict);
+ }
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Decide whether a Serial page number is "older" for truncation purposes.
+ * Analogous to CLOGPagePrecedes().
+ */
+static bool
+SerialPagePrecedesLogically(int page1, int page2)
+{
+ TransactionId xid1;
+ TransactionId xid2;
+
+ xid1 = ((TransactionId) page1) * SERIAL_ENTRIESPERPAGE;
+ xid1 += FirstNormalTransactionId + 1;
+ xid2 = ((TransactionId) page2) * SERIAL_ENTRIESPERPAGE;
+ xid2 += FirstNormalTransactionId + 1;
+
+ return (TransactionIdPrecedes(xid1, xid2) &&
+ TransactionIdPrecedes(xid1, xid2 + SERIAL_ENTRIESPERPAGE - 1));
+}
+
+#ifdef USE_ASSERT_CHECKING
+static void
+SerialPagePrecedesLogicallyUnitTests(void)
+{
+ int per_page = SERIAL_ENTRIESPERPAGE,
+ offset = per_page / 2;
+ int newestPage,
+ oldestPage,
+ headPage,
+ targetPage;
+ TransactionId newestXact,
+ oldestXact;
+
+ /* GetNewTransactionId() has assigned the last XID it can safely use. */
+ newestPage = 2 * SLRU_PAGES_PER_SEGMENT - 1; /* nothing special */
+ newestXact = newestPage * per_page + offset;
+ Assert(newestXact / per_page == newestPage);
+ oldestXact = newestXact + 1;
+ oldestXact -= 1U << 31;
+ oldestPage = oldestXact / per_page;
+
+ /*
+ * In this scenario, the SLRU headPage pertains to the last ~1000 XIDs
+ * assigned. oldestXact finishes, ~2B XIDs having elapsed since it
+ * started. Further transactions cause us to summarize oldestXact to
+ * tailPage. Function must return false so SerialAdd() doesn't zero
+ * tailPage (which may contain entries for other old, recently-finished
+ * XIDs) and half the SLRU. Reaching this requires burning ~2B XIDs in
+ * single-user mode, a negligible possibility.
+ */
+ headPage = newestPage;
+ targetPage = oldestPage;
+ Assert(!SerialPagePrecedesLogically(headPage, targetPage));
+
+ /*
+ * In this scenario, the SLRU headPage pertains to oldestXact. We're
+ * summarizing an XID near newestXact. (Assume few other XIDs used
+ * SERIALIZABLE, hence the minimal headPage advancement. Assume
+ * oldestXact was long-running and only recently reached the SLRU.)
+ * Function must return true to make SerialAdd() create targetPage.
+ *
+ * Today's implementation mishandles this case, but it doesn't matter
+ * enough to fix. Verify that the defect affects just one page by
+ * asserting correct treatment of its prior page. Reaching this case
+ * requires burning ~2B XIDs in single-user mode, a negligible
+ * possibility. Moreover, if it does happen, the consequence would be
+ * mild, namely a new transaction failing in SimpleLruReadPage().
+ */
+ headPage = oldestPage;
+ targetPage = newestPage;
+ Assert(SerialPagePrecedesLogically(headPage, targetPage - 1));
+#if 0
+ Assert(SerialPagePrecedesLogically(headPage, targetPage));
+#endif
+}
+#endif
+
+/*
+ * Initialize for the tracking of old serializable committed xids.
+ */
+static void
+SerialInit(void)
+{
+ bool found;
+
+ /*
+ * Set up SLRU management of the pg_serial data.
+ */
+ SerialSlruCtl->PagePrecedes = SerialPagePrecedesLogically;
+ SimpleLruInit(SerialSlruCtl, "Serial",
+ NUM_SERIAL_BUFFERS, 0, SerialSLRULock, "pg_serial",
+ LWTRANCHE_SERIAL_BUFFER, SYNC_HANDLER_NONE);
+#ifdef USE_ASSERT_CHECKING
+ SerialPagePrecedesLogicallyUnitTests();
+#endif
+ SlruPagePrecedesUnitTests(SerialSlruCtl, SERIAL_ENTRIESPERPAGE);
+
+ /*
+ * Create or attach to the SerialControl structure.
+ */
+ serialControl = (SerialControl)
+ ShmemInitStruct("SerialControlData", sizeof(SerialControlData), &found);
+
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ /*
+ * Set control information to reflect empty SLRU.
+ */
+ serialControl->headPage = -1;
+ serialControl->headXid = InvalidTransactionId;
+ serialControl->tailXid = InvalidTransactionId;
+ }
+}
+
+/*
+ * Record a committed read write serializable xid and the minimum
+ * commitSeqNo of any transactions to which this xid had a rw-conflict out.
+ * An invalid commitSeqNo means that there were no conflicts out from xid.
+ */
+static void
+SerialAdd(TransactionId xid, SerCommitSeqNo minConflictCommitSeqNo)
+{
+ TransactionId tailXid;
+ int targetPage;
+ int slotno;
+ int firstZeroPage;
+ bool isNewPage;
+
+ Assert(TransactionIdIsValid(xid));
+
+ targetPage = SerialPage(xid);
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+ * If no serializable transactions are active, there shouldn't be anything
+ * to push out to the SLRU. Hitting this assert would mean there's
+ * something wrong with the earlier cleanup logic.
+ */
+ tailXid = serialControl->tailXid;
+ Assert(TransactionIdIsValid(tailXid));
+
+ /*
+ * If the SLRU is currently unused, zero out the whole active region from
+ * tailXid to headXid before taking it into use. Otherwise zero out only
+ * any new pages that enter the tailXid-headXid range as we advance
+ * headXid.
+ */
+ if (serialControl->headPage < 0)
+ {
+ firstZeroPage = SerialPage(tailXid);
+ isNewPage = true;
+ }
+ else
+ {
+ firstZeroPage = SerialNextPage(serialControl->headPage);
+ isNewPage = SerialPagePrecedesLogically(serialControl->headPage,
+ targetPage);
+ }
+
+ if (!TransactionIdIsValid(serialControl->headXid)
+ || TransactionIdFollows(xid, serialControl->headXid))
+ serialControl->headXid = xid;
+ if (isNewPage)
+ serialControl->headPage = targetPage;
+
+ if (isNewPage)
+ {
+ /* Initialize intervening pages. */
+ while (firstZeroPage != targetPage)
+ {
+ (void) SimpleLruZeroPage(SerialSlruCtl, firstZeroPage);
+ firstZeroPage = SerialNextPage(firstZeroPage);
+ }
+ slotno = SimpleLruZeroPage(SerialSlruCtl, targetPage);
+ }
+ else
+ slotno = SimpleLruReadPage(SerialSlruCtl, targetPage, true, xid);
+
+ SerialValue(slotno, xid) = minConflictCommitSeqNo;
+ SerialSlruCtl->shared->page_dirty[slotno] = true;
+
+ LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Get the minimum commitSeqNo for any conflict out for the given xid. For
+ * a transaction which exists but has no conflict out, InvalidSerCommitSeqNo
+ * will be returned.
+ */
+static SerCommitSeqNo
+SerialGetMinConflictCommitSeqNo(TransactionId xid)
+{
+ TransactionId headXid;
+ TransactionId tailXid;
+ SerCommitSeqNo val;
+ int slotno;
+
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerialSLRULock, LW_SHARED);
+ headXid = serialControl->headXid;
+ tailXid = serialControl->tailXid;
+ LWLockRelease(SerialSLRULock);
+
+ if (!TransactionIdIsValid(headXid))
+ return 0;
+
+ Assert(TransactionIdIsValid(tailXid));
+
+ if (TransactionIdPrecedes(xid, tailXid)
+ || TransactionIdFollows(xid, headXid))
+ return 0;
+
+ /*
+ * The following function must be called without holding SerialSLRULock,
+ * but will return with that lock held, which must then be released.
+ */
+ slotno = SimpleLruReadPage_ReadOnly(SerialSlruCtl,
+ SerialPage(xid), xid);
+ val = SerialValue(slotno, xid);
+ LWLockRelease(SerialSLRULock);
+ return val;
+}
+
+/*
+ * Call this whenever there is a new xmin for active serializable
+ * transactions. We don't need to keep information on transactions which
+ * precede that. InvalidTransactionId means none active, so everything in
+ * the SLRU can be discarded.
+ */
+static void
+SerialSetActiveSerXmin(TransactionId xid)
+{
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /*
+	 * When no sxacts are active, nothing overlaps, so set the xid values to
+ * invalid to show that there are no valid entries. Don't clear headPage,
+ * though. A new xmin might still land on that page, and we don't want to
+ * repeatedly zero out the same page.
+ */
+ if (!TransactionIdIsValid(xid))
+ {
+ serialControl->tailXid = InvalidTransactionId;
+ serialControl->headXid = InvalidTransactionId;
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ /*
+ * When we're recovering prepared transactions, the global xmin might move
+ * backwards depending on the order they're recovered. Normally that's not
+ * OK, but during recovery no serializable transactions will commit, so
+ * the SLRU is empty and we can get away with it.
+ */
+ if (RecoveryInProgress())
+ {
+ Assert(serialControl->headPage < 0);
+ if (!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdPrecedes(xid, serialControl->tailXid))
+ {
+ serialControl->tailXid = xid;
+ }
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ Assert(!TransactionIdIsValid(serialControl->tailXid)
+ || TransactionIdFollows(xid, serialControl->tailXid));
+
+ serialControl->tailXid = xid;
+
+ LWLockRelease(SerialSLRULock);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ *
+ * We don't have any data that needs to survive a restart, but this is a
+ * convenient place to truncate the SLRU.
+ */
+void
+CheckPointPredicate(void)
+{
+ int tailPage;
+
+ LWLockAcquire(SerialSLRULock, LW_EXCLUSIVE);
+
+ /* Exit quickly if the SLRU is currently not in use. */
+ if (serialControl->headPage < 0)
+ {
+ LWLockRelease(SerialSLRULock);
+ return;
+ }
+
+ if (TransactionIdIsValid(serialControl->tailXid))
+ {
+ /* We can truncate the SLRU up to the page containing tailXid */
+ tailPage = SerialPage(serialControl->tailXid);
+ }
+ else
+ {
+ /*----------
+ * The SLRU is no longer needed. Truncate to head before we set head
+ * invalid.
+ *
+ * XXX: It's possible that the SLRU is not needed again until XID
+ * wrap-around has happened, so that the segment containing headPage
+ * that we leave behind will appear to be new again. In that case it
+ * won't be removed until XID horizon advances enough to make it
+ * current again.
+ *
+ * XXX: This should happen in vac_truncate_clog(), not in checkpoints.
+ * Consider this scenario, starting from a system with no in-progress
+ * transactions and VACUUM FREEZE having maximized oldestXact:
+ * - Start a SERIALIZABLE transaction.
+ * - Start, finish, and summarize a SERIALIZABLE transaction, creating
+ * one SLRU page.
+ * - Consume XIDs to reach xidStopLimit.
+ * - Finish all transactions. Due to the long-running SERIALIZABLE
+ * transaction, earlier checkpoints did not touch headPage. The
+ * next checkpoint will change it, but that checkpoint happens after
+ * the end of the scenario.
+ * - VACUUM to advance XID limits.
+ * - Consume ~2M XIDs, crossing the former xidWrapLimit.
+ * - Start, finish, and summarize a SERIALIZABLE transaction.
+ * SerialAdd() declines to create the targetPage, because headPage
+ * is not regarded as in the past relative to that targetPage. The
+ * transaction instigating the summarize fails in
+ * SimpleLruReadPage().
+ */
+ tailPage = serialControl->headPage;
+ serialControl->headPage = -1;
+ }
+
+ LWLockRelease(SerialSLRULock);
+
+ /* Truncate away pages that are no longer required */
+ SimpleLruTruncate(SerialSlruCtl, tailPage);
+
+ /*
+ * Write dirty SLRU pages to disk
+ *
+ * This is not actually necessary from a correctness point of view. We do
+ * it merely as a debugging aid.
+ *
+ * We're doing this after the truncation to avoid writing pages right
+ * before deleting the file in which they sit, which would be completely
+ * pointless.
+ */
+ SimpleLruWriteAll(SerialSlruCtl, true);
+}
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * InitPredicateLocks -- Initialize the predicate locking data structures.
+ *
+ * This is called from CreateSharedMemoryAndSemaphores(), which see for
+ * more comments. In the normal postmaster case, the shared hash tables
+ * are created here. Backends inherit the pointers
+ * to the shared tables via fork(). In the EXEC_BACKEND case, each
+ * backend re-executes this code to obtain pointers to the already existing
+ * shared hash tables.
+ */
+void
+InitPredicateLocks(void)
+{
+ HASHCTL info;
+ long max_table_size;
+ Size requestSize;
+ bool found;
+
+#ifndef EXEC_BACKEND
+ Assert(!IsUnderPostmaster);
+#endif
+
+ /*
+ * Compute size of predicate lock target hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+
+ /*
+ * Allocate hash table for PREDICATELOCKTARGET structs. This stores
+ * per-predicate-lock-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ info.entrysize = sizeof(PREDICATELOCKTARGET);
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Reserve a dummy entry in the hash table; we use it to make sure there's
+ * always one entry available when we need to split or combine a page,
+ * because running out of space there could mean aborting a
+ * non-serializable transaction.
+ */
+ if (!IsUnderPostmaster)
+ {
+ (void) hash_search(PredicateLockTargetHash, &ScratchTargetTag,
+ HASH_ENTER, &found);
+ Assert(!found);
+ }
+
+ /* Pre-calculate the hash and partition lock of the scratch entry */
+ ScratchTargetTagHash = PredicateLockTargetTagHashCode(&ScratchTargetTag);
+ ScratchPartitionLock = PredicateLockHashPartitionLock(ScratchTargetTagHash);
+
+ /*
+ * Allocate hash table for PREDICATELOCK structs. This stores
+ * per-xact-lock-of-a-target information.
+ */
+ info.keysize = sizeof(PREDICATELOCKTAG);
+ info.entrysize = sizeof(PREDICATELOCK);
+ info.hash = predicatelock_hash;
+ info.num_partitions = NUM_PREDICATELOCK_PARTITIONS;
+
+ /* Assume an average of 2 xacts per target */
+ max_table_size *= 2;
+
+ PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION |
+ HASH_PARTITION | HASH_FIXED_SIZE);
+
+ /*
+ * Compute size for serializable transaction hashtable. Note these
+ * calculations must agree with PredicateLockShmemSize!
+ */
+ max_table_size = (MaxBackends + max_prepared_xacts);
+
+ /*
+ * Allocate a list to hold information on transactions participating in
+ * predicate locking.
+ *
+ * Assume an average of 10 predicate locking transactions per backend.
+ * This allows aggressive cleanup while the detailed information is still
+ * present, before data must be summarized for storage in the SLRU and in
+ * the "dummy" OldCommittedSxact transaction.
+ */
+ max_table_size *= 10;
+
+ PredXact = ShmemInitStruct("PredXactList",
+ PredXactListDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ dlist_init(&PredXact->availableList);
+ dlist_init(&PredXact->activeList);
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+ PredXact->WritableSxactCount = 0;
+ PredXact->LastSxactCommitSeqNo = FirstNormalSerCommitSeqNo - 1;
+ PredXact->CanPartialClearThrough = 0;
+ PredXact->HavePartialClearedThrough = 0;
+ requestSize = mul_size((Size) max_table_size,
+ sizeof(SERIALIZABLEXACT));
+ PredXact->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(PredXact->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ LWLockInitialize(&PredXact->element[i].perXactPredicateListLock,
+ LWTRANCHE_PER_XACT_PREDICATE_LIST);
+ dlist_push_tail(&PredXact->availableList, &PredXact->element[i].xactLink);
+ }
+ PredXact->OldCommittedSxact = CreatePredXact();
+ SetInvalidVirtualTransactionId(PredXact->OldCommittedSxact->vxid);
+ PredXact->OldCommittedSxact->prepareSeqNo = 0;
+ PredXact->OldCommittedSxact->commitSeqNo = 0;
+ PredXact->OldCommittedSxact->SeqNo.lastCommitBeforeSnapshot = 0;
+ dlist_init(&PredXact->OldCommittedSxact->outConflicts);
+ dlist_init(&PredXact->OldCommittedSxact->inConflicts);
+ dlist_init(&PredXact->OldCommittedSxact->predicateLocks);
+ dlist_node_init(&PredXact->OldCommittedSxact->finishedLink);
+ dlist_init(&PredXact->OldCommittedSxact->possibleUnsafeConflicts);
+ PredXact->OldCommittedSxact->topXid = InvalidTransactionId;
+ PredXact->OldCommittedSxact->finishedBefore = InvalidTransactionId;
+ PredXact->OldCommittedSxact->xmin = InvalidTransactionId;
+ PredXact->OldCommittedSxact->flags = SXACT_FLAG_COMMITTED;
+ PredXact->OldCommittedSxact->pid = 0;
+ PredXact->OldCommittedSxact->pgprocno = INVALID_PGPROCNO;
+ }
+ /* This never changes, so let's keep a local copy. */
+ OldCommittedSxact = PredXact->OldCommittedSxact;
+
+ /*
+ * Allocate hash table for SERIALIZABLEXID structs. This stores per-xid
+ * information for serializable transactions which have accessed data.
+ */
+ info.keysize = sizeof(SERIALIZABLEXIDTAG);
+ info.entrysize = sizeof(SERIALIZABLEXID);
+
+ SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
+ max_table_size,
+ max_table_size,
+ &info,
+ HASH_ELEM | HASH_BLOBS |
+ HASH_FIXED_SIZE);
+
+ /*
+ * Allocate space for tracking rw-conflicts in lists attached to the
+ * transactions.
+ *
+ * Assume an average of 5 conflicts per transaction. Calculations suggest
+ * that this will prevent resource exhaustion in even the most pessimal
+ * loads up to max_connections = 200 with all 200 connections pounding the
+ * database with serializable transactions. Beyond that, there may be
+ * occasional transactions canceled when trying to flag conflicts. That's
+ * probably OK.
+ */
+ max_table_size *= 5;
+
+ RWConflictPool = ShmemInitStruct("RWConflictPool",
+ RWConflictPoolHeaderDataSize,
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ {
+ int i;
+
+ dlist_init(&RWConflictPool->availableList);
+ requestSize = mul_size((Size) max_table_size,
+ RWConflictDataSize);
+ RWConflictPool->element = ShmemAlloc(requestSize);
+ /* Add all elements to available list, clean. */
+ memset(RWConflictPool->element, 0, requestSize);
+ for (i = 0; i < max_table_size; i++)
+ {
+ dlist_push_tail(&RWConflictPool->availableList,
+ &RWConflictPool->element[i].outLink);
+ }
+ }
+
+ /*
+ * Create or attach to the header for the list of finished serializable
+ * transactions.
+ */
+ FinishedSerializableTransactions = (dlist_head *)
+ ShmemInitStruct("FinishedSerializableTransactions",
+ sizeof(dlist_head),
+ &found);
+ Assert(found == IsUnderPostmaster);
+ if (!found)
+ dlist_init(FinishedSerializableTransactions);
+
+ /*
+ * Initialize the SLRU storage for old committed serializable
+ * transactions.
+ */
+ SerialInit();
+}
+
+/*
+ * Estimate shared-memory space used for predicate lock table
+ */
+Size
+PredicateLockShmemSize(void)
+{
+ Size size = 0;
+ long max_table_size;
+
+ /* predicate lock target hash table */
+ max_table_size = NPREDICATELOCKTARGETENTS();
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCKTARGET)));
+
+ /* predicate lock hash table */
+ max_table_size *= 2;
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(PREDICATELOCK)));
+
+ /*
+ * Since NPREDICATELOCKTARGETENTS is only an estimate, add 10% safety
+ * margin.
+ */
+ size = add_size(size, size / 10);
+
+ /* transaction list */
+ max_table_size = MaxBackends + max_prepared_xacts;
+ max_table_size *= 10;
+ size = add_size(size, PredXactListDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ sizeof(SERIALIZABLEXACT)));
+
+ /* transaction xid table */
+ size = add_size(size, hash_estimate_size(max_table_size,
+ sizeof(SERIALIZABLEXID)));
+
+ /* rw-conflict pool */
+ max_table_size *= 5;
+ size = add_size(size, RWConflictPoolHeaderDataSize);
+ size = add_size(size, mul_size((Size) max_table_size,
+ RWConflictDataSize));
+
+ /* Head for list of finished serializable transactions. */
+ size = add_size(size, sizeof(dlist_head));
+
+ /* Shared memory structures for SLRU tracking of old committed xids. */
+ size = add_size(size, sizeof(SerialControlData));
+ size = add_size(size, SimpleLruShmemSize(NUM_SERIAL_BUFFERS, 0));
+
+ return size;
+}
+
+
+/*
+ * Compute the hash code associated with a PREDICATELOCKTAG.
+ *
+ * Because we want to use just one set of partition locks for both the
+ * PREDICATELOCKTARGET and PREDICATELOCK hash tables, we have to make sure
+ * that PREDICATELOCKs fall into the same partition number as their
+ * associated PREDICATELOCKTARGETs. dynahash.c expects the partition number
+ * to be the low-order bits of the hash code, and therefore a
+ * PREDICATELOCKTAG's hash code must have the same low-order bits as the
+ * associated PREDICATELOCKTARGETTAG's hash code. We achieve this with this
+ * specialized hash function.
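+ *
+ * (PredicateLockHashCodeFromTargetHashCode accomplishes this by folding
+ * the owning SERIALIZABLEXACT's identity into the bits above the
+ * partition number, leaving the low-order partition bits of the target's
+ * hash unchanged.)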
+ */
+static uint32
+predicatelock_hash(const void *key, Size keysize)
+{
+ const PREDICATELOCKTAG *predicatelocktag = (const PREDICATELOCKTAG *) key;
+ uint32 targethash;
+
+ Assert(keysize == sizeof(PREDICATELOCKTAG));
+
+ /* Look into the associated target object, and compute its hash code */
+ targethash = PredicateLockTargetTagHashCode(&predicatelocktag->myTarget->tag);
+
+ return PredicateLockHashCodeFromTargetHashCode(predicatelocktag, targethash);
+}
+
+
+/*
+ * GetPredicateLockStatusData
+ * Return a table containing the internal state of the predicate
+ * lock manager for use in pg_lock_status.
+ *
+ * Like GetLockStatusData, this function tries to hold the partition LWLocks
+ * for as short a time as possible by returning two arrays that simply
+ * contain the PREDICATELOCKTARGETTAG and SERIALIZABLEXACT for each lock
+ * table entry. Multiple copies of the same PREDICATELOCKTARGETTAG and
+ * SERIALIZABLEXACT will likely appear.
+ */
+PredicateLockData *
+GetPredicateLockStatusData(void)
+{
+ PredicateLockData *data;
+ int i;
+ int els,
+ el;
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCK *predlock;
+
+ data = (PredicateLockData *) palloc(sizeof(PredicateLockData));
+
+ /*
+ * To ensure consistency, acquire all partition locks in ascending order
+ * and hold them simultaneously, then take SerializableXactHashLock.
+ */
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Get number of locks and allocate appropriately-sized arrays. */
+ els = hash_get_num_entries(PredicateLockHash);
+ data->nelements = els;
+ data->locktags = (PREDICATELOCKTARGETTAG *)
+ palloc(sizeof(PREDICATELOCKTARGETTAG) * els);
+ data->xacts = (SERIALIZABLEXACT *)
+ palloc(sizeof(SERIALIZABLEXACT) * els);
+
+
+ /* Scan through PredicateLockHash and copy contents */
+ hash_seq_init(&seqstat, PredicateLockHash);
+
+ el = 0;
+
+ while ((predlock = (PREDICATELOCK *) hash_seq_search(&seqstat)))
+ {
+ data->locktags[el] = predlock->tag.myTarget->tag;
+ data->xacts[el] = *predlock->tag.myXact;
+ el++;
+ }
+
+ Assert(el == els);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+
+ return data;
+}
+
+/*
+ * Free up shared memory structures by pushing the oldest sxact (the one at
+ * the front of the SummarizeOldestCommittedSxact queue) into summary form.
+ * Each call will free exactly one SERIALIZABLEXACT structure and may also
+ * free one or more of these structures: SERIALIZABLEXID, PREDICATELOCK,
+ * PREDICATELOCKTARGET, RWConflictData.
+ */
+static void
+SummarizeOldestCommittedSxact(void)
+{
+ SERIALIZABLEXACT *sxact;
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /*
+ * This function is only called if there are no sxact slots available.
+ * Some of them must belong to old, already-finished transactions, so
+ * there should be something in FinishedSerializableTransactions list that
+ * we can summarize. However, there's a race condition: while we were not
+ * holding any locks, a transaction might have ended and cleaned up all
+ * the finished sxact entries already, freeing up their sxact slots. In
+ * that case, we have nothing to do here. The caller will find one of the
+ * slots released by the other backend when it retries.
+ */
+ if (dlist_is_empty(FinishedSerializableTransactions))
+ {
+ LWLockRelease(SerializableFinishedListLock);
+ return;
+ }
+
+ /*
+ * Grab the first sxact off the finished list -- this will be the earliest
+ * commit. Remove it from the list.
+ */
+ sxact = dlist_head_element(SERIALIZABLEXACT, finishedLink,
+ FinishedSerializableTransactions);
+ dlist_delete_thoroughly(&sxact->finishedLink);
+
+ /* Add to SLRU summary information. */
+ if (TransactionIdIsValid(sxact->topXid) && !SxactIsReadOnly(sxact))
+ SerialAdd(sxact->topXid, SxactHasConflictOut(sxact)
+ ? sxact->SeqNo.earliestOutConflictCommit : InvalidSerCommitSeqNo);
+
+ /* Summarize and release the detail. */
+ ReleaseOneSerializableXact(sxact, false, true);
+
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * GetSafeSnapshot
+ * Obtain and register a snapshot for a READ ONLY DEFERRABLE
+ * transaction. Ensures that the snapshot is "safe", i.e. a
+ * read-only transaction running on it can execute serializably
+ * without further checks. This requires waiting for concurrent
+ * transactions to complete, and retrying with a new snapshot if
+ * one of them could possibly create a conflict.
+ *
+ * As with GetSerializableTransactionSnapshot (which this is a subroutine
+ * for), the passed-in Snapshot pointer should reference a static data
+ * area that can safely be passed to GetSnapshotData.
+ */
+static Snapshot
+GetSafeSnapshot(Snapshot origSnapshot)
+{
+ Snapshot snapshot;
+
+ Assert(XactReadOnly && XactDeferrable);
+
+ while (true)
+ {
+ /*
+ * GetSerializableTransactionSnapshotInt is going to call
+ * GetSnapshotData, so we need to provide it the static snapshot area
+ * our caller passed to us. The pointer returned is actually the same
+ * one passed to it, but we avoid assuming that here.
+ */
+ snapshot = GetSerializableTransactionSnapshotInt(origSnapshot,
+ NULL, InvalidPid);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return snapshot; /* no concurrent r/w xacts; it's safe */
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Wait for concurrent transactions to finish. Stop early if one of
+ * them marked us as conflicted.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DEFERRABLE_WAITING;
+ while (!(dlist_is_empty(&MySerializableXact->possibleUnsafeConflicts) ||
+ SxactIsROUnsafe(MySerializableXact)))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ProcWaitForSignal(WAIT_EVENT_SAFE_SNAPSHOT);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ MySerializableXact->flags &= ~SXACT_FLAG_DEFERRABLE_WAITING;
+
+ if (!SxactIsROUnsafe(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ break; /* success */
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ /* else, need to retry... */
+ ereport(DEBUG2,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg_internal("deferrable snapshot was unsafe; trying a new one")));
+ ReleasePredicateLocks(false, false);
+ }
+
+ /*
+ * Now we have a safe snapshot, so we don't need to do any further checks.
+ */
+ Assert(SxactIsROSafe(MySerializableXact));
+ ReleasePredicateLocks(false, true);
+
+ return snapshot;
+}
+
+/*
+ * GetSafeSnapshotBlockingPids
+ * If the specified process is currently blocked in GetSafeSnapshot,
+ * write the process IDs of all processes that it is blocked by
+ * into the caller-supplied buffer output[]. The list is truncated at
+ * output_size, and the number of PIDs written into the buffer is
+ * returned. Returns zero if the given PID is not currently blocked
+ * in GetSafeSnapshot.
+ */
+int
+GetSafeSnapshotBlockingPids(int blocked_pid, int *output, int output_size)
+{
+ int num_written = 0;
+ dlist_iter iter;
+ SERIALIZABLEXACT *blocking_sxact = NULL;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ /* Find blocked_pid's SERIALIZABLEXACT by linear search. */
+ dlist_foreach(iter, &PredXact->activeList)
+ {
+ SERIALIZABLEXACT *sxact =
+ dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+ if (sxact->pid == blocked_pid)
+ {
+ blocking_sxact = sxact;
+ break;
+ }
+ }
+
+ /* Did we find it, and is it currently waiting in GetSafeSnapshot? */
+ if (blocking_sxact != NULL && SxactIsDeferrableWaiting(blocking_sxact))
+ {
+ /* Traverse the list of possible unsafe conflicts collecting PIDs. */
+ dlist_foreach(iter, &blocking_sxact->possibleUnsafeConflicts)
+ {
+ RWConflict possibleUnsafeConflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+
+ output[num_written++] = possibleUnsafeConflict->sxactOut->pid;
+
+ if (num_written >= output_size)
+ break;
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ return num_written;
+}
+
+/*
+ * Acquire a snapshot that can be used for the current transaction.
+ *
+ * Make sure we have a SERIALIZABLEXACT reference in MySerializableXact.
+ * It should be current for this process and be contained in PredXact.
+ *
+ * The passed-in Snapshot pointer should reference a static data area that
+ * can safely be passed to GetSnapshotData. The return value is actually
+ * always this same pointer; no new snapshot data structure is allocated
+ * within this function.
+ */
+Snapshot
+GetSerializableTransactionSnapshot(Snapshot snapshot)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * Can't use serializable mode while recovery is still active, as it is,
+ * for example, on a hot standby. We could get here despite the check in
+ * check_transaction_isolation() if default_transaction_isolation is set
+ * to serializable, so phrase the hint accordingly.
+ */
+ if (RecoveryInProgress())
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("cannot use serializable mode in a hot standby"),
+ errdetail("\"default_transaction_isolation\" is set to \"serializable\"."),
+ errhint("You can use \"SET default_transaction_isolation = 'repeatable read'\" to change the default.")));
+
+ /*
+ * A special optimization is available for SERIALIZABLE READ ONLY
+ * DEFERRABLE transactions -- we can wait for a suitable snapshot and
+ * thereby avoid all SSI overhead once it's running.
+ */
+ if (XactReadOnly && XactDeferrable)
+ return GetSafeSnapshot(snapshot);
+
+ return GetSerializableTransactionSnapshotInt(snapshot,
+ NULL, InvalidPid);
+}
+
+/*
+ * Import a snapshot to be used for the current transaction.
+ *
+ * This is nearly the same as GetSerializableTransactionSnapshot, except that
+ * we don't take a new snapshot, but rather use the data we're handed.
+ *
+ * The caller must have verified that the snapshot came from a serializable
+ * transaction; and if we're read-write, the source transaction must not be
+ * read-only.
+ */
+void
+SetSerializableTransactionSnapshot(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ Assert(IsolationIsSerializable());
+
+ /*
+ * If this is called by parallel.c in a parallel worker, we don't want to
+ * create a SERIALIZABLEXACT just yet because the leader's
+ * SERIALIZABLEXACT will be installed with AttachSerializableXact(). We
+ * also don't want to reject SERIALIZABLE READ ONLY DEFERRABLE in this
+ * case, because the leader has already determined that the snapshot it
+ * has passed us is safe. So there is nothing for us to do.
+ */
+ if (IsParallelWorker())
+ return;
+
+ /*
+ * We do not allow SERIALIZABLE READ ONLY DEFERRABLE transactions to
+ * import snapshots, since there's no way to wait for a safe snapshot when
+ * we're using the snap we're told to. (XXX instead of throwing an error,
+ * we could just ignore the XactDeferrable flag?)
+ */
+ if (XactReadOnly && XactDeferrable)
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("a snapshot-importing transaction must not be READ ONLY DEFERRABLE")));
+
+ (void) GetSerializableTransactionSnapshotInt(snapshot, sourcevxid,
+ sourcepid);
+}
+
+/*
+ * Guts of GetSerializableTransactionSnapshot
+ *
+ * If sourcevxid is valid, this is actually an import operation and we should
+ * skip calling GetSnapshotData, because the snapshot contents are already
+ * loaded up. HOWEVER: to avoid race conditions, we must check that the
+ * source xact is still running after we acquire SerializableXactHashLock.
+ * We do that by calling ProcArrayInstallImportedXmin.
+ */
+static Snapshot
+GetSerializableTransactionSnapshotInt(Snapshot snapshot,
+ VirtualTransactionId *sourcevxid,
+ int sourcepid)
+{
+ PGPROC *proc;
+ VirtualTransactionId vxid;
+ SERIALIZABLEXACT *sxact,
+ *othersxact;
+
+ /* We only do this for serializable transactions. Once. */
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Since all parts of a serializable transaction must use the same
+ * snapshot, it is too late to establish one after a parallel operation
+ * has begun.
+ */
+ if (IsInParallelMode())
+ elog(ERROR, "cannot establish serializable snapshot during a parallel operation");
+
+ proc = MyProc;
+ Assert(proc != NULL);
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+
+ /*
+ * First we get the sxact structure, which may involve looping and access
+ * to the "finished" list to free a structure for use.
+ *
+ * We must hold SerializableXactHashLock when taking/checking the snapshot
+ * to avoid race conditions, for much the same reasons that
+ * GetSnapshotData takes the ProcArrayLock. Since we might have to
+ * release SerializableXactHashLock to call SummarizeOldestCommittedSxact,
+ * this means we have to create the sxact first, which is a bit annoying
+ * (in particular, an elog(ERROR) in procarray.c would cause us to leak
+ * the sxact). Consider refactoring to avoid this.
+ */
+#ifdef TEST_SUMMARIZE_SERIAL
+ SummarizeOldestCommittedSxact();
+#endif
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ do
+ {
+ sxact = CreatePredXact();
+ /* If null, push out committed sxact to SLRU summary & retry. */
+ if (!sxact)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ SummarizeOldestCommittedSxact();
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ }
+ } while (!sxact);
+
+ /* Get the snapshot, or check that it's safe to use */
+ if (!sourcevxid)
+ snapshot = GetSnapshotData(snapshot);
+ else if (!ProcArrayInstallImportedXmin(snapshot->xmin, sourcevxid))
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ errmsg("could not import the requested snapshot"),
+ errdetail("The source process with PID %d is not running anymore.",
+ sourcepid)));
+ }
+
+ /*
+ * If there are no serializable transactions which are not read-only, we
+ * can "opt out" of predicate locking and conflict checking for a
+ * read-only transaction.
+ *
+ * The reason this is safe is that a read-only transaction can only become
+ * part of a dangerous structure if it overlaps a writable transaction
+ * which in turn overlaps a writable transaction which committed before
+ * the read-only transaction started. A new writable transaction can
+ * overlap this one, but it can't meet the other condition of overlapping
+ * a transaction which committed before this one started.
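+ *
+ * (Roughly, in README-SSI notation, the only dangerous structure a
+ * read-only transaction Tro can participate in is
+ *     Tro ---rw---> T1 ---rw---> T2
+ * with T2 committing before Tro started; if no writable transaction is
+ * active when Tro takes its snapshot, no such T1 can exist.)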
+ */
+ if (XactReadOnly && PredXact->WritableSxactCount == 0)
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ return snapshot;
+ }
+
+ /* Initialize the structure. */
+ sxact->vxid = vxid;
+ sxact->SeqNo.lastCommitBeforeSnapshot = PredXact->LastSxactCommitSeqNo;
+ sxact->prepareSeqNo = InvalidSerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ dlist_init(&(sxact->outConflicts));
+ dlist_init(&(sxact->inConflicts));
+ dlist_init(&(sxact->possibleUnsafeConflicts));
+ sxact->topXid = GetTopTransactionIdIfAny();
+ sxact->finishedBefore = InvalidTransactionId;
+ sxact->xmin = snapshot->xmin;
+ sxact->pid = MyProcPid;
+ sxact->pgprocno = MyProc->pgprocno;
+ dlist_init(&sxact->predicateLocks);
+ dlist_node_init(&sxact->finishedLink);
+ sxact->flags = 0;
+ if (XactReadOnly)
+ {
+ dlist_iter iter;
+
+ sxact->flags |= SXACT_FLAG_READ_ONLY;
+
+ /*
+ * Register all concurrent r/w transactions as possible conflicts; if
+ * all of them commit without any outgoing conflicts to earlier
+ * transactions then this snapshot can be deemed safe (and we can run
+ * without tracking predicate locks).
+ */
+ dlist_foreach(iter, &PredXact->activeList)
+ {
+ othersxact = dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+ if (!SxactIsCommitted(othersxact)
+ && !SxactIsDoomed(othersxact)
+ && !SxactIsReadOnly(othersxact))
+ {
+ SetPossibleUnsafeConflict(sxact, othersxact);
+ }
+ }
+
+ /*
+ * If we didn't find any possibly unsafe conflicts because every
+ * uncommitted writable transaction turned out to be doomed, then we
+ * can "opt out" immediately. See comments above the earlier check
+ * for PredXact->WritableSxactCount == 0.
+ */
+ if (dlist_is_empty(&sxact->possibleUnsafeConflicts))
+ {
+ ReleasePredXact(sxact);
+ LWLockRelease(SerializableXactHashLock);
+ return snapshot;
+ }
+ }
+ else
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ /* Maintain serializable global xmin info. */
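+ /*
+ * SxactGlobalXminCount acts as a reference count of active sxacts whose
+ * xmin equals SxactGlobalXmin; the global xmin can only advance once the
+ * last of them is released.
+ */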
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount == 0);
+ PredXact->SxactGlobalXmin = snapshot->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(snapshot->xmin);
+ }
+ else if (TransactionIdEquals(snapshot->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+ else
+ {
+ Assert(TransactionIdFollows(snapshot->xmin, PredXact->SxactGlobalXmin));
+ }
+
+ MySerializableXact = sxact;
+ MyXactDidWrite = false; /* haven't written anything yet */
+
+ LWLockRelease(SerializableXactHashLock);
+
+ CreateLocalPredicateLockHash();
+
+ return snapshot;
+}
+
+static void
+CreateLocalPredicateLockHash(void)
+{
+ HASHCTL hash_ctl;
+
+ /* Initialize the backend-local hash table of parent locks */
+ Assert(LocalPredicateLockHash == NULL);
+ hash_ctl.keysize = sizeof(PREDICATELOCKTARGETTAG);
+ hash_ctl.entrysize = sizeof(LOCALPREDICATELOCK);
+ LocalPredicateLockHash = hash_create("Local predicate lock",
+ max_predicate_locks_per_xact,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS);
+}
+
+/*
+ * Register the top level XID in SerializableXidHash.
+ * Also store it for easy reference in MySerializableXact.
+ */
+void
+RegisterPredicateLockingXid(TransactionId xid)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ bool found;
+
+ /*
+ * If we're not tracking predicate lock data for this transaction, we
+ * should ignore the request and return quickly.
+ */
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* We should have a valid XID and be at the top level. */
+ Assert(TransactionIdIsValid(xid));
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* This should only be done once per transaction. */
+ Assert(MySerializableXact->topXid == InvalidTransactionId);
+
+ MySerializableXact->topXid = xid;
+
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(!found);
+
+ /* Initialize the structure. */
+ sxid->myXact = MySerializableXact;
+ LWLockRelease(SerializableXactHashLock);
+}
+
+
+/*
+ * Check whether there are any predicate locks held by any transaction
+ * for the page at the given block number.
+ *
+ * Note that the transaction may be completed but not yet subject to
+ * cleanup due to overlapping serializable transactions. This must
+ * return valid information regardless of transaction isolation level.
+ *
+ * Also note that this doesn't check for a conflicting relation lock,
+ * just a lock specifically on the given page.
+ *
+ * One use is to support proper behavior during GiST index vacuum.
+ */
+bool
+PageIsPredicateLocked(Relation relation, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ blkno);
+
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &targettag, targettaghash,
+ HASH_FIND, NULL);
+ LWLockRelease(partitionLock);
+
+ return (target != NULL);
+}
+
+
+/*
+ * Check whether a particular lock is held by this transaction.
+ *
+ * Important note: this function may return false even if the lock is
+ * being held, because it uses the local lock table which is not
+ * updated if another transaction modifies our lock list (e.g. to
+ * split an index page). It can also return true when a coarser
+ * granularity lock that covers this target is being held. Be careful
+ * to only use this function in circumstances where such errors are
+ * acceptable!
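+ *
+ * (For example, when another backend splits an index page it copies our
+ * locks to the new page's target in shared memory, but this backend's
+ * LocalPredicateLockHash never learns about that entry.)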
+ */
+static bool
+PredicateLockExists(const PREDICATELOCKTARGETTAG *targettag)
+{
+ LOCALPREDICATELOCK *lock;
+
+ /* check local hash table */
+ lock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ targettag,
+ HASH_FIND, NULL);
+
+ if (!lock)
+ return false;
+
+ /*
+ * Found entry in the table, but still need to check whether it's actually
+ * held -- it could just be a parent of some held lock.
+ */
+ return lock->held;
+}
+
+/*
+ * Return the parent lock tag in the lock hierarchy: the next coarser
+ * lock that covers the provided tag.
+ *
+ * Returns true and sets *parent to the parent tag if one exists,
+ * returns false if none exists.
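+ *
+ * (The hierarchy is tuple -> page -> relation; a relation lock is the
+ * coarsest granularity and therefore has no parent.)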
+ */
+static bool
+GetParentPredicateLockTag(const PREDICATELOCKTARGETTAG *tag,
+ PREDICATELOCKTARGETTAG *parent)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
+ /* relation locks have no parent lock */
+ return false;
+
+ case PREDLOCKTAG_PAGE:
+ /* parent lock is relation lock */
+ SET_PREDICATELOCKTARGETTAG_RELATION(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag));
+
+ return true;
+
+ case PREDLOCKTAG_TUPLE:
+ /* parent lock is page lock */
+ SET_PREDICATELOCKTARGETTAG_PAGE(*parent,
+ GET_PREDICATELOCKTARGETTAG_DB(*tag),
+ GET_PREDICATELOCKTARGETTAG_RELATION(*tag),
+ GET_PREDICATELOCKTARGETTAG_PAGE(*tag));
+ return true;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return false;
+}
+
+/*
+ * Check whether the lock we are considering is already covered by a
+ * coarser lock for our transaction.
+ *
+ * Like PredicateLockExists, this function might return a false
+ * negative, but it will never return a false positive.
+ */
+static bool
+CoarserLockCovers(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ parenttag;
+
+ targettag = *newtargettag;
+
+ /* check parents iteratively until no more */
+ while (GetParentPredicateLockTag(&targettag, &parenttag))
+ {
+ targettag = parenttag;
+ if (PredicateLockExists(&targettag))
+ return true;
+ }
+
+ /* no more parents to check; lock is not covered */
+ return false;
+}
+
+/*
+ * Remove the dummy entry from the predicate lock target hash, to free up some
+ * scratch space. The caller must be holding SerializablePredicateListLock,
+ * and must restore the entry with RestoreScratchTarget() before releasing the
+ * lock.
+ *
+ * If lockheld is true, the caller is already holding the partition lock
+ * of the partition containing the scratch entry.
+ */
+static void
+RemoveScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_REMOVE, &found);
+ Assert(found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * Re-insert the dummy entry in predicate lock target hash.
+ */
+static void
+RestoreScratchTarget(bool lockheld)
+{
+ bool found;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ if (!lockheld)
+ LWLockAcquire(ScratchPartitionLock, LW_EXCLUSIVE);
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ &ScratchTargetTag,
+ ScratchTargetTagHash,
+ HASH_ENTER, &found);
+ Assert(!found);
+ if (!lockheld)
+ LWLockRelease(ScratchPartitionLock);
+}
+
+/*
+ * Check whether the list of related predicate locks is empty for a
+ * predicate lock target, and remove the target if it is.
+ */
+static void
+RemoveTargetIfNoLongerUsed(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ PREDICATELOCKTARGET *rmtarget PG_USED_FOR_ASSERTS_ONLY;
+
+ Assert(LWLockHeldByMe(SerializablePredicateListLock));
+
+ /* Can't remove it until no locks at this target. */
+ if (!dlist_is_empty(&target->predicateLocks))
+ return;
+
+ /* Actually remove the target. */
+ rmtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &target->tag,
+ targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmtarget == target);
+}
+
+/*
+ * Delete child target locks owned by this process.
+ * This implementation assumes that the usage of each target tag field is
+ * uniform. No need to make this hard if we don't have to.
+ *
+ * We acquire an LWLock in the case of parallel mode, because worker
+ * backends have access to the leader's SERIALIZABLEXACT. Otherwise,
+ * we aren't acquiring LWLocks for the predicate lock or lock
+ * target structures associated with this transaction unless we're going
+ * to modify them, because no other process is permitted to modify our
+ * locks.
+ */
+static void
+DeleteChildTargetLocks(const PREDICATELOCKTARGETTAG *newtargettag)
+{
+ SERIALIZABLEXACT *sxact;
+ PREDICATELOCK *predlock;
+ dlist_mutable_iter iter;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ sxact = MySerializableXact;
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(iter, &sxact->predicateLocks)
+ {
+ PREDICATELOCKTAG oldlocktag;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGETTAG oldtargettag;
+
+ predlock = dlist_container(PREDICATELOCK, xactLink, iter.cur);
+
+ oldlocktag = predlock->tag;
+ Assert(oldlocktag.myXact == sxact);
+ oldtarget = oldlocktag.myTarget;
+ oldtargettag = oldtarget->tag;
+
+ if (TargetTagIsCoveredBy(oldtargettag, *newtargettag))
+ {
+ uint32 oldtargettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCK *rmpredlock PG_USED_FOR_ASSERTS_ONLY;
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ partitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ dlist_delete(&predlock->xactLink);
+ dlist_delete(&predlock->targetLink);
+ rmpredlock = hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&oldlocktag,
+ oldtargettaghash),
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == predlock);
+
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+
+ LWLockRelease(partitionLock);
+
+ DecrementParentLocks(&oldtargettag);
+ }
+ }
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * Returns the promotion limit for a given predicate lock target. This is the
+ * max number of descendant locks allowed before promoting to the specified
+ * tag. Note that the limit includes non-direct descendants (e.g., both tuples
+ * and pages for a relation lock).
+ *
+ * Currently the default limit is 2 for a page lock, and half of the value of
+ * max_pred_locks_per_transaction - 1 for a relation lock, to match behavior
+ * of earlier releases when upgrading.
+ *
+ * TODO SSI: We should probably add additional GUCs to allow a maximum ratio
+ * of page and tuple locks based on the pages in a relation, and the maximum
+ * ratio of tuple locks to tuples in a page. This would provide more
+ * generally "balanced" allocation of locks to where they are most useful,
+ * while still allowing the absolute numbers to prevent one relation from
+ * tying up all predicate lock resources.
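+ *
+ * (Illustration, using the default settings of max_pred_locks_per_page = 2,
+ * max_pred_locks_per_relation = -2 and max_pred_locks_per_transaction = 64:
+ * a page lock is acquired once a page accumulates more than 2 tuple locks,
+ * and a relation lock once a relation accumulates more than
+ * 64 / 2 - 1 = 31 descendant (page or tuple) locks.)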
+ */
+static int
+MaxPredicateChildLocks(const PREDICATELOCKTARGETTAG *tag)
+{
+ switch (GET_PREDICATELOCKTARGETTAG_TYPE(*tag))
+ {
+ case PREDLOCKTAG_RELATION:
+ return max_predicate_locks_per_relation < 0
+ ? (max_predicate_locks_per_xact
+ / (-max_predicate_locks_per_relation)) - 1
+ : max_predicate_locks_per_relation;
+
+ case PREDLOCKTAG_PAGE:
+ return max_predicate_locks_per_page;
+
+ case PREDLOCKTAG_TUPLE:
+
+ /*
+ * not reachable: nothing is finer-grained than a tuple, so we
+ * should never try to promote to it.
+ */
+ Assert(false);
+ return 0;
+ }
+
+ /* not reachable */
+ Assert(false);
+ return 0;
+}
+
+/*
+ * For all ancestors of a newly-acquired predicate lock, increment
+ * their child count in the parent hash table. If any of them have
+ * more descendants than their promotion threshold, acquire the
+ * coarsest such lock.
+ *
+ * Returns true if a parent lock was acquired and false otherwise.
+ */
+static bool
+CheckAndPromotePredicateLockRequest(const PREDICATELOCKTARGETTAG *reqtag)
+{
+ PREDICATELOCKTARGETTAG targettag,
+ nexttag,
+ promotiontag;
+ LOCALPREDICATELOCK *parentlock;
+ bool found,
+ promote;
+
+ promote = false;
+
+ targettag = *reqtag;
+
+ /* check parents iteratively */
+ while (GetParentPredicateLockTag(&targettag, &nexttag))
+ {
+ targettag = nexttag;
+ parentlock = (LOCALPREDICATELOCK *) hash_search(LocalPredicateLockHash,
+ &targettag,
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ parentlock->held = false;
+ parentlock->childLocks = 1;
+ }
+ else
+ parentlock->childLocks++;
+
+ if (parentlock->childLocks >
+ MaxPredicateChildLocks(&targettag))
+ {
+ /*
+ * We should promote to this parent lock. Continue to check its
+ * ancestors, however, both to get their child counts right and to
+ * check whether we should just go ahead and promote to one of
+ * them.
+ */
+ promotiontag = targettag;
+ promote = true;
+ }
+ }
+
+ if (promote)
+ {
+ /* acquire coarsest ancestor eligible for promotion */
+ PredicateLockAcquire(&promotiontag);
+ return true;
+ }
+ else
+ return false;
+}
+
+/*
+ * When releasing a lock, decrement the child count on all ancestor
+ * locks.
+ *
+ * This is called only when releasing a lock via
+ * DeleteChildTargetLocks (i.e. when a lock becomes redundant because
+ * we've acquired its parent, possibly due to promotion) or when a new
+ * MVCC write lock makes the predicate lock unnecessary. There's no
+ * point in calling it when locks are released at transaction end, as
+ * this information is no longer needed.
+ */
+static void
+DecrementParentLocks(const PREDICATELOCKTARGETTAG *targettag)
+{
+ PREDICATELOCKTARGETTAG parenttag,
+ nexttag;
+
+ parenttag = *targettag;
+
+ while (GetParentPredicateLockTag(&parenttag, &nexttag))
+ {
+ uint32 targettaghash;
+ LOCALPREDICATELOCK *parentlock,
+ *rmlock PG_USED_FOR_ASSERTS_ONLY;
+
+ parenttag = nexttag;
+ targettaghash = PredicateLockTargetTagHashCode(&parenttag);
+ parentlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_FIND, NULL);
+
+ /*
+ * There's a small chance the parent lock doesn't exist in the lock
+ * table. This can happen if we prematurely removed it because an
+ * index split caused the child refcount to be off.
+ */
+ if (parentlock == NULL)
+ continue;
+
+ parentlock->childLocks--;
+
+ /*
+ * Under similar circumstances the parent lock's child count might already
+ * have been zero, so the decrement above leaves it negative. This only
+ * happens if we're holding that lock (otherwise we would have removed the
+ * entry).
+ */
+ if (parentlock->childLocks < 0)
+ {
+ Assert(parentlock->held);
+ parentlock->childLocks = 0;
+ }
+
+ if ((parentlock->childLocks == 0) && (!parentlock->held))
+ {
+ rmlock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ &parenttag, targettaghash,
+ HASH_REMOVE, NULL);
+ Assert(rmlock == parentlock);
+ }
+ }
+}
+
+/*
+ * Indicate that a predicate lock on the given target is held by the
+ * specified transaction. Has no effect if the lock is already held.
+ *
+ * This updates the lock table and the sxact's lock list, and creates
+ * the lock target if necessary, but does *not* do anything related to
+ * granularity promotion or the local lock table. See
+ * PredicateLockAcquire for that.
+ */
+static void
+CreatePredicateLock(const PREDICATELOCKTARGETTAG *targettag,
+ uint32 targettaghash,
+ SERIALIZABLEXACT *sxact)
+{
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTAG locktag;
+ PREDICATELOCK *lock;
+ LWLock *partitionLock;
+ bool found;
+
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /* Make sure that the target is represented. */
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_ENTER_NULL, &found);
+ if (!target)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+ if (!found)
+ dlist_init(&target->predicateLocks);
+
+ /* We've got the sxact and target, make sure they're joined. */
+ locktag.myTarget = target;
+ locktag.myXact = sxact;
+ lock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash, &locktag,
+ PredicateLockHashCodeFromTargetHashCode(&locktag, targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!lock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+
+ if (!found)
+ {
+ dlist_push_tail(&target->predicateLocks, &lock->targetLink);
+ dlist_push_tail(&sxact->predicateLocks, &lock->xactLink);
+ lock->commitSeqNo = InvalidSerCommitSeqNo;
+ }
+
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * Acquire a predicate lock on the specified target for the current
+ * connection if not already held. This updates the local lock table
+ * and uses it to implement granularity promotion. It will consolidate
+ * multiple locks into a coarser lock if warranted, and will release
+ * any finer-grained locks covered by the new one.
+ */
+static void
+PredicateLockAcquire(const PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ bool found;
+ LOCALPREDICATELOCK *locallock;
+
+ /* Do we have the lock already, or a covering lock? */
+ if (PredicateLockExists(targettag))
+ return;
+
+ if (CoarserLockCovers(targettag))
+ return;
+
+ /* The same hash and LW lock apply to the lock target and the local lock. */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+
+ /* Acquire lock in local table */
+ locallock = (LOCALPREDICATELOCK *)
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_ENTER, &found);
+ locallock->held = true;
+ if (!found)
+ locallock->childLocks = 0;
+
+ /* Actually create the lock */
+ CreatePredicateLock(targettag, targettaghash, MySerializableXact);
+
+ /*
+ * Lock has been acquired. Check whether it should be promoted to a
+ * coarser granularity, or whether there are finer-granularity locks to
+ * clean up.
+ */
+ if (CheckAndPromotePredicateLockRequest(targettag))
+ {
+ /*
+ * Lock request was promoted to a coarser-granularity lock, and that
+ * lock was acquired. It will delete this lock and any of its
+ * children, so we're done.
+ */
+ }
+ else
+ {
+ /* Clean up any finer-granularity locks */
+ if (GET_PREDICATELOCKTARGETTAG_TYPE(*targettag) != PREDLOCKTAG_TUPLE)
+ DeleteChildTargetLocks(targettag);
+ }
+}
+
+
+/*
+ * PredicateLockRelation
+ *
+ * Gets a predicate lock at the relation level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockRelation(Relation relation, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_locator.dbOid,
+ relation->rd_id);
+ PredicateLockAcquire(&tag);
+}
+
+/*
+ * PredicateLockPage
+ *
+ * Gets a predicate lock at the page level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ * Skip if a coarser predicate lock already covers this page.
+ * Clear any finer-grained predicate locks this session has on the relation.
+ */
+void
+PredicateLockPage(Relation relation, BlockNumber blkno, Snapshot snapshot)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(tag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ blkno);
+ PredicateLockAcquire(&tag);
+}
+
+/*
+ * PredicateLockTID
+ *
+ * Gets a predicate lock at the tuple level.
+ * Skip if not in full serializable transaction isolation level.
+ * Skip if this is a temporary table.
+ */
+void
+PredicateLockTID(Relation relation, ItemPointer tid, Snapshot snapshot,
+ TransactionId tuple_xid)
+{
+ PREDICATELOCKTARGETTAG tag;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /*
+ * Return if this xact wrote it.
+ */
+ if (relation->rd_index == NULL)
+ {
+ /* If we wrote it, we already have a write lock. */
+ if (TransactionIdIsCurrentTransactionId(tuple_xid))
+ return;
+ }
+
+ /*
+ * Do quick-but-not-definitive test for a relation lock first. This will
+ * never cause a return when the relation is *not* locked, but will
+ * occasionally let the check continue when there really *is* a relation
+ * level lock.
+ */
+ SET_PREDICATELOCKTARGETTAG_RELATION(tag,
+ relation->rd_locator.dbOid,
+ relation->rd_id);
+ if (PredicateLockExists(&tag))
+ return;
+
+ SET_PREDICATELOCKTARGETTAG_TUPLE(tag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ PredicateLockAcquire(&tag);
+}
+
+
+/*
+ * DeleteLockTarget
+ *
+ * Remove a predicate lock target along with any locks held for it.
+ *
+ * Caller must hold SerializablePredicateListLock and the
+ * appropriate hash partition lock for the target.
+ */
+static void
+DeleteLockTarget(PREDICATELOCKTARGET *target, uint32 targettaghash)
+{
+ dlist_mutable_iter iter;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+ Assert(LWLockHeldByMe(PredicateLockHashPartitionLock(targettaghash)));
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(iter, &target->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, targetLink, iter.cur);
+ bool found;
+
+ dlist_delete(&(predlock->xactLink));
+ dlist_delete(&(predlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &predlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&predlock->tag,
+ targettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /* Remove the target itself, if possible. */
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+}
+
+
+/*
+ * TransferPredicateLocksToNewTarget
+ *
+ * Move or copy all the predicate locks for a lock target, for use by
+ * index page splits/combines and other things that create or replace
+ * lock targets. If 'removeOld' is true, the old locks and the target
+ * will be removed.
+ *
+ * Returns true on success, or false if we ran out of shared memory to
+ * allocate the new target or locks. Guaranteed to always succeed if
+ * removeOld is set (by using the scratch entry in PredicateLockTargetHash
+ * for scratch space).
+ *
+ * Warning: the "removeOld" option should be used only with care,
+ * because this function does not (indeed, can not) update other
+ * backends' LocalPredicateLockHash. If we are only adding new
+ * entries, this is not a problem: the local lock table is used only
+ * as a hint, so missing entries for locks that are held are
+ * OK. Having entries for locks that are no longer held, as can happen
+ * when using "removeOld", is not in general OK. We can only use it
+ * safely when replacing a lock with a coarser-granularity lock that
+ * covers it, or if we are absolutely certain that no one will need to
+ * refer to that lock in the future.
+ *
+ * Caller must hold SerializablePredicateListLock exclusively.
+ */
+static bool
+TransferPredicateLocksToNewTarget(PREDICATELOCKTARGETTAG oldtargettag,
+ PREDICATELOCKTARGETTAG newtargettag,
+ bool removeOld)
+{
+ uint32 oldtargettaghash;
+ LWLock *oldpartitionLock;
+ PREDICATELOCKTARGET *oldtarget;
+ uint32 newtargettaghash;
+ LWLock *newpartitionLock;
+ bool found;
+ bool outOfShmem = false;
+
+ Assert(LWLockHeldByMeInMode(SerializablePredicateListLock,
+ LW_EXCLUSIVE));
+
+ oldtargettaghash = PredicateLockTargetTagHashCode(&oldtargettag);
+ newtargettaghash = PredicateLockTargetTagHashCode(&newtargettag);
+ oldpartitionLock = PredicateLockHashPartitionLock(oldtargettaghash);
+ newpartitionLock = PredicateLockHashPartitionLock(newtargettaghash);
+
+ if (removeOld)
+ {
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll
+ * be able to create the new lock target.
+ */
+ RemoveScratchTarget(false);
+ }
+
+ /*
+ * We must get the partition locks in ascending sequence to avoid
+ * deadlocks. If old and new partitions are the same, we must request the
+ * lock only once.
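+ *
+ * (Ascending order here means by LWLock address; the partition locks live
+ * in shared memory mapped at the same address in every backend, so all
+ * backends see the same ordering.)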
+ */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(oldpartitionLock,
+ (removeOld ? LW_EXCLUSIVE : LW_SHARED));
+ }
+ else
+ LWLockAcquire(newpartitionLock, LW_EXCLUSIVE);
+
+ /*
+ * Look for the old target. If not found, that's OK; no predicate locks
+ * are affected, so we can just clean up and return. If it does exist,
+ * walk its list of predicate locks and move or copy them to the new
+ * target.
+ */
+ oldtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &oldtargettag,
+ oldtargettaghash,
+ HASH_FIND, NULL);
+
+ if (oldtarget)
+ {
+ PREDICATELOCKTARGET *newtarget;
+ PREDICATELOCKTAG newpredlocktag;
+ dlist_mutable_iter iter;
+
+ newtarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &newtargettag,
+ newtargettaghash,
+ HASH_ENTER_NULL, &found);
+
+ if (!newtarget)
+ {
+ /* Failed to allocate due to insufficient shmem */
+ outOfShmem = true;
+ goto exit;
+ }
+
+ /* If we created a new entry, initialize it */
+ if (!found)
+ dlist_init(&newtarget->predicateLocks);
+
+ newpredlocktag.myTarget = newtarget;
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ dlist_foreach_modify(iter, &oldtarget->predicateLocks)
+ {
+ PREDICATELOCK *oldpredlock =
+ dlist_container(PREDICATELOCK, targetLink, iter.cur);
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo = oldpredlock->commitSeqNo;
+
+ newpredlocktag.myXact = oldpredlock->tag.myXact;
+
+ if (removeOld)
+ {
+ dlist_delete(&(oldpredlock->xactLink));
+ dlist_delete(&(oldpredlock->targetLink));
+
+ hash_search_with_hash_value
+ (PredicateLockHash,
+ &oldpredlock->tag,
+ PredicateLockHashCodeFromTargetHashCode(&oldpredlock->tag,
+ oldtargettaghash),
+ HASH_REMOVE, &found);
+ Assert(found);
+ }
+
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ newtargettaghash),
+ HASH_ENTER_NULL,
+ &found);
+ if (!newpredlock)
+ {
+ /* Out of shared memory. Undo what we've done so far. */
+ LWLockRelease(SerializableXactHashLock);
+ DeleteLockTarget(newtarget, newtargettaghash);
+ outOfShmem = true;
+ goto exit;
+ }
+ if (!found)
+ {
+ dlist_push_tail(&(newtarget->predicateLocks),
+ &(newpredlock->targetLink));
+ dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ if (removeOld)
+ {
+ Assert(dlist_is_empty(&oldtarget->predicateLocks));
+ RemoveTargetIfNoLongerUsed(oldtarget, oldtargettaghash);
+ }
+ }
+
+
+exit:
+ /* Release partition locks in reverse order of acquisition. */
+ if (oldpartitionLock < newpartitionLock)
+ {
+ LWLockRelease(newpartitionLock);
+ LWLockRelease(oldpartitionLock);
+ }
+ else if (oldpartitionLock > newpartitionLock)
+ {
+ LWLockRelease(oldpartitionLock);
+ LWLockRelease(newpartitionLock);
+ }
+ else
+ LWLockRelease(newpartitionLock);
+
+ if (removeOld)
+ {
+ /* We shouldn't run out of memory if we're moving locks */
+ Assert(!outOfShmem);
+
+ /* Put the scratch entry back */
+ RestoreScratchTarget(false);
+ }
+
+ return !outOfShmem;
+}
+
+/*
+ * Drop all predicate locks of any granularity from the specified relation,
+ * which can be a heap relation or an index relation. If 'transfer' is true,
+ * acquire a relation lock on the heap for any transactions with any lock(s)
+ * on the specified relation.
+ *
+ * This requires grabbing a lot of LW locks and scanning the entire lock
+ * target table for matches. That makes this more expensive than most
+ * predicate lock management functions, but it will only be called for DDL
+ * type commands that are expensive anyway, and there are fast returns when
+ * no serializable transactions are active or the relation is temporary.
+ *
+ * We don't use the TransferPredicateLocksToNewTarget function because it
+ * acquires its own locks on the partitions of the two targets involved,
+ * and we'll already be holding all partition locks.
+ *
+ * We can't throw an error from here, because the call could be from a
+ * transaction which is not serializable.
+ *
+ * NOTE: This is currently only called with transfer set to true, but that may
+ * change. If we decide to clean up the locks from a table on commit of a
+ * transaction which executed DROP TABLE, the false condition will be useful.
+ */
+static void
+DropAllPredicateLocksFromTable(Relation relation, bool transfer)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *oldtarget;
+ PREDICATELOCKTARGET *heaptarget;
+ Oid dbId;
+ Oid relId;
+ Oid heapId;
+ int i;
+ bool isIndex;
+ bool found;
+ uint32 heaptargettaghash;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ dbId = relation->rd_locator.dbOid;
+ relId = relation->rd_id;
+ if (relation->rd_index == NULL)
+ {
+ isIndex = false;
+ heapId = relId;
+ }
+ else
+ {
+ isIndex = true;
+ heapId = relation->rd_index->indrelid;
+ }
+ Assert(heapId != InvalidOid);
+ Assert(transfer || !isIndex); /* index OID only makes sense with
+ * transfer */
+
+ /* Retrieve first time needed, then keep. */
+ heaptargettaghash = 0;
+ heaptarget = NULL;
+
+ /* Acquire locks on all lock partitions */
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the dummy entry to give us scratch space, so we know we'll be
+ * able to create the new lock target.
+ */
+ if (transfer)
+ RemoveScratchTarget(true);
+
+ /* Scan through target map */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((oldtarget = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ dlist_mutable_iter iter;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(oldtarget->tag) != relId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(oldtarget->tag) != dbId)
+ continue; /* wrong database id */
+ if (transfer && !isIndex
+ && GET_PREDICATELOCKTARGETTAG_TYPE(oldtarget->tag) == PREDLOCKTAG_RELATION)
+ continue; /* already the right lock */
+
+ /*
+ * If we made it here, we have work to do. We make sure the heap
+ * relation lock exists, then we walk the list of predicate locks for
+ * the old target we found, moving all locks to the heap relation lock
+ * -- unless they already hold that.
+ */
+
+ /*
+ * First make sure we have the heap relation target. We only need to
+ * do this once.
+ */
+ if (transfer && heaptarget == NULL)
+ {
+ PREDICATELOCKTARGETTAG heaptargettag;
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(heaptargettag, dbId, heapId);
+ heaptargettaghash = PredicateLockTargetTagHashCode(&heaptargettag);
+ heaptarget = hash_search_with_hash_value(PredicateLockTargetHash,
+ &heaptargettag,
+ heaptargettaghash,
+ HASH_ENTER, &found);
+ if (!found)
+ dlist_init(&heaptarget->predicateLocks);
+ }
+
+ /*
+ * Loop through all the locks on the old target, replacing them with
+ * locks on the new target.
+ */
+ dlist_foreach_modify(iter, &oldtarget->predicateLocks)
+ {
+ PREDICATELOCK *oldpredlock =
+ dlist_container(PREDICATELOCK, targetLink, iter.cur);
+ PREDICATELOCK *newpredlock;
+ SerCommitSeqNo oldCommitSeqNo;
+ SERIALIZABLEXACT *oldXact;
+
+ /*
+ * Remove the old lock first. This avoids the chance of running
+ * out of lock structure entries for the hash table.
+ */
+ oldCommitSeqNo = oldpredlock->commitSeqNo;
+ oldXact = oldpredlock->tag.myXact;
+
+ dlist_delete(&(oldpredlock->xactLink));
+
+ /*
+ * No need for retail delete from oldtarget list, we're removing
+ * the whole target anyway.
+ */
+ hash_search(PredicateLockHash,
+ &oldpredlock->tag,
+ HASH_REMOVE, &found);
+ Assert(found);
+
+ if (transfer)
+ {
+ PREDICATELOCKTAG newpredlocktag;
+
+ newpredlocktag.myTarget = heaptarget;
+ newpredlocktag.myXact = oldXact;
+ newpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &newpredlocktag,
+ PredicateLockHashCodeFromTargetHashCode(&newpredlocktag,
+ heaptargettaghash),
+ HASH_ENTER,
+ &found);
+ if (!found)
+ {
+ dlist_push_tail(&(heaptarget->predicateLocks),
+ &(newpredlock->targetLink));
+ dlist_push_tail(&(newpredlocktag.myXact->predicateLocks),
+ &(newpredlock->xactLink));
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+ else
+ {
+ if (newpredlock->commitSeqNo < oldCommitSeqNo)
+ newpredlock->commitSeqNo = oldCommitSeqNo;
+ }
+
+ Assert(newpredlock->commitSeqNo != 0);
+ Assert((newpredlock->commitSeqNo == InvalidSerCommitSeqNo)
+ || (newpredlock->tag.myXact == OldCommittedSxact));
+ }
+ }
+
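+ /* All locks on the old target are gone; remove the target itself. */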
+ hash_search(PredicateLockTargetHash, &oldtarget->tag, HASH_REMOVE,
+ &found);
+ Assert(found);
+ }
+
+ /* Put the scratch entry back */
+ if (transfer)
+ RestoreScratchTarget(true);
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * TransferPredicateLocksToHeapRelation
+ * For all transactions, transfer all predicate locks for the given
+ * relation to a single relation lock on the heap.
+ */
+void
+TransferPredicateLocksToHeapRelation(Relation relation)
+{
+ DropAllPredicateLocksFromTable(relation, true);
+}
+
+
+/*
+ * PredicateLockPageSplit
+ *
+ * Copies any predicate locks for the old page to the new page.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page split (or overflow) affects all serializable transactions,
+ * even if it occurs in the context of another transaction isolation level.
+ *
+ * NOTE: This currently leaves the local copy of the locks without
+ * information on the new lock which is in shared memory. This could cause
+ * problems if enough page splits occur on locked pages without the processes
+ * which hold the locks getting in and noticing.
+ */
+void
+PredicateLockPageSplit(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ PREDICATELOCKTARGETTAG oldtargettag;
+ PREDICATELOCKTARGETTAG newtargettag;
+ bool success;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ *
+ * It's safe to do this check without taking any additional locks. Even if
+ * a serializable transaction starts concurrently, we know it can't take
+ * any SIREAD locks on the page being split because the caller is holding
+ * the associated buffer page lock. Memory reordering isn't an issue; the
+ * memory barrier in the LWLock acquisition guarantees that this read
+ * occurs while the buffer page lock is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!PredicateLockingNeededForRelation(relation))
+ return;
+
+ Assert(oldblkno != newblkno);
+ Assert(BlockNumberIsValid(oldblkno));
+ Assert(BlockNumberIsValid(newblkno));
+
+ SET_PREDICATELOCKTARGETTAG_PAGE(oldtargettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ oldblkno);
+ SET_PREDICATELOCKTARGETTAG_PAGE(newtargettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ newblkno);
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+
+ /*
+ * Try copying the locks over to the new page's tag, creating it if
+ * necessary.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ false);
+
+ if (!success)
+ {
+ /*
+ * No more predicate lock entries are available. Failure isn't an
+ * option here, so promote the page lock to a relation lock.
+ */
+
+ /* Get the parent relation lock's lock tag */
+ success = GetParentPredicateLockTag(&oldtargettag,
+ &newtargettag);
+ Assert(success);
+
+ /*
+ * Move the locks to the parent. This shouldn't fail.
+ *
+ * Note that here we are removing locks held by other backends,
+ * leading to a possible inconsistency in their local lock hash table.
+ * This is OK because we're replacing it with a lock that covers the
+ * old one.
+ */
+ success = TransferPredicateLocksToNewTarget(oldtargettag,
+ newtargettag,
+ true);
+ Assert(success);
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+/*
+ * PredicateLockPageCombine
+ *
+ * Combines predicate locks for two existing pages.
+ * Skip if this is a temporary table or toast table.
+ *
+ * NOTE: A page combine affects all serializable transactions, even if it
+ * occurs in the context of another transaction isolation level.
+ */
+void
+PredicateLockPageCombine(Relation relation, BlockNumber oldblkno,
+ BlockNumber newblkno)
+{
+ /*
+ * Page combines differ from page splits in that we ought to be able to
+ * remove the locks on the old page after transferring them to the new
+ * page, instead of duplicating them. However, because we can't edit other
+ * backends' local lock tables, removing the old lock would leave them
+ * with an entry in their LocalPredicateLockHash for a lock they're not
+ * holding, which isn't acceptable. So we wind up having to do the same
+ * work as a page split, acquiring a lock on the new page and keeping the
+ * old page locked too. That can lead to some false positives, but should
+ * be rare in practice.
+ */
+ PredicateLockPageSplit(relation, oldblkno, newblkno);
+}
+
+/*
+ * Walk the list of in-progress serializable transactions and find the new
+ * xmin.
+ */
+static void
+SetNewSxactGlobalXmin(void)
+{
+ dlist_iter iter;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ PredXact->SxactGlobalXmin = InvalidTransactionId;
+ PredXact->SxactGlobalXminCount = 0;
+
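+ /* Find the oldest xmin still in use, counting how many transactions share it. */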
+ dlist_foreach(iter, &PredXact->activeList)
+ {
+ SERIALIZABLEXACT *sxact =
+ dlist_container(SERIALIZABLEXACT, xactLink, iter.cur);
+
+ if (!SxactIsRolledBack(sxact)
+ && !SxactIsCommitted(sxact)
+ && sxact != OldCommittedSxact)
+ {
+ Assert(sxact->xmin != InvalidTransactionId);
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedes(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ }
+ else if (TransactionIdEquals(sxact->xmin,
+ PredXact->SxactGlobalXmin))
+ PredXact->SxactGlobalXminCount++;
+ }
+ }
+
+ SerialSetActiveSerXmin(PredXact->SxactGlobalXmin);
+}
+
+/*
+ * ReleasePredicateLocks
+ *
+ * Releases predicate locks based on completion of the current transaction,
+ * whether committed or rolled back. It can also be called for a read only
+ * transaction when it becomes impossible for the transaction to become
+ * part of a dangerous structure.
+ *
+ * We do nothing unless this is a serializable transaction.
+ *
+ * This method must ensure that shared memory hash tables are cleaned
+ * up in some relatively timely fashion.
+ *
+ * If this transaction is committing and is holding any predicate locks,
+ * it must be added to a list of completed serializable transactions still
+ * holding locks.
+ *
+ * If isReadOnlySafe is true, then predicate locks are being released before
+ * the end of the transaction because MySerializableXact has been determined
+ * to be RO_SAFE. In non-parallel mode we can release it completely, but in
+ * parallel mode we partially release the SERIALIZABLEXACT and keep it
+ * around until the end of the transaction, allowing each backend to clear its
+ * MySerializableXact variable and benefit from the optimization in its own
+ * time.
+ */
+void
+ReleasePredicateLocks(bool isCommit, bool isReadOnlySafe)
+{
+ bool partiallyReleasing = false;
+ bool needToClear;
+ SERIALIZABLEXACT *roXact;
+ dlist_mutable_iter iter;
+
+ /*
+ * We can't trust XactReadOnly here, because a transaction which started
+ * as READ WRITE can show as READ ONLY later, e.g., within
+ * subtransactions. We want to flag a transaction as READ ONLY if it
+ * commits without writing, so that de facto READ ONLY transactions get the
+ * benefit of some RO optimizations. We therefore use this local variable
+ * for the cleanup logic that depends on whether the transaction was
+ * declared READ ONLY at the top level.
+ */
+ bool topLevelIsDeclaredReadOnly;
+
+ /* We can't be both committing and releasing early due to RO_SAFE. */
+ Assert(!(isCommit && isReadOnlySafe));
+
+ /* Are we at the end of a transaction, that is, a commit or abort? */
+ if (!isReadOnlySafe)
+ {
+ /*
+ * Parallel workers mustn't release predicate locks at the end of
+ * their transaction. The leader will do that at the end of its
+ * transaction.
+ */
+ if (IsParallelWorker())
+ {
+ ReleasePredicateLocksLocal();
+ return;
+ }
+
+ /*
+ * By the time the leader in a parallel query reaches end of
+ * transaction, it has waited for all workers to exit.
+ */
+ Assert(!ParallelContextActive());
+
+ /*
+ * If the leader in a parallel query earlier stashed a partially
+ * released SERIALIZABLEXACT for final clean-up at end of transaction
+ * (because workers might still have been accessing it), then it's
+ * time to restore it.
+ */
+ if (SavedSerializableXact != InvalidSerializableXact)
+ {
+ Assert(MySerializableXact == InvalidSerializableXact);
+ MySerializableXact = SavedSerializableXact;
+ SavedSerializableXact = InvalidSerializableXact;
+ Assert(SxactIsPartiallyReleased(MySerializableXact));
+ }
+ }
+
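+ /* Nothing to do if this backend has no SERIALIZABLEXACT. */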
+ if (MySerializableXact == InvalidSerializableXact)
+ {
+ Assert(LocalPredicateLockHash == NULL);
+ return;
+ }
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * If the transaction is committing but has already been partially
+ * released, treat it as a rollback; it was marked as rolled back when it
+ * was partially released.
+ */
+ if (isCommit && SxactIsPartiallyReleased(MySerializableXact))
+ isCommit = false;
+
+ /*
+ * If we're called in the middle of a transaction because we discovered
+ * that the SXACT_FLAG_RO_SAFE flag was set, then we'll partially release
+ * it (that is, release the predicate locks and conflicts, but not the
+ * SERIALIZABLEXACT itself) if we're the first backend to have noticed.
+ */
+ if (isReadOnlySafe && IsInParallelMode())
+ {
+ /*
+ * The leader needs to stash a pointer to it, so that it can
+ * completely release it at end-of-transaction.
+ */
+ if (!IsParallelWorker())
+ SavedSerializableXact = MySerializableXact;
+
+ /*
+ * The first backend to reach this condition will partially release
+ * the SERIALIZABLEXACT. All others will just clear their
+ * backend-local state so that they stop doing SSI checks for the rest
+ * of the transaction.
+ */
+ if (SxactIsPartiallyReleased(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ReleasePredicateLocksLocal();
+ return;
+ }
+ else
+ {
+ MySerializableXact->flags |= SXACT_FLAG_PARTIALLY_RELEASED;
+ partiallyReleasing = true;
+ /* ... and proceed to perform the partial release below. */
+ }
+ }
+ Assert(!isCommit || SxactIsPrepared(MySerializableXact));
+ Assert(!isCommit || !SxactIsDoomed(MySerializableXact));
+ Assert(!SxactIsCommitted(MySerializableXact));
+ Assert(SxactIsPartiallyReleased(MySerializableXact)
+ || !SxactIsRolledBack(MySerializableXact));
+
+ /* may not be serializable during COMMIT/ROLLBACK PREPARED */
+ Assert(MySerializableXact->pid == 0 || IsolationIsSerializable());
+
+ /* We'd better not already be on the cleanup list. */
+ Assert(!SxactIsOnFinishedList(MySerializableXact));
+
+ topLevelIsDeclaredReadOnly = SxactIsReadOnly(MySerializableXact);
+
+ /*
+ * We don't hold XidGenLock lock here, assuming that TransactionId is
+ * atomic!
+ *
+ * If this value is changing, we don't care that much whether we get the
+ * old or new value -- it is just used to determine how far
+ * SxactGlobalXmin must advance before this transaction can be fully
+ * cleaned up. The worst that could happen is we wait for one more
+ * transaction to complete before freeing some RAM; correctness of visible
+ * behavior is not affected.
+ */
+ MySerializableXact->finishedBefore = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+ /*
+ * If it's not a commit it's either a rollback or a read-only transaction
+ * flagged SXACT_FLAG_RO_SAFE, and we can clear our locks immediately.
+ */
+ if (isCommit)
+ {
+ MySerializableXact->flags |= SXACT_FLAG_COMMITTED;
+ MySerializableXact->commitSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ /* Recognize implicit read-only transaction (commit without write). */
+ if (!MyXactDidWrite)
+ MySerializableXact->flags |= SXACT_FLAG_READ_ONLY;
+ }
+ else
+ {
+ /*
+ * The DOOMED flag indicates that we intend to roll back this
+ * transaction and so it should not cause serialization failures for
+ * other transactions that conflict with it. Note that this flag might
+ * already be set, if another backend marked this transaction for
+ * abort.
+ *
+ * The ROLLED_BACK flag further indicates that ReleasePredicateLocks
+ * has been called, and so the SerializableXact is eligible for
+ * cleanup. This means it should not be considered when calculating
+ * SxactGlobalXmin.
+ */
+ MySerializableXact->flags |= SXACT_FLAG_DOOMED;
+ MySerializableXact->flags |= SXACT_FLAG_ROLLED_BACK;
+
+ /*
+ * If the transaction was previously prepared, but is now failing due
+ * to a ROLLBACK PREPARED or (hopefully very rare) error after the
+ * prepare, clear the prepared flag. This simplifies conflict
+ * checking.
+ */
+ MySerializableXact->flags &= ~SXACT_FLAG_PREPARED;
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ Assert(PredXact->WritableSxactCount > 0);
+ if (--(PredXact->WritableSxactCount) == 0)
+ {
+ /*
+ * Release predicate locks and rw-conflicts in for all committed
+ * transactions. There are no longer any transactions which might
+ * conflict with the locks and no chance for new transactions to
+ * overlap. Similarly, existing conflicts in can't cause pivots,
+ * and any conflicts in which could have completed a dangerous
+ * structure would already have caused a rollback, so any
+ * remaining ones must be benign.
+ */
+ PredXact->CanPartialClearThrough = PredXact->LastSxactCommitSeqNo;
+ }
+ }
+ else
+ {
+ /*
+ * Read-only transactions: clear the list of transactions that might
+ * make us unsafe. Note that we use 'inLink' for the iteration as
+ * opposed to 'outLink' for the r/w xacts.
+ */
+ dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
+ {
+ RWConflict possibleUnsafeConflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+
+ Assert(!SxactIsReadOnly(possibleUnsafeConflict->sxactOut));
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactIn);
+
+ ReleaseRWConflict(possibleUnsafeConflict);
+ }
+ }
+
+ /* Check for conflict out to old committed transactions. */
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactHasSummaryConflictOut(MySerializableXact))
+ {
+ /*
+ * we don't know which old committed transaction we conflicted with,
+ * so be conservative and use FirstNormalSerCommitSeqNo here
+ */
+ MySerializableXact->SeqNo.earliestOutConflictCommit =
+ FirstNormalSerCommitSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ /*
+ * Release all outConflicts to committed transactions. If we're rolling
+ * back, clear them all. Set SXACT_FLAG_CONFLICT_OUT if any point to
+ * previously committed transactions.
+ */
+ dlist_foreach_modify(iter, &MySerializableXact->outConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, outLink, iter.cur);
+
+ if (isCommit
+ && !SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(conflict->sxactIn))
+ {
+ if ((MySerializableXact->flags & SXACT_FLAG_CONFLICT_OUT) == 0
+ || conflict->sxactIn->prepareSeqNo < MySerializableXact->SeqNo.earliestOutConflictCommit)
+ MySerializableXact->SeqNo.earliestOutConflictCommit = conflict->sxactIn->prepareSeqNo;
+ MySerializableXact->flags |= SXACT_FLAG_CONFLICT_OUT;
+ }
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactIn)
+ || (conflict->sxactIn->SeqNo.lastCommitBeforeSnapshot >= PredXact->LastSxactCommitSeqNo))
+ ReleaseRWConflict(conflict);
+ }
+
+ /*
+ * Release all inConflicts from committed and read-only transactions. If
+ * we're rolling back, clear them all.
+ */
+ dlist_foreach_modify(iter, &MySerializableXact->inConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+
+ if (!isCommit
+ || SxactIsCommitted(conflict->sxactOut)
+ || SxactIsReadOnly(conflict->sxactOut))
+ ReleaseRWConflict(conflict);
+ }
+
+ if (!topLevelIsDeclaredReadOnly)
+ {
+ /*
+ * Remove ourselves from the list of possible conflicts for concurrent
+ * READ ONLY transactions, flagging them as unsafe if we have a
+ * conflict out. If any are waiting DEFERRABLE transactions, wake them
+ * up if they are known safe or known unsafe.
+ */
+ dlist_foreach_modify(iter, &MySerializableXact->possibleUnsafeConflicts)
+ {
+ RWConflict possibleUnsafeConflict =
+ dlist_container(RWConflictData, outLink, iter.cur);
+
+ roXact = possibleUnsafeConflict->sxactIn;
+ Assert(MySerializableXact == possibleUnsafeConflict->sxactOut);
+ Assert(SxactIsReadOnly(roXact));
+
+ /* Mark conflicted if necessary. */
+ if (isCommit
+ && MyXactDidWrite
+ && SxactHasConflictOut(MySerializableXact)
+ && (MySerializableXact->SeqNo.earliestOutConflictCommit
+ <= roXact->SeqNo.lastCommitBeforeSnapshot))
+ {
+ /*
+ * This releases possibleUnsafeConflict (as well as all other
+ * possible conflicts for roXact)
+ */
+ FlagSxactUnsafe(roXact);
+ }
+ else
+ {
+ ReleaseRWConflict(possibleUnsafeConflict);
+
+ /*
+ * If we were the last possible conflict, flag it safe. The
+ * transaction can now safely release its predicate locks (but
+ * that transaction's backend has to do that itself).
+ */
+ if (dlist_is_empty(&roXact->possibleUnsafeConflicts))
+ roXact->flags |= SXACT_FLAG_RO_SAFE;
+ }
+
+ /*
+ * Wake up the process for a waiting DEFERRABLE transaction if we
+ * now know it's either safe or conflicted.
+ */
+ if (SxactIsDeferrableWaiting(roXact) &&
+ (SxactIsROUnsafe(roXact) || SxactIsROSafe(roXact)))
+ ProcSendSignal(roXact->pgprocno);
+ }
+ }
+
+ /*
+ * Check whether it's time to clean up old transactions. This can only be
+ * done when the last serializable transaction with the oldest xmin among
+ * serializable transactions completes. We then find the "new oldest"
+ * xmin and purge any transactions which finished before this transaction
+ * was launched.
+ *
+ * For parallel queries in read-only transactions, this function might run
+ * twice. We only release the reference on the first call.
+ */
+ needToClear = false;
+ if ((partiallyReleasing ||
+ !SxactIsPartiallyReleased(MySerializableXact)) &&
+ TransactionIdEquals(MySerializableXact->xmin,
+ PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ if (--(PredXact->SxactGlobalXminCount) == 0)
+ {
+ SetNewSxactGlobalXmin();
+ needToClear = true;
+ }
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+
+ /* Add this to the list of transactions to check for later cleanup. */
+ if (isCommit)
+ dlist_push_tail(FinishedSerializableTransactions,
+ &MySerializableXact->finishedLink);
+
+ /*
+ * If we're releasing a RO_SAFE transaction in parallel mode, we'll only
+ * partially release it. That's necessary because other backends may have
+ * a reference to it. The leader will release the SERIALIZABLEXACT itself
+ * at the end of the transaction after workers have stopped running.
+ */
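+ /* Rollbacks and early RO_SAFE releases are cleaned up immediately. */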
+ if (!isCommit)
+ ReleaseOneSerializableXact(MySerializableXact,
+ isReadOnlySafe && IsInParallelMode(),
+ false);
+
+ LWLockRelease(SerializableFinishedListLock);
+
+ if (needToClear)
+ ClearOldPredicateLocks();
+
+ ReleasePredicateLocksLocal();
+}
+
+static void
+ReleasePredicateLocksLocal(void)
+{
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+
+ /* Delete per-transaction lock table */
+ if (LocalPredicateLockHash != NULL)
+ {
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+ }
+}
+
+/*
+ * Clear old predicate locks belonging to committed transactions that are no
+ * longer interesting to any in-progress transaction.
+ */
+static void
+ClearOldPredicateLocks(void)
+{
+ dlist_mutable_iter iter;
+
+ /*
+ * Loop through finished transactions. They are in commit order, so we can
+ * stop as soon as we find one that's still interesting.
+ */
+ LWLockAcquire(SerializableFinishedListLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ dlist_foreach_modify(iter, FinishedSerializableTransactions)
+ {
+ SERIALIZABLEXACT *finishedSxact =
+ dlist_container(SERIALIZABLEXACT, finishedLink, iter.cur);
+
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin)
+ || TransactionIdPrecedesOrEquals(finishedSxact->finishedBefore,
+ PredXact->SxactGlobalXmin))
+ {
+ /*
+ * This transaction committed before any in-progress transaction
+ * took its snapshot. It's no longer interesting.
+ */
+ LWLockRelease(SerializableXactHashLock);
+ dlist_delete_thoroughly(&finishedSxact->finishedLink);
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else if (finishedSxact->commitSeqNo > PredXact->HavePartialClearedThrough
+ && finishedSxact->commitSeqNo <= PredXact->CanPartialClearThrough)
+ {
+ /*
+ * Any active transactions that took their snapshot before this
+ * transaction committed are read-only, so we can clear part of
+ * its state.
+ */
+ LWLockRelease(SerializableXactHashLock);
+
+ if (SxactIsReadOnly(finishedSxact))
+ {
+ /* A read-only transaction can be removed entirely */
+ dlist_delete_thoroughly(&(finishedSxact->finishedLink));
+ ReleaseOneSerializableXact(finishedSxact, false, false);
+ }
+ else
+ {
+ /*
+ * A read-write transaction can only be partially cleared. We
+ * need to keep the SERIALIZABLEXACT but can release the
+ * SIREAD locks and conflicts in.
+ */
+ ReleaseOneSerializableXact(finishedSxact, true, false);
+ }
+
+ PredXact->HavePartialClearedThrough = finishedSxact->commitSeqNo;
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ else
+ {
+ /* Still interesting. */
+ break;
+ }
+ }
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * Loop through predicate locks on dummy transaction for summarized data.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ dlist_foreach_modify(iter, &OldCommittedSxact->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, xactLink, iter.cur);
+ bool canDoPartialCleanup;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ canDoPartialCleanup = (predlock->commitSeqNo <= PredXact->CanPartialClearThrough);
+ LWLockRelease(SerializableXactHashLock);
+
+ /*
+ * If this lock originally belonged to an old enough transaction, we
+ * can release it.
+ */
+ if (canDoPartialCleanup)
+ {
+ PREDICATELOCKTAG tag;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ tag = predlock->tag;
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ dlist_delete(&(predlock->targetLink));
+ dlist_delete(&(predlock->xactLink));
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+ }
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+ LWLockRelease(SerializableFinishedListLock);
+}
+
+/*
+ * This is the normal way to delete anything from any of the predicate
+ * locking hash tables. Given a transaction which we know can be deleted:
+ * delete all predicate locks held by that transaction and any predicate
+ * lock targets which are now unreferenced by a lock; delete all conflicts
+ * for the transaction; delete all xid values for the transaction; then
+ * delete the transaction.
+ *
+ * When the partial flag is set, we can release all predicate locks and
+ * in-conflict information -- we've established that there are no longer
+ * any overlapping read write transactions for which this transaction could
+ * matter -- but keep the transaction entry itself and any outConflicts.
+ *
+ * When the summarize flag is set, we've run short of room for sxact data
+ * and must summarize to the SLRU. Predicate locks are transferred to a
+ * dummy "old" transaction, with duplicate locks on a single target
+ * collapsing to a single lock with the "latest" commitSeqNo from among
+ * the conflicting locks.
+ */
+static void
+ReleaseOneSerializableXact(SERIALIZABLEXACT *sxact, bool partial,
+ bool summarize)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ dlist_mutable_iter iter;
+
+ Assert(sxact != NULL);
+ Assert(SxactIsRolledBack(sxact) || SxactIsCommitted(sxact));
+ Assert(partial || !SxactIsOnFinishedList(sxact));
+ Assert(LWLockHeldByMe(SerializableFinishedListLock));
+
+ /*
+ * First release all the predicate locks held by this xact (or transfer
+ * them to OldCommittedSxact if summarize is true)
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&sxact->perXactPredicateListLock, LW_EXCLUSIVE);
+ dlist_foreach_modify(iter, &sxact->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, xactLink, iter.cur);
+ PREDICATELOCKTAG tag;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCKTARGETTAG targettag;
+ uint32 targettaghash;
+ LWLock *partitionLock;
+
+ tag = predlock->tag;
+ target = tag.myTarget;
+ targettag = target->tag;
+ targettaghash = PredicateLockTargetTagHashCode(&targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ dlist_delete(&predlock->targetLink);
+
+ hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_REMOVE, NULL);
+ if (summarize)
+ {
+ bool found;
+
+ /* Fold into dummy transaction list. */
+ tag.myXact = OldCommittedSxact;
+ predlock = hash_search_with_hash_value(PredicateLockHash, &tag,
+ PredicateLockHashCodeFromTargetHashCode(&tag,
+ targettaghash),
+ HASH_ENTER_NULL, &found);
+ if (!predlock)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory"),
+ errhint("You might need to increase %s.", "max_pred_locks_per_transaction")));
+ if (found)
+ {
+ Assert(predlock->commitSeqNo != 0);
+ Assert(predlock->commitSeqNo != InvalidSerCommitSeqNo);
+ if (predlock->commitSeqNo < sxact->commitSeqNo)
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ else
+ {
+ dlist_push_tail(&target->predicateLocks,
+ &predlock->targetLink);
+ dlist_push_tail(&OldCommittedSxact->predicateLocks,
+ &predlock->xactLink);
+ predlock->commitSeqNo = sxact->commitSeqNo;
+ }
+ }
+ else
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+
+ LWLockRelease(partitionLock);
+ }
+
+ /*
+ * Rather than retail removal, just re-init the head after we've run
+ * through the list.
+ */
+ dlist_init(&sxact->predicateLocks);
+
+ if (IsInParallelMode())
+ LWLockRelease(&sxact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ sxidtag.xid = sxact->topXid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Release all outConflicts (unless 'partial' is true) */
+ if (!partial)
+ {
+ dlist_foreach_modify(iter, &sxact->outConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, outLink, iter.cur);
+
+ if (summarize)
+ conflict->sxactIn->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ ReleaseRWConflict(conflict);
+ }
+ }
+
+ /* Release all inConflicts. */
+ dlist_foreach_modify(iter, &sxact->inConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+
+ if (summarize)
+ conflict->sxactOut->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ ReleaseRWConflict(conflict);
+ }
+
+ /* Finally, get rid of the xid and the record of the transaction itself. */
+ if (!partial)
+ {
+ if (sxidtag.xid != InvalidTransactionId)
+ hash_search(SerializableXidHash, &sxidtag, HASH_REMOVE, NULL);
+ ReleasePredXact(sxact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Tests whether the given top level transaction is concurrent with
+ * (overlaps) our current transaction.
+ *
+ * We need to identify the top level transaction for SSI, anyway, so pass
+ * that to this function to save the overhead of checking the snapshot's
+ * subxip array.
+ */
+static bool
+XidIsConcurrent(TransactionId xid)
+{
+ Snapshot snap;
+
+ Assert(TransactionIdIsValid(xid));
+ Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
+
+ snap = GetTransactionSnapshot();
+
+ if (TransactionIdPrecedes(xid, snap->xmin))
+ return false;
+
+ if (TransactionIdFollowsOrEquals(xid, snap->xmax))
+ return true;
+
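+ /* Between xmin and xmax: concurrent only if listed as still in progress. */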
+ return pg_lfind32(xid, snap->xip, snap->xcnt);
+}
+
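+/*
+ * CheckForSerializableConflictOutNeeded
+ * Returns whether a conflict-out check is needed when reading the given
+ * relation with the given snapshot, erroring out first if this transaction
+ * has already been doomed.
+ */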
+bool
+CheckForSerializableConflictOutNeeded(Relation relation, Snapshot snapshot)
+{
+ if (!SerializationNeededForRead(relation, snapshot))
+ return false;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ return true;
+}
+
+/*
+ * CheckForSerializableConflictOut
+ * A table AM is reading a tuple that has been modified. If it determines
+ * that the tuple version it is reading is not visible to us, it should
+ * pass in the top level xid of the transaction that created it.
+ * Otherwise, if it determines that it is visible to us but it has been
+ * deleted or there is a newer version available due to an update, it
+ * should pass in the top level xid of the modifying transaction.
+ *
+ * This function will check for overlap with our own transaction. If the given
+ * xid is also serializable and the transactions overlap (i.e., they cannot see
+ * each other's writes), then we have a conflict out.
+ */
+void
+CheckForSerializableConflictOut(Relation relation, TransactionId xid, Snapshot snapshot)
+{
+ SERIALIZABLEXIDTAG sxidtag;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+
+ if (!SerializationNeededForRead(relation, snapshot))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict out checking."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ Assert(TransactionIdIsValid(xid));
+
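+ /* A tuple written by our own top-level transaction can't conflict with us. */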
+ if (TransactionIdEquals(xid, GetTopTransactionIdIfAny()))
+ return;
+
+ /*
+ * Find sxact or summarized info for the top level xid.
+ */
+ sxidtag.xid = xid;
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ if (!sxid)
+ {
+ /*
+ * Transaction not found in "normal" SSI structures. Check whether it
+ * got pushed out to SLRU storage for "old committed" transactions.
+ */
+ SerCommitSeqNo conflictCommitSeqNo;
+
+ conflictCommitSeqNo = SerialGetMinConflictCommitSeqNo(xid);
+ if (conflictCommitSeqNo != 0)
+ {
+ if (conflictCommitSeqNo != InvalidSerCommitSeqNo
+ && (!SxactIsReadOnly(MySerializableXact)
+ || conflictCommitSeqNo
+ <= MySerializableXact->SeqNo.lastCommitBeforeSnapshot))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ if (SxactHasSummaryConflictIn(MySerializableXact)
+ || !dlist_is_empty(&MySerializableXact->inConflicts))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, with conflict out to old committed transaction %u.", xid),
+ errhint("The transaction might succeed if retried.")));
+
+ MySerializableXact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ }
+
+ /* It's not serializable or otherwise not important. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ sxact = sxid->myXact;
+ Assert(TransactionIdEquals(sxact->topXid, xid));
+ if (sxact == MySerializableXact || SxactIsDoomed(sxact))
+ {
+ /* Can't conflict with ourself or a transaction that will roll back. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * We have a conflict out to a transaction which has a conflict out to a
+ * summarized transaction. That summarized transaction must have
+ * committed first, and we can't tell when it committed in relation to our
+ * snapshot acquisition, so something needs to be canceled.
+ */
+ if (SxactHasSummaryConflictOut(sxact))
+ {
+ if (!SxactIsPrepared(sxact))
+ {
+ sxact->flags |= SXACT_FLAG_DOOMED;
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+ else
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to old pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ }
+
+ /*
+ * If this is a read-only transaction and the writing transaction has
+ * committed, and it doesn't have a rw-conflict to a transaction which
+ * committed before it, no conflict.
+ */
+ if (SxactIsReadOnly(MySerializableXact)
+ && SxactIsCommitted(sxact)
+ && !SxactHasSummaryConflictOut(sxact)
+ && (!SxactHasConflictOut(sxact)
+ || MySerializableXact->SeqNo.lastCommitBeforeSnapshot < sxact->SeqNo.earliestOutConflictCommit))
+ {
+ /* Read-only transaction will appear to run first. No conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (!XidIsConcurrent(xid))
+ {
+ /* This write was already in our snapshot; no conflict. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ if (RWConflictExists(MySerializableXact, sxact))
+ {
+ /* We don't want duplicate conflict records in the list. */
+ LWLockRelease(SerializableXactHashLock);
+ return;
+ }
+
+ /*
+ * Flag the conflict. But first, if this conflict creates a dangerous
+ * structure, ereport an error.
+ */
+ FlagRWConflict(MySerializableXact, sxact);
+ LWLockRelease(SerializableXactHashLock);
+}
+
+/*
+ * Check a particular target for rw-dependency conflict in. A subroutine of
+ * CheckForSerializableConflictIn().
+ */
+static void
+CheckTargetForConflictsIn(PREDICATELOCKTARGETTAG *targettag)
+{
+ uint32 targettaghash;
+ LWLock *partitionLock;
+ PREDICATELOCKTARGET *target;
+ PREDICATELOCK *mypredlock = NULL;
+ PREDICATELOCKTAG mypredlocktag;
+ dlist_mutable_iter iter;
+
+ Assert(MySerializableXact != InvalidSerializableXact);
+
+ /*
+ * The same hash and LW lock apply to the lock target and the lock itself.
+ */
+ targettaghash = PredicateLockTargetTagHashCode(targettag);
+ partitionLock = PredicateLockHashPartitionLock(targettaghash);
+ LWLockAcquire(partitionLock, LW_SHARED);
+ target = (PREDICATELOCKTARGET *)
+ hash_search_with_hash_value(PredicateLockTargetHash,
+ targettag, targettaghash,
+ HASH_FIND, NULL);
+ if (!target)
+ {
+ /* Nothing has this target locked; we're done here. */
+ LWLockRelease(partitionLock);
+ return;
+ }
+
+ /*
+ * Each lock for an overlapping transaction represents a conflict: a
+ * rw-dependency in to this transaction.
+ */
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+
+ dlist_foreach_modify(iter, &target->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, targetLink, iter.cur);
+ SERIALIZABLEXACT *sxact = predlock->tag.myXact;
+
+ if (sxact == MySerializableXact)
+ {
+ /*
+ * If we're getting a write lock on a tuple, we don't need a
+ * predicate (SIREAD) lock on the same tuple. We can safely remove
+ * our SIREAD lock, but we'll defer doing so until after the loop
+ * because that requires upgrading to an exclusive partition lock.
+ *
+ * We can't use this optimization within a subtransaction because
+ * the subtransaction could roll back, and we would be left
+ * without any lock at the top level.
+ */
+ if (!IsSubTransaction()
+ && GET_PREDICATELOCKTARGETTAG_OFFSET(*targettag))
+ {
+ mypredlock = predlock;
+ mypredlocktag = predlock->tag;
+ }
+ }
+ else if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Re-check after getting exclusive lock because the other
+ * transaction may have flagged a conflict.
+ */
+ if (!SxactIsDoomed(sxact)
+ && (!SxactIsCommitted(sxact)
+ || TransactionIdPrecedes(GetTransactionSnapshot()->xmin,
+ sxact->finishedBefore))
+ && !RWConflictExists(sxact, MySerializableXact))
+ {
+ FlagRWConflict(sxact, MySerializableXact);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ }
+ }
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+
+ /*
+ * If we found one of our own SIREAD locks to remove, remove it now.
+ *
+ * At this point our transaction already has a RowExclusiveLock on the
+ * relation, so we are OK to drop the predicate lock on the tuple, if
+ * found, without fearing that another write against the tuple will occur
+ * before the MVCC information makes it to the buffer.
+ */
+ if (mypredlock != NULL)
+ {
+ uint32 predlockhashcode;
+ PREDICATELOCK *rmpredlock;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+ if (IsInParallelMode())
+ LWLockAcquire(&MySerializableXact->perXactPredicateListLock, LW_EXCLUSIVE);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Remove the predicate lock from shared memory, if it wasn't removed
+ * while the locks were released. One way that could happen is from
+ * autovacuum cleaning up an index.
+ */
+ predlockhashcode = PredicateLockHashCodeFromTargetHashCode
+ (&mypredlocktag, targettaghash);
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_FIND, NULL);
+ if (rmpredlock != NULL)
+ {
+ Assert(rmpredlock == mypredlock);
+
+ dlist_delete(&(mypredlock->targetLink));
+ dlist_delete(&(mypredlock->xactLink));
+
+ rmpredlock = (PREDICATELOCK *)
+ hash_search_with_hash_value(PredicateLockHash,
+ &mypredlocktag,
+ predlockhashcode,
+ HASH_REMOVE, NULL);
+ Assert(rmpredlock == mypredlock);
+
+ RemoveTargetIfNoLongerUsed(target, targettaghash);
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ LWLockRelease(partitionLock);
+ if (IsInParallelMode())
+ LWLockRelease(&MySerializableXact->perXactPredicateListLock);
+ LWLockRelease(SerializablePredicateListLock);
+
+ if (rmpredlock != NULL)
+ {
+ /*
+ * Remove entry in local lock table if it exists. It's OK if it
+ * doesn't exist; that means the lock was transferred to a new
+ * target by a different backend.
+ */
+ hash_search_with_hash_value(LocalPredicateLockHash,
+ targettag, targettaghash,
+ HASH_REMOVE, NULL);
+
+ DecrementParentLocks(targettag);
+ }
+ }
+}
+
+/*
+ * CheckForSerializableConflictIn
+ * We are writing the given tuple. If that indicates a rw-conflict
+ * in from another serializable transaction, take appropriate action.
+ *
+ * Skip checking for any granularity for which a parameter is missing.
+ *
+ * A tuple update or delete is in conflict if we have a predicate lock
+ * against the relation or page in which the tuple exists, or against the
+ * tuple itself.
+ */
+void
+CheckForSerializableConflictIn(Relation relation, ItemPointer tid, BlockNumber blkno)
+{
+ PREDICATELOCKTARGETTAG targettag;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /* Check if someone else has already decided that we need to die */
+ if (SxactIsDoomed(MySerializableXact))
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during conflict in checking."),
+ errhint("The transaction might succeed if retried.")));
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ /*
+ * It is important that we check for locks from the finest granularity to
+ * the coarsest granularity, so that granularity promotion doesn't cause
+ * us to miss a lock. The new (coarser) lock will be acquired before the
+ * old (finer) locks are released.
+ *
+ * It is not possible to take and hold a lock across the checks for all
+ * granularities because each target could be in a separate partition.
+ */
+ if (tid != NULL)
+ {
+ SET_PREDICATELOCKTARGETTAG_TUPLE(targettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ if (blkno != InvalidBlockNumber)
+ {
+ SET_PREDICATELOCKTARGETTAG_PAGE(targettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id,
+ blkno);
+ CheckTargetForConflictsIn(&targettag);
+ }
+
+ SET_PREDICATELOCKTARGETTAG_RELATION(targettag,
+ relation->rd_locator.dbOid,
+ relation->rd_id);
+ CheckTargetForConflictsIn(&targettag);
+}
+
+/*
+ * CheckTableForSerializableConflictIn
+ * The entire table is going through a DDL-style logical mass delete
+ * like TRUNCATE or DROP TABLE. If that causes a rw-conflict in from
+ * another serializable transaction, take appropriate action.
+ *
+ * While these operations do not operate entirely within the bounds of
+ * snapshot isolation, they can occur inside a serializable transaction, and
+ * will logically occur after any reads which saw rows which were destroyed
+ * by these operations, so we do what we can to serialize properly under
+ * SSI.
+ *
+ * The relation passed in must be a heap relation. Any predicate lock of any
+ * granularity on the heap will cause a rw-conflict in to this transaction.
+ * Predicate locks on indexes do not matter because they only exist to guard
+ * against conflicting inserts into the index, and this is a mass *delete*.
+ * When a table is truncated or dropped, the index will also be truncated
+ * or dropped, and we'll deal with locks on the index when that happens.
+ *
+ * Dropping or truncating a table also needs to drop any existing predicate
+ * locks on heap tuples or pages, because they're about to go away. This
+ * should be done before altering the predicate locks because the transaction
+ * could be rolled back because of a conflict, in which case the lock changes
+ * are not needed. (At the moment, we don't actually bother to drop the
+ * existing locks on a dropped or truncated table. That might
+ * lead to some false positives, but it doesn't seem worth the trouble.)
+ */
+void
+CheckTableForSerializableConflictIn(Relation relation)
+{
+ HASH_SEQ_STATUS seqstat;
+ PREDICATELOCKTARGET *target;
+ Oid dbId;
+ Oid heapId;
+ int i;
+
+ /*
+ * Bail out quickly if there are no serializable transactions running.
+ * It's safe to check this without taking locks because the caller is
+ * holding an ACCESS EXCLUSIVE lock on the relation. No new locks which
+ * would matter here can be acquired while that is held.
+ */
+ if (!TransactionIdIsValid(PredXact->SxactGlobalXmin))
+ return;
+
+ if (!SerializationNeededForWrite(relation))
+ return;
+
+ /*
+ * We're doing a write which might cause rw-conflicts now or later.
+ * Memorize that fact.
+ */
+ MyXactDidWrite = true;
+
+ Assert(relation->rd_index == NULL); /* not an index relation */
+
+ dbId = relation->rd_locator.dbOid;
+ heapId = relation->rd_id;
+
+ LWLockAcquire(SerializablePredicateListLock, LW_EXCLUSIVE);
+ for (i = 0; i < NUM_PREDICATELOCK_PARTITIONS; i++)
+ LWLockAcquire(PredicateLockHashPartitionLockByIndex(i), LW_SHARED);
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /* Scan through target list */
+ hash_seq_init(&seqstat, PredicateLockTargetHash);
+
+ while ((target = (PREDICATELOCKTARGET *) hash_seq_search(&seqstat)))
+ {
+ dlist_mutable_iter iter;
+
+ /*
+ * Check whether this is a target which needs attention.
+ */
+ if (GET_PREDICATELOCKTARGETTAG_RELATION(target->tag) != heapId)
+ continue; /* wrong relation id */
+ if (GET_PREDICATELOCKTARGETTAG_DB(target->tag) != dbId)
+ continue; /* wrong database id */
+
+ /*
+ * Loop through locks for this target and flag conflicts.
+ */
+ dlist_foreach_modify(iter, &target->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, targetLink, iter.cur);
+
+ if (predlock->tag.myXact != MySerializableXact
+ && !RWConflictExists(predlock->tag.myXact, MySerializableXact))
+ {
+ FlagRWConflict(predlock->tag.myXact, MySerializableXact);
+ }
+ }
+ }
+
+ /* Release locks in reverse order */
+ LWLockRelease(SerializableXactHashLock);
+ for (i = NUM_PREDICATELOCK_PARTITIONS - 1; i >= 0; i--)
+ LWLockRelease(PredicateLockHashPartitionLockByIndex(i));
+ LWLockRelease(SerializablePredicateListLock);
+}
+
+
+/*
+ * Flag a rw-dependency between two serializable transactions.
+ *
+ * The caller is responsible for ensuring that we have a LW lock on
+ * the transaction hash table.
+ */
+static void
+FlagRWConflict(SERIALIZABLEXACT *reader, SERIALIZABLEXACT *writer)
+{
+ Assert(reader != writer);
+
+ /* First, see if this conflict causes failure. */
+ OnConflict_CheckForSerializationFailure(reader, writer);
+
+ /* Actually do the conflict flagging. */
+ if (reader == OldCommittedSxact)
+ writer->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ else if (writer == OldCommittedSxact)
+ reader->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+ else
+ SetRWConflict(reader, writer);
+}
+
+/*----------------------------------------------------------------------------
+ * We are about to add a RW-edge to the dependency graph - check that we don't
+ * introduce a dangerous structure by doing so, and abort one of the
+ * transactions if so.
+ *
+ * A serialization failure can only occur if there is a dangerous structure
+ * in the dependency graph:
+ *
+ * Tin ------> Tpivot ------> Tout
+ * rw rw
+ *
+ * Furthermore, Tout must commit first.
+ *
+ * One more optimization is that if Tin is declared READ ONLY (or commits
+ * without writing), we can only have a problem if Tout committed before Tin
+ * acquired its snapshot.
+ *----------------------------------------------------------------------------
+ */
+static void
+OnConflict_CheckForSerializationFailure(const SERIALIZABLEXACT *reader,
+ SERIALIZABLEXACT *writer)
+{
+ bool failure;
+
+ Assert(LWLockHeldByMe(SerializableXactHashLock));
+
+ failure = false;
+
+ /*------------------------------------------------------------------------
+ * Check for already-committed writer with rw-conflict out flagged
+ * (conflict-flag on W means that T2 committed before W):
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * That is a dangerous structure, so we must abort. (Since the writer
+ * has already committed, we must be the reader)
+ *------------------------------------------------------------------------
+ */
+ if (SxactIsCommitted(writer)
+ && (SxactHasConflictOut(writer) || SxactHasSummaryConflictOut(writer)))
+ failure = true;
+
+ /*------------------------------------------------------------------------
+ * Check whether the writer has become a pivot with an out-conflict
+ * committed transaction (T2), and T2 committed first:
+ *
+ * R ------> W ------> T2
+ * rw rw
+ *
+ * Because T2 must've committed first, there is no anomaly if:
+ * - the reader committed before T2
+ * - the writer committed before T2
+ * - the reader is a READ ONLY transaction and the reader was concurrent
+ * with T2 (= reader acquired its snapshot before T2 committed)
+ *
+ * We also handle the case that T2 is prepared but not yet committed
+ * here. In that case T2 has already checked for conflicts, so if it
+ * commits first, making the above conflict real, it's too late for it
+ * to abort.
+ *------------------------------------------------------------------------
+ */
+ if (!failure && SxactHasSummaryConflictOut(writer))
+ failure = true;
+ else if (!failure)
+ {
+ dlist_iter iter;
+
+ dlist_foreach(iter, &writer->outConflicts)
+ {
+ RWConflict conflict =
+ dlist_container(RWConflictData, outLink, iter.cur);
+ SERIALIZABLEXACT *t2 = conflict->sxactIn;
+
+ if (SxactIsPrepared(t2)
+ && (!SxactIsCommitted(reader)
+ || t2->prepareSeqNo <= reader->commitSeqNo)
+ && (!SxactIsCommitted(writer)
+ || t2->prepareSeqNo <= writer->commitSeqNo)
+ && (!SxactIsReadOnly(reader)
+ || t2->prepareSeqNo <= reader->SeqNo.lastCommitBeforeSnapshot))
+ {
+ failure = true;
+ break;
+ }
+ }
+ }
+
+ /*------------------------------------------------------------------------
+ * Check whether the reader has become a pivot with a writer
+ * that's committed (or prepared):
+ *
+ * T0 ------> R ------> W
+ * rw rw
+ *
+ * Because W must've committed first for an anomaly to occur, there is no
+ * anomaly if:
+ * - T0 committed before the writer
+ * - T0 is READ ONLY, and overlaps the writer
+ *------------------------------------------------------------------------
+ */
+ if (!failure && SxactIsPrepared(writer) && !SxactIsReadOnly(reader))
+ {
+ if (SxactHasSummaryConflictIn(reader))
+ {
+ failure = true;
+ }
+ else
+ {
+ dlist_iter iter;
+
+ /*
+ * The unconstify is needed as we have no const version of
+ * dlist_foreach().
+ */
+ dlist_foreach(iter, &unconstify(SERIALIZABLEXACT *, reader)->inConflicts)
+ {
+ const RWConflict conflict =
+ dlist_container(RWConflictData, inLink, iter.cur);
+ const SERIALIZABLEXACT *t0 = conflict->sxactOut;
+
+ if (!SxactIsDoomed(t0)
+ && (!SxactIsCommitted(t0)
+ || t0->commitSeqNo >= writer->prepareSeqNo)
+ && (!SxactIsReadOnly(t0)
+ || t0->SeqNo.lastCommitBeforeSnapshot >= writer->prepareSeqNo))
+ {
+ failure = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (failure)
+ {
+ /*
+ * We have to kill a transaction to avoid a possible anomaly from
+ * occurring. If the writer is us, we can just ereport() to cause a
+ * transaction abort. Otherwise we flag the writer for termination,
+ * causing it to abort when it tries to commit. However, if the writer
+ * has already prepared, we can't abort it anymore, so we have to kill
+ * the reader instead.
+ */
+ if (MySerializableXact == writer)
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during write."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ else if (SxactIsPrepared(writer))
+ {
+ LWLockRelease(SerializableXactHashLock);
+
+ /* if we're not the writer, we have to be the reader */
+ Assert(MySerializableXact == reader);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on conflict out to pivot %u, during read.", writer->topXid),
+ errhint("The transaction might succeed if retried.")));
+ }
+ writer->flags |= SXACT_FLAG_DOOMED;
+ }
+}
+
+/*
+ * PreCommit_CheckForSerializationFailure
+ * Check for dangerous structures in a serializable transaction
+ * at commit.
+ *
+ * We're checking for a dangerous structure as each conflict is recorded.
+ * The only way we could have a problem at commit is if this is the "out"
+ * side of a pivot, and neither the "in" side nor the pivot has yet
+ * committed.
+ *
+ * If a dangerous structure is found, the pivot (the near conflict) is
+ * marked for death, because rolling back another transaction might mean
+ * that we fail without ever making progress. This transaction is
+ * committing writes, so letting it commit ensures progress. If we
+ * canceled the far conflict, it might immediately fail again on retry.
+ */
+void
+PreCommit_CheckForSerializationFailure(void)
+{
+ dlist_iter near_iter;
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(IsolationIsSerializable());
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+
+ /*
+ * Check if someone else has already decided that we need to die. Since
+ * we set our own DOOMED flag when partially releasing, ignore in that
+ * case.
+ */
+ if (SxactIsDoomed(MySerializableXact) &&
+ !SxactIsPartiallyReleased(MySerializableXact))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on identification as a pivot, during commit attempt."),
+ errhint("The transaction might succeed if retried.")));
+ }
+
+ dlist_foreach(near_iter, &MySerializableXact->inConflicts)
+ {
+ RWConflict nearConflict =
+ dlist_container(RWConflictData, inLink, near_iter.cur);
+
+ if (!SxactIsCommitted(nearConflict->sxactOut)
+ && !SxactIsDoomed(nearConflict->sxactOut))
+ {
+ dlist_iter far_iter;
+
+ dlist_foreach(far_iter, &nearConflict->sxactOut->inConflicts)
+ {
+ RWConflict farConflict =
+ dlist_container(RWConflictData, inLink, far_iter.cur);
+
+ if (farConflict->sxactOut == MySerializableXact
+ || (!SxactIsCommitted(farConflict->sxactOut)
+ && !SxactIsReadOnly(farConflict->sxactOut)
+ && !SxactIsDoomed(farConflict->sxactOut)))
+ {
+ /*
+ * Normally, we kill the pivot transaction to make sure we
+ * make progress if the failing transaction is retried.
+ * However, we can't kill it if it's already prepared, so
+ * in that case we commit suicide instead.
+ */
+ if (SxactIsPrepared(nearConflict->sxactOut))
+ {
+ LWLockRelease(SerializableXactHashLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
+ errmsg("could not serialize access due to read/write dependencies among transactions"),
+ errdetail_internal("Reason code: Canceled on commit attempt with conflict in from prepared pivot."),
+ errhint("The transaction might succeed if retried.")));
+ }
+ nearConflict->sxactOut->flags |= SXACT_FLAG_DOOMED;
+ break;
+ }
+ }
+ }
+ }
+
+ MySerializableXact->prepareSeqNo = ++(PredXact->LastSxactCommitSeqNo);
+ MySerializableXact->flags |= SXACT_FLAG_PREPARED;
+
+ LWLockRelease(SerializableXactHashLock);
+}
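
The errhint used above ("The transaction might succeed if retried.") is the contract these
checks offer to applications: serialization failures are transient, and the same transaction
can simply be re-run. A minimal client-side sketch of that retry loop, using libpq and a
hypothetical "accounts" table (none of this is part of predicate.c), might look like:

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>
    #include <libpq-fe.h>

    /* Run one statement; report success and copy the 5-character SQLSTATE, if any. */
    static bool
    exec_ok(PGconn *conn, const char *sql, char sqlstate[6])
    {
        PGresult   *res = PQexec(conn, sql);
        ExecStatusType st = PQresultStatus(res);
        const char *code = PQresultErrorField(res, PG_DIAG_SQLSTATE);

        sqlstate[0] = '\0';
        if (code != NULL)
        {
            strncpy(sqlstate, code, 5);
            sqlstate[5] = '\0';
        }
        PQclear(res);
        return st == PGRES_COMMAND_OK || st == PGRES_TUPLES_OK;
    }

    int
    main(void)
    {
        PGconn     *conn = PQconnectdb("");     /* connection settings from environment */
        char        sqlstate[6];
        char        dummy[6];

        for (int attempt = 1; attempt <= 5; attempt++)
        {
            if (exec_ok(conn, "BEGIN ISOLATION LEVEL SERIALIZABLE", sqlstate) &&
                exec_ok(conn, "UPDATE accounts SET balance = balance - 1 WHERE id = 1",
                        sqlstate) &&
                exec_ok(conn, "COMMIT", sqlstate))  /* SQLSTATE 40001 can be raised here too */
            {
                printf("committed on attempt %d\n", attempt);
                break;
            }

            exec_ok(conn, "ROLLBACK", dummy);       /* clean up the failed attempt */
            if (strcmp(sqlstate, "40001") != 0)     /* not a serialization failure */
            {
                fprintf(stderr, "giving up: SQLSTATE %s\n", sqlstate);
                break;
            }
        }
        PQfinish(conn);
        return 0;
    }
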
+
+/*------------------------------------------------------------------------*/
+
+/*
+ * Two-phase commit support
+ */
+
+/*
+ * AtPrepare_PredicateLocks
+ * Do the preparatory work for a PREPARE: make 2PC state file
+ * records for all predicate locks currently held.
+ */
+void
+AtPrepare_PredicateLocks(void)
+{
+ SERIALIZABLEXACT *sxact;
+ TwoPhasePredicateRecord record;
+ TwoPhasePredicateXactRecord *xactRecord;
+ TwoPhasePredicateLockRecord *lockRecord;
+ dlist_iter iter;
+
+ sxact = MySerializableXact;
+ xactRecord = &(record.data.xactRecord);
+ lockRecord = &(record.data.lockRecord);
+
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ /* Generate an xact record for our SERIALIZABLEXACT */
+ record.type = TWOPHASEPREDICATERECORD_XACT;
+ xactRecord->xmin = MySerializableXact->xmin;
+ xactRecord->flags = MySerializableXact->flags;
+
+ /*
+ * Note that we don't include our out-conflict list in the statefile,
+ * because new conflicts can be added even after the transaction
+ * prepares. We'll just make a conservative assumption during recovery
+ * instead.
+ */
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+
+ /*
+ * Generate a lock record for each lock.
+ *
+ * To do this, we need to walk the predicate lock list in our sxact rather
+ * than using the local predicate lock table because the latter is not
+ * guaranteed to be accurate.
+ */
+ LWLockAcquire(SerializablePredicateListLock, LW_SHARED);
+
+ /*
+ * No need to take sxact->perXactPredicateListLock in parallel mode
+ * because there cannot be any parallel workers running while we are
+ * preparing a transaction.
+ */
+ Assert(!IsParallelWorker() && !ParallelContextActive());
+
+ dlist_foreach(iter, &sxact->predicateLocks)
+ {
+ PREDICATELOCK *predlock =
+ dlist_container(PREDICATELOCK, xactLink, iter.cur);
+
+ record.type = TWOPHASEPREDICATERECORD_LOCK;
+ lockRecord->target = predlock->tag.myTarget->tag;
+
+ RegisterTwoPhaseRecord(TWOPHASE_RM_PREDICATELOCK_ID, 0,
+ &record, sizeof(record));
+ }
+
+ LWLockRelease(SerializablePredicateListLock);
+}
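
A minimal sketch of the record stream this function produces, using simplified stand-ins for
the TwoPhasePredicate* types (the real definitions live in predicate_internals.h; the field
names below are illustrative only): a single per-transaction record, then one record per held
predicate lock, with the conflict lists deliberately omitted.

    #include <stdint.h>
    #include <stdio.h>

    typedef enum { REC_XACT, REC_LOCK } rec_type;

    typedef struct
    {
        rec_type    type;
        union
        {
            struct { uint32_t xmin; uint32_t flags; } xact; /* one per transaction */
            struct { uint64_t target_hash; } lock;          /* one per predicate lock */
        }           data;
    } sketch_record;

    /* Emit the same sequence AtPrepare_PredicateLocks() registers with the
     * two-phase machinery: one XACT record, then a LOCK record per lock. */
    static void
    emit_prepare_records(uint32_t xmin, uint32_t flags,
                         const uint64_t *lock_hashes, int nlocks,
                         void (*write_rec) (const sketch_record *))
    {
        sketch_record rec;

        rec.type = REC_XACT;
        rec.data.xact.xmin = xmin;
        rec.data.xact.flags = flags;
        write_rec(&rec);        /* conflict lists are intentionally not recorded */

        for (int i = 0; i < nlocks; i++)
        {
            rec.type = REC_LOCK;
            rec.data.lock.target_hash = lock_hashes[i];
            write_rec(&rec);
        }
    }

    static void
    print_rec(const sketch_record *rec)
    {
        if (rec->type == REC_XACT)
            printf("XACT xmin=%u flags=%u\n",
                   (unsigned) rec->data.xact.xmin, (unsigned) rec->data.xact.flags);
        else
            printf("LOCK target=%llu\n",
                   (unsigned long long) rec->data.lock.target_hash);
    }

    int
    main(void)
    {
        uint64_t    locks[] = {101, 102, 103};

        emit_prepare_records(12345, 0, locks, 3, print_rec);
        return 0;
    }
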
+
+/*
+ * PostPrepare_PredicateLocks
+ * Clean up after successful PREPARE. Unlike the non-predicate
+ * lock manager, we do not need to transfer locks to a dummy
+ * PGPROC because our SERIALIZABLEXACT will stay around
+ * anyway. We only need to clean up our local state.
+ */
+void
+PostPrepare_PredicateLocks(TransactionId xid)
+{
+ if (MySerializableXact == InvalidSerializableXact)
+ return;
+
+ Assert(SxactIsPrepared(MySerializableXact));
+
+ MySerializableXact->pid = 0;
+ MySerializableXact->pgprocno = INVALID_PGPROCNO;
+
+ hash_destroy(LocalPredicateLockHash);
+ LocalPredicateLockHash = NULL;
+
+ MySerializableXact = InvalidSerializableXact;
+ MyXactDidWrite = false;
+}
+
+/*
+ * PredicateLockTwoPhaseFinish
+ * Release a prepared transaction's predicate locks once it
+ * commits or aborts.
+ */
+void
+PredicateLockTwoPhaseFinish(TransactionId xid, bool isCommit)
+{
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+
+ sxidtag.xid = xid;
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ /* xid will not be found if it wasn't a serializable transaction */
+ if (sxid == NULL)
+ return;
+
+ /* Release its locks */
+ MySerializableXact = sxid->myXact;
+ MyXactDidWrite = true; /* conservatively assume that we wrote
+ * something */
+ ReleasePredicateLocks(isCommit, false);
+}
+
+/*
+ * Re-acquire a predicate lock belonging to a transaction that was prepared.
+ */
+void
+predicatelock_twophase_recover(TransactionId xid, uint16 info,
+ void *recdata, uint32 len)
+{
+ TwoPhasePredicateRecord *record;
+
+ Assert(len == sizeof(TwoPhasePredicateRecord));
+
+ record = (TwoPhasePredicateRecord *) recdata;
+
+ Assert((record->type == TWOPHASEPREDICATERECORD_XACT) ||
+ (record->type == TWOPHASEPREDICATERECORD_LOCK));
+
+ if (record->type == TWOPHASEPREDICATERECORD_XACT)
+ {
+ /* Per-transaction record. Set up a SERIALIZABLEXACT. */
+ TwoPhasePredicateXactRecord *xactRecord;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXIDTAG sxidtag;
+ bool found;
+
+ xactRecord = (TwoPhasePredicateXactRecord *) &record->data.xactRecord;
+
+ LWLockAcquire(SerializableXactHashLock, LW_EXCLUSIVE);
+ sxact = CreatePredXact();
+ if (!sxact)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+
+ /* vxid for a prepared xact is InvalidBackendId/xid; no pid */
+ sxact->vxid.backendId = InvalidBackendId;
+ sxact->vxid.localTransactionId = (LocalTransactionId) xid;
+ sxact->pid = 0;
+ sxact->pgprocno = INVALID_PGPROCNO;
+
+ /* a prepared xact hasn't committed yet */
+ sxact->prepareSeqNo = RecoverySerCommitSeqNo;
+ sxact->commitSeqNo = InvalidSerCommitSeqNo;
+ sxact->finishedBefore = InvalidTransactionId;
+
+ sxact->SeqNo.lastCommitBeforeSnapshot = RecoverySerCommitSeqNo;
+
+ /*
+ * Don't need to track this; no transactions running at the time the
+ * recovered xact started are still active, except possibly other
+ * prepared xacts, and we don't care whether those are RO_SAFE or not.
+ */
+ dlist_init(&(sxact->possibleUnsafeConflicts));
+
+ dlist_init(&(sxact->predicateLocks));
+ dlist_node_init(&sxact->finishedLink);
+
+ sxact->topXid = xid;
+ sxact->xmin = xactRecord->xmin;
+ sxact->flags = xactRecord->flags;
+ Assert(SxactIsPrepared(sxact));
+ if (!SxactIsReadOnly(sxact))
+ {
+ ++(PredXact->WritableSxactCount);
+ Assert(PredXact->WritableSxactCount <=
+ (MaxBackends + max_prepared_xacts));
+ }
+
+ /*
+ * We don't know whether the transaction had any conflicts or not, so
+ * we'll conservatively assume that it had both a conflict in and a
+ * conflict out, and represent that with the summary conflict flags.
+ */
+ dlist_init(&(sxact->outConflicts));
+ dlist_init(&(sxact->inConflicts));
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_IN;
+ sxact->flags |= SXACT_FLAG_SUMMARY_CONFLICT_OUT;
+
+ /* Register the transaction's xid */
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *) hash_search(SerializableXidHash,
+ &sxidtag,
+ HASH_ENTER, &found);
+ Assert(sxid != NULL);
+ Assert(!found);
+ sxid->myXact = (SERIALIZABLEXACT *) sxact;
+
+ /*
+ * Update global xmin. Note that this is a special case compared to
+ * registering a normal transaction, because the global xmin might go
+ * backwards. That's OK, because until recovery is over we're not
+ * going to complete any transactions or create any non-prepared
+ * transactions, so there's no danger of throwing away information
+ * that is still needed.
+ */
+ if ((!TransactionIdIsValid(PredXact->SxactGlobalXmin)) ||
+ (TransactionIdFollows(PredXact->SxactGlobalXmin, sxact->xmin)))
+ {
+ PredXact->SxactGlobalXmin = sxact->xmin;
+ PredXact->SxactGlobalXminCount = 1;
+ SerialSetActiveSerXmin(sxact->xmin);
+ }
+ else if (TransactionIdEquals(sxact->xmin, PredXact->SxactGlobalXmin))
+ {
+ Assert(PredXact->SxactGlobalXminCount > 0);
+ PredXact->SxactGlobalXminCount++;
+ }
+
+ LWLockRelease(SerializableXactHashLock);
+ }
+ else if (record->type == TWOPHASEPREDICATERECORD_LOCK)
+ {
+ /* Lock record. Recreate the PREDICATELOCK */
+ TwoPhasePredicateLockRecord *lockRecord;
+ SERIALIZABLEXID *sxid;
+ SERIALIZABLEXACT *sxact;
+ SERIALIZABLEXIDTAG sxidtag;
+ uint32 targettaghash;
+
+ lockRecord = (TwoPhasePredicateLockRecord *) &record->data.lockRecord;
+ targettaghash = PredicateLockTargetTagHashCode(&lockRecord->target);
+
+ LWLockAcquire(SerializableXactHashLock, LW_SHARED);
+ sxidtag.xid = xid;
+ sxid = (SERIALIZABLEXID *)
+ hash_search(SerializableXidHash, &sxidtag, HASH_FIND, NULL);
+ LWLockRelease(SerializableXactHashLock);
+
+ Assert(sxid != NULL);
+ sxact = sxid->myXact;
+ Assert(sxact != InvalidSerializableXact);
+
+ CreatePredicateLock(&lockRecord->target, targettaghash, sxact);
+ }
+}
+
+/*
+ * Prepare to share the current SERIALIZABLEXACT with parallel workers.
+ * Return a handle object that can be used by AttachSerializableXact() in a
+ * parallel worker.
+ */
+SerializableXactHandle
+ShareSerializableXact(void)
+{
+ return MySerializableXact;
+}
+
+/*
+ * Allow parallel workers to import the leader's SERIALIZABLEXACT.
+ */
+void
+AttachSerializableXact(SerializableXactHandle handle)
+{
+
+ Assert(MySerializableXact == InvalidSerializableXact);
+
+ MySerializableXact = (SERIALIZABLEXACT *) handle;
+ if (MySerializableXact != InvalidSerializableXact)
+ CreateLocalPredicateLockHash();
+}
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
new file mode 100644
index 0000000..e9e445b
--- /dev/null
+++ b/src/backend/storage/lmgr/proc.c
@@ -0,0 +1,1897 @@
+/*-------------------------------------------------------------------------
+ *
+ * proc.c
+ * routines to manage per-process shared memory data structure
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/proc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * Interface (a):
+ * ProcSleep(), ProcWakeup(),
+ *
+ * Waiting for a lock causes the backend to be put to sleep. Whoever releases
+ * the lock wakes the process up again (and gives it an error code so it knows
+ * whether it was awoken on an error condition).
+ *
+ * Interface (b):
+ *
+ * ProcReleaseLocks -- frees the locks associated with current transaction
+ *
+ * ProcKill -- destroys the shared memory state (and locks)
+ * associated with the process.
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/time.h>
+
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xlogutils.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "replication/slot.h"
+#include "replication/syncrep.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/lmgr.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/spin.h"
+#include "storage/standby.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/* GUC variables */
+int DeadlockTimeout = 1000;
+int StatementTimeout = 0;
+int LockTimeout = 0;
+int IdleInTransactionSessionTimeout = 0;
+int IdleSessionTimeout = 0;
+bool log_lock_waits = false;
+
+/* Pointer to this process's PGPROC struct, if any */
+PGPROC *MyProc = NULL;
+
+/*
+ * This spinlock protects the freelist of recycled PGPROC structures.
+ * We cannot use an LWLock because the LWLock manager depends on already
+ * having a PGPROC and a wait semaphore! But these structures are touched
+ * relatively infrequently (only at backend startup or shutdown) and not for
+ * very long, so a spinlock is okay.
+ */
+NON_EXEC_STATIC slock_t *ProcStructLock = NULL;
+
+/* Pointers to shared-memory structures */
+PROC_HDR *ProcGlobal = NULL;
+NON_EXEC_STATIC PGPROC *AuxiliaryProcs = NULL;
+PGPROC *PreparedXactProcs = NULL;
+
+/* If we are waiting for a lock, this points to the associated LOCALLOCK */
+static LOCALLOCK *lockAwaited = NULL;
+
+static DeadLockState deadlock_state = DS_NOT_YET_CHECKED;
+
+/* Is a deadlock check pending? */
+static volatile sig_atomic_t got_deadlock_timeout;
+
+static void RemoveProcFromArray(int code, Datum arg);
+static void ProcKill(int code, Datum arg);
+static void AuxiliaryProcKill(int code, Datum arg);
+static void CheckDeadLock(void);
+
+
+/*
+ * Report shared-memory space needed by InitProcGlobal.
+ */
+Size
+ProcGlobalShmemSize(void)
+{
+ Size size = 0;
+ Size TotalProcs =
+ add_size(MaxBackends, add_size(NUM_AUXILIARY_PROCS, max_prepared_xacts));
+
+ /* ProcGlobal */
+ size = add_size(size, sizeof(PROC_HDR));
+ size = add_size(size, mul_size(TotalProcs, sizeof(PGPROC)));
+ size = add_size(size, sizeof(slock_t));
+
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->xids)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->subxidStates)));
+ size = add_size(size, mul_size(TotalProcs, sizeof(*ProcGlobal->statusFlags)));
+
+ return size;
+}
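
The add_size()/mul_size() helpers used above (defined elsewhere in the backend, in shmem.c)
exist to make this arithmetic overflow-safe. A standalone sketch of the same pattern, with
simplified stand-ins and purely illustrative numbers (the real sizeof(PGPROC) and
NUM_AUXILIARY_PROCS vary by version and platform):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Simplified stand-ins for the backend's add_size()/mul_size(), which
     * ereport() on overflow rather than exiting. */
    static size_t
    add_size_checked(size_t a, size_t b)
    {
        if (SIZE_MAX - a < b)
        {
            fprintf(stderr, "requested shared memory size overflows size_t\n");
            exit(1);
        }
        return a + b;
    }

    static size_t
    mul_size_checked(size_t a, size_t b)
    {
        if (b != 0 && a > SIZE_MAX / b)
        {
            fprintf(stderr, "requested shared memory size overflows size_t\n");
            exit(1);
        }
        return a * b;
    }

    int
    main(void)
    {
        /* Illustrative settings only. */
        size_t      max_backends = 128;
        size_t      num_aux_procs = 5;
        size_t      max_prepared_xacts = 10;
        size_t      assumed_pgproc_size = 880;  /* not the real sizeof(PGPROC) */

        size_t      total_procs = add_size_checked(max_backends,
                                                   add_size_checked(num_aux_procs,
                                                                    max_prepared_xacts));
        size_t      size = mul_size_checked(total_procs, assumed_pgproc_size);

        printf("%zu PGPROCs, %zu bytes for the array\n", total_procs, size);
        return 0;
    }
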
+
+/*
+ * Report number of semaphores needed by InitProcGlobal.
+ */
+int
+ProcGlobalSemas(void)
+{
+ /*
+ * We need a sema per backend (including autovacuum), plus one for each
+ * auxiliary process.
+ */
+ return MaxBackends + NUM_AUXILIARY_PROCS;
+}
+
+/*
+ * InitProcGlobal -
+ * Initialize the global process table during postmaster or standalone
+ * backend startup.
+ *
+ * We also create all the per-process semaphores we will need to support
+ * the requested number of backends. We used to allocate semaphores
+ * only when backends were actually started up, but that is bad because
+ * it lets Postgres fail under load --- a lot of Unix systems are
+ * (mis)configured with small limits on the number of semaphores, and
+ * running out when trying to start another backend is a common failure.
+ * So, now we grab enough semaphores to support the desired max number
+ * of backends immediately at initialization --- if the sysadmin has set
+ * MaxConnections, max_worker_processes, max_wal_senders, or
+ * autovacuum_max_workers higher than his kernel will support, he'll
+ * find out sooner rather than later.
+ *
+ * Another reason for creating semaphores here is that the semaphore
+ * implementation typically requires us to create semaphores in the
+ * postmaster, not in backends.
+ *
+ * Note: this is NOT called by individual backends under a postmaster,
+ * not even in the EXEC_BACKEND case. The ProcGlobal and AuxiliaryProcs
+ * pointers must be propagated specially for EXEC_BACKEND operation.
+ */
+void
+InitProcGlobal(void)
+{
+ PGPROC *procs;
+ int i,
+ j;
+ bool found;
+ uint32 TotalProcs = MaxBackends + NUM_AUXILIARY_PROCS + max_prepared_xacts;
+
+ /* Create the ProcGlobal shared structure */
+ ProcGlobal = (PROC_HDR *)
+ ShmemInitStruct("Proc Header", sizeof(PROC_HDR), &found);
+ Assert(!found);
+
+ /*
+ * Initialize the data structures.
+ */
+ ProcGlobal->spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+ dlist_init(&ProcGlobal->freeProcs);
+ dlist_init(&ProcGlobal->autovacFreeProcs);
+ dlist_init(&ProcGlobal->bgworkerFreeProcs);
+ dlist_init(&ProcGlobal->walsenderFreeProcs);
+ ProcGlobal->startupBufferPinWaitBufId = -1;
+ ProcGlobal->walwriterLatch = NULL;
+ ProcGlobal->checkpointerLatch = NULL;
+ pg_atomic_init_u32(&ProcGlobal->procArrayGroupFirst, INVALID_PGPROCNO);
+ pg_atomic_init_u32(&ProcGlobal->clogGroupFirst, INVALID_PGPROCNO);
+
+ /*
+ * Create and initialize all the PGPROC structures we'll need. There are
+ * five separate consumers: (1) normal backends, (2) autovacuum workers
+ * and the autovacuum launcher, (3) background workers, (4) auxiliary
+ * processes, and (5) prepared transactions. Each PGPROC structure is
+ * dedicated to exactly one of these purposes, and they do not move
+ * between groups.
+ */
+ procs = (PGPROC *) ShmemAlloc(TotalProcs * sizeof(PGPROC));
+ MemSet(procs, 0, TotalProcs * sizeof(PGPROC));
+ ProcGlobal->allProcs = procs;
+ /* XXX allProcCount isn't really all of them; it excludes prepared xacts */
+ ProcGlobal->allProcCount = MaxBackends + NUM_AUXILIARY_PROCS;
+
+ /*
+ * Allocate arrays mirroring PGPROC fields in a dense manner. See
+ * PROC_HDR.
+ *
+ * XXX: It might make sense to increase padding for these arrays, given
+ * how hotly they are accessed.
+ */
+ ProcGlobal->xids =
+ (TransactionId *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->xids));
+ MemSet(ProcGlobal->xids, 0, TotalProcs * sizeof(*ProcGlobal->xids));
+ ProcGlobal->subxidStates = (XidCacheStatus *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ MemSet(ProcGlobal->subxidStates, 0, TotalProcs * sizeof(*ProcGlobal->subxidStates));
+ ProcGlobal->statusFlags = (uint8 *) ShmemAlloc(TotalProcs * sizeof(*ProcGlobal->statusFlags));
+ MemSet(ProcGlobal->statusFlags, 0, TotalProcs * sizeof(*ProcGlobal->statusFlags));
+
+ for (i = 0; i < TotalProcs; i++)
+ {
+ PGPROC *proc = &procs[i];
+
+ /* Common initialization for all PGPROCs, regardless of type. */
+
+ /*
+ * Set up per-PGPROC semaphore, latch, and fpInfoLock. Prepared xact
+ * dummy PGPROCs don't need these though - they're never associated
+ * with a real process
+ */
+ if (i < MaxBackends + NUM_AUXILIARY_PROCS)
+ {
+ proc->sem = PGSemaphoreCreate();
+ InitSharedLatch(&(proc->procLatch));
+ LWLockInitialize(&(proc->fpInfoLock), LWTRANCHE_LOCK_FASTPATH);
+ }
+ proc->pgprocno = i;
+
+ /*
+ * Newly created PGPROCs for normal backends, autovacuum and bgworkers
+ * must be queued up on the appropriate free list. Because there can
+ * only ever be a small, fixed number of auxiliary processes, no free
+ * list is used in that case; InitAuxiliaryProcess() instead uses a
+ * linear search. PGPROCs for prepared transactions are added to a
+ * free list by TwoPhaseShmemInit().
+ */
+ if (i < MaxConnections)
+ {
+ /* PGPROC for normal backend, add to freeProcs list */
+ dlist_push_head(&ProcGlobal->freeProcs, &proc->links);
+ proc->procgloballist = &ProcGlobal->freeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1)
+ {
+ /* PGPROC for AV launcher/worker, add to autovacFreeProcs list */
+ dlist_push_head(&ProcGlobal->autovacFreeProcs, &proc->links);
+ proc->procgloballist = &ProcGlobal->autovacFreeProcs;
+ }
+ else if (i < MaxConnections + autovacuum_max_workers + 1 + max_worker_processes)
+ {
+ /* PGPROC for bgworker, add to bgworkerFreeProcs list */
+ dlist_push_head(&ProcGlobal->bgworkerFreeProcs, &proc->links);
+ proc->procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ }
+ else if (i < MaxBackends)
+ {
+ /* PGPROC for walsender, add to walsenderFreeProcs list */
+ dlist_push_head(&ProcGlobal->walsenderFreeProcs, &proc->links);
+ proc->procgloballist = &ProcGlobal->walsenderFreeProcs;
+ }
+
+ /* Initialize myProcLocks[] shared memory queues. */
+ for (j = 0; j < NUM_LOCK_PARTITIONS; j++)
+ dlist_init(&(proc->myProcLocks[j]));
+
+ /* Initialize lockGroupMembers list. */
+ dlist_init(&proc->lockGroupMembers);
+
+ /*
+ * Initialize the atomic variables; otherwise, it won't be safe to
+ * access them for backends that aren't currently in use.
+ */
+ pg_atomic_init_u32(&(proc->procArrayGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u32(&(proc->clogGroupNext), INVALID_PGPROCNO);
+ pg_atomic_init_u64(&(proc->waitStart), 0);
+ }
+
+ /*
+ * Save pointers to the blocks of PGPROC structures reserved for auxiliary
+ * processes and prepared transactions.
+ */
+ AuxiliaryProcs = &procs[MaxBackends];
+ PreparedXactProcs = &procs[MaxBackends + NUM_AUXILIARY_PROCS];
+
+ /* Create ProcStructLock spinlock, too */
+ ProcStructLock = (slock_t *) ShmemAlloc(sizeof(slock_t));
+ SpinLockInit(ProcStructLock);
+}
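
As a cross-check of the slot layout established above, here is a small standalone sketch that
maps a PGPROC array index to the consumer it belongs to. The thresholds mirror the if/else
chain in InitProcGlobal(), with MaxBackends expanded to its usual definition (max_connections
+ autovacuum_max_workers + 1 + max_worker_processes + max_wal_senders); the settings in main()
are purely illustrative.

    #include <stdio.h>

    static const char *
    classify_pgproc_slot(int i,
                         int max_connections,
                         int autovacuum_max_workers,
                         int max_worker_processes,
                         int max_wal_senders,
                         int num_aux_procs,
                         int max_prepared_xacts)
    {
        int         max_backends = max_connections + autovacuum_max_workers + 1 +
            max_worker_processes + max_wal_senders;

        if (i < max_connections)
            return "normal backend (freeProcs)";
        if (i < max_connections + autovacuum_max_workers + 1)
            return "autovacuum launcher/worker (autovacFreeProcs)";
        if (i < max_connections + autovacuum_max_workers + 1 + max_worker_processes)
            return "background worker (bgworkerFreeProcs)";
        if (i < max_backends)
            return "walsender (walsenderFreeProcs)";
        if (i < max_backends + num_aux_procs)
            return "auxiliary process (no free list)";
        if (i < max_backends + num_aux_procs + max_prepared_xacts)
            return "prepared transaction dummy (listed by TwoPhaseShmemInit)";
        return "out of range";
    }

    int
    main(void)
    {
        /* Illustrative settings: 100 connections, 3 autovacuum workers,
         * 8 worker processes, 10 walsenders, 5 auxiliary slots, 2 prepared xacts. */
        int         samples[] = {0, 99, 100, 103, 104, 111, 112, 121, 122, 126, 127, 128, 129};

        for (int i = 0; i < (int) (sizeof(samples) / sizeof(samples[0])); i++)
            printf("slot %3d: %s\n", samples[i],
                   classify_pgproc_slot(samples[i], 100, 3, 8, 10, 5, 2));
        return 0;
    }
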
+
+/*
+ * InitProcess -- initialize a per-process data structure for this backend
+ */
+void
+InitProcess(void)
+{
+ dlist_head *procgloballist;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /* Decide which list should supply our PGPROC. */
+ if (IsAnyAutoVacuumProcess())
+ procgloballist = &ProcGlobal->autovacFreeProcs;
+ else if (IsBackgroundWorker)
+ procgloballist = &ProcGlobal->bgworkerFreeProcs;
+ else if (am_walsender)
+ procgloballist = &ProcGlobal->walsenderFreeProcs;
+ else
+ procgloballist = &ProcGlobal->freeProcs;
+
+ /*
+ * Try to get a proc struct from the appropriate free list. If this
+ * fails, we must be out of PGPROC structures (not to mention semaphores).
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ if (!dlist_is_empty(procgloballist))
+ {
+ MyProc = (PGPROC *) dlist_pop_head_node(procgloballist);
+ SpinLockRelease(ProcStructLock);
+ }
+ else
+ {
+ /*
+ * If we reach here, all the PGPROCs are in use. This is one of the
+ * possible places to detect "too many backends", so give the standard
+ * error message. XXX do we need to give a different failure message
+ * in the autovacuum case?
+ */
+ SpinLockRelease(ProcStructLock);
+ if (am_walsender)
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("number of requested standby connections exceeds max_wal_senders (currently %d)",
+ max_wal_senders)));
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Cross-check that the PGPROC is of the type we expect; if this were not
+ * the case, it would get returned to the wrong list.
+ */
+ Assert(MyProc->procgloballist == procgloballist);
+
+ /*
+ * Now that we have a PGPROC, mark ourselves as an active postmaster
+ * child; this is so that the postmaster can detect it if we exit without
+ * cleaning up. (XXX autovac launcher currently doesn't participate in
+ * this; it probably should.)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildActive();
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ dlist_node_init(&MyProc->links);
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->pid = MyProcPid;
+ /* backendId, databaseId and roleId will be filled in later */
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkptFlags = 0;
+ MyProc->statusFlags = 0;
+ /* NB -- autovac launcher intentionally does not set IS_AUTOVACUUM */
+ if (IsAutoVacuumWorkerProcess())
+ MyProc->statusFlags |= PROC_IS_AUTOVACUUM;
+ MyProc->lwWaiting = LW_WS_NOT_WAITING;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+ MyProc->recoveryConflictPending = false;
+
+ /* Initialize fields for sync rep */
+ MyProc->waitLSN = 0;
+ MyProc->syncRepState = SYNC_REP_NOT_WAITING;
+ dlist_node_init(&MyProc->syncRepLinks);
+
+ /* Initialize fields for group XID clearing. */
+ MyProc->procArrayGroupMember = false;
+ MyProc->procArrayGroupMemberXid = InvalidTransactionId;
+ Assert(pg_atomic_read_u32(&MyProc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /* Initialize wait event information. */
+ MyProc->wait_event_info = 0;
+
+ /* Initialize fields for group transaction status update. */
+ MyProc->clogGroupMember = false;
+ MyProc->clogGroupMemberXid = InvalidTransactionId;
+ MyProc->clogGroupMemberXidStatus = TRANSACTION_STATUS_IN_PROGRESS;
+ MyProc->clogGroupMemberPage = -1;
+ MyProc->clogGroupMemberLsn = InvalidXLogRecPtr;
+ Assert(pg_atomic_read_u32(&MyProc->clogGroupNext) == INVALID_PGPROCNO);
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+ * points to the process-local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at backend exit.
+ */
+ on_shmem_exit(ProcKill, 0);
+
+ /*
+ * Now that we have a PGPROC, we could try to acquire locks, so initialize
+ * local state needed for LWLocks, and the deadlock checker.
+ */
+ InitLWLockAccess();
+ InitDeadLockChecking();
+}
+
+/*
+ * InitProcessPhase2 -- make MyProc visible in the shared ProcArray.
+ *
+ * This is separate from InitProcess because we can't acquire LWLocks until
+ * we've created a PGPROC, but in the EXEC_BACKEND case ProcArrayAdd won't
+ * work until after we've done CreateSharedMemoryAndSemaphores.
+ */
+void
+InitProcessPhase2(void)
+{
+ Assert(MyProc != NULL);
+
+ /*
+ * Add our PGPROC to the PGPROC array in shared memory.
+ */
+ ProcArrayAdd(MyProc);
+
+ /*
+ * Arrange to clean that up at backend exit.
+ */
+ on_shmem_exit(RemoveProcFromArray, 0);
+}
+
+/*
+ * InitAuxiliaryProcess -- create a per-auxiliary-process data structure
+ *
+ * This is called by bgwriter and similar processes so that they will have a
+ * MyProc value that's real enough to let them wait for LWLocks. The PGPROC
+ * and sema that are assigned are one of the extra ones created during
+ * InitProcGlobal.
+ *
+ * Auxiliary processes are presently not expected to wait for real (lockmgr)
+ * locks, so we need not set up the deadlock checker. They are never added
+ * to the ProcArray or the sinval messaging mechanism, either. They also
+ * don't get a VXID assigned, since this is only useful when we actually
+ * hold lockmgr locks.
+ *
+ * The startup process, however, does use locks, but it never waits for
+ * them in the normal backend sense. It also takes part in sinval
+ * messaging as a sendOnly process, so it never reads messages from the
+ * sinval queue. The startup process therefore does have a VXID and does
+ * show up in pg_locks.
+ */
+void
+InitAuxiliaryProcess(void)
+{
+ PGPROC *auxproc;
+ int proctype;
+
+ /*
+ * ProcGlobal should be set up already (if we are a backend, we inherit
+ * this by fork() or EXEC_BACKEND mechanism from the postmaster).
+ */
+ if (ProcGlobal == NULL || AuxiliaryProcs == NULL)
+ elog(PANIC, "proc header uninitialized");
+
+ if (MyProc != NULL)
+ elog(ERROR, "you already exist");
+
+ /*
+ * We use the ProcStructLock to protect assignment and releasing of
+ * AuxiliaryProcs entries.
+ *
+ * While we are holding the ProcStructLock, also copy the current shared
+ * estimate of spins_per_delay to local storage.
+ */
+ SpinLockAcquire(ProcStructLock);
+
+ set_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ /*
+ * Find a free auxproc ... *big* trouble if there isn't one ...
+ */
+ for (proctype = 0; proctype < NUM_AUXILIARY_PROCS; proctype++)
+ {
+ auxproc = &AuxiliaryProcs[proctype];
+ if (auxproc->pid == 0)
+ break;
+ }
+ if (proctype >= NUM_AUXILIARY_PROCS)
+ {
+ SpinLockRelease(ProcStructLock);
+ elog(FATAL, "all AuxiliaryProcs are in use");
+ }
+
+ /* Mark auxiliary proc as in use by me */
+ /* use volatile pointer to prevent code rearrangement */
+ ((volatile PGPROC *) auxproc)->pid = MyProcPid;
+
+ MyProc = auxproc;
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * Initialize all fields of MyProc, except for those previously
+ * initialized by InitProcGlobal.
+ */
+ dlist_node_init(&MyProc->links);
+ MyProc->waitStatus = PROC_WAIT_STATUS_OK;
+ MyProc->lxid = InvalidLocalTransactionId;
+ MyProc->fpVXIDLock = false;
+ MyProc->fpLocalTransactionId = InvalidLocalTransactionId;
+ MyProc->xid = InvalidTransactionId;
+ MyProc->xmin = InvalidTransactionId;
+ MyProc->backendId = InvalidBackendId;
+ MyProc->databaseId = InvalidOid;
+ MyProc->roleId = InvalidOid;
+ MyProc->tempNamespaceId = InvalidOid;
+ MyProc->isBackgroundWorker = IsBackgroundWorker;
+ MyProc->delayChkptFlags = 0;
+ MyProc->statusFlags = 0;
+ MyProc->lwWaiting = LW_WS_NOT_WAITING;
+ MyProc->lwWaitMode = 0;
+ MyProc->waitLock = NULL;
+ MyProc->waitProcLock = NULL;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Acquire ownership of the PGPROC's latch, so that we can use WaitLatch
+ * on it. That allows us to repoint the process latch, which so far
+ * points to the process-local one, to the shared one.
+ */
+ OwnLatch(&MyProc->procLatch);
+ SwitchToSharedLatch();
+
+ /* now that we have a proc, report wait events to shared memory */
+ pgstat_set_wait_event_storage(&MyProc->wait_event_info);
+
+ /* Check that group locking fields are in a proper initial state. */
+ Assert(MyProc->lockGroupLeader == NULL);
+ Assert(dlist_is_empty(&MyProc->lockGroupMembers));
+
+ /*
+ * We might be reusing a semaphore that belonged to a failed process. So
+ * be careful and reinitialize its value here. (This is not strictly
+ * necessary anymore, but seems like a good idea for cleanliness.)
+ */
+ PGSemaphoreReset(MyProc->sem);
+
+ /*
+ * Arrange to clean up at process exit.
+ */
+ on_shmem_exit(AuxiliaryProcKill, Int32GetDatum(proctype));
+}
+
+/*
+ * Used from bufmgr to share the value of the buffer that Startup waits on,
+ * or to reset the value to "not waiting" (-1). This allows processing
+ * of recovery conflicts for buffer pins. Set is made before backends look
+ * at this value, so locking not required, especially since the set is
+ * an atomic integer set operation.
+ */
+void
+SetStartupBufferPinWaitBufId(int bufid)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ procglobal->startupBufferPinWaitBufId = bufid;
+}
+
+/*
+ * Used by backends when they receive a request to check for buffer pin waits.
+ */
+int
+GetStartupBufferPinWaitBufId(void)
+{
+ /* use volatile pointer to prevent code rearrangement */
+ volatile PROC_HDR *procglobal = ProcGlobal;
+
+ return procglobal->startupBufferPinWaitBufId;
+}
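
The two functions above rely on a single aligned int store being atomic with respect to
readers (plus "volatile" to prevent reordering by the compiler), so no lock is taken. A
portable restatement of the same single-writer/any-reader handshake using C11 atomics, as a
sketch rather than how proc.c itself is written:

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int startup_wait_buf = -1;    /* -1 means "not waiting" */

    /* Called only by the single writer (the startup process in the real code). */
    static void
    set_wait_buf(int bufid)
    {
        atomic_store_explicit(&startup_wait_buf, bufid, memory_order_relaxed);
    }

    /* Any backend may read the value without taking a lock. */
    static int
    get_wait_buf(void)
    {
        return atomic_load_explicit(&startup_wait_buf, memory_order_relaxed);
    }

    int
    main(void)
    {
        set_wait_buf(42);
        printf("startup is waiting on buffer %d\n", get_wait_buf());
        set_wait_buf(-1);       /* back to "not waiting" */
        return 0;
    }
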
+
+/*
+ * Check whether there are at least N free PGPROC objects. If false is
+ * returned, *nfree will be set to the number of free PGPROC objects.
+ * Otherwise, *nfree will be set to n.
+ *
+ * Note: this is designed on the assumption that N will generally be small.
+ */
+bool
+HaveNFreeProcs(int n, int *nfree)
+{
+ dlist_iter iter;
+
+ Assert(n > 0);
+ Assert(nfree);
+
+ SpinLockAcquire(ProcStructLock);
+
+ *nfree = 0;
+ dlist_foreach(iter, &ProcGlobal->freeProcs)
+ {
+ (*nfree)++;
+ if (*nfree == n)
+ break;
+ }
+
+ SpinLockRelease(ProcStructLock);
+
+ return (*nfree == n);
+}
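
The point of the loop above is the early exit: counting stops as soon as n free slots have
been seen, rather than walking the entire freelist under the spinlock. Restated over a plain
array purely for illustration (nothing here is backend code):

    #include <stdbool.h>
    #include <stdio.h>

    /* Stop counting as soon as n free slots have been found. */
    static bool
    have_n_free(const bool *slot_free, int nslots, int n, int *nfree)
    {
        *nfree = 0;
        for (int i = 0; i < nslots && *nfree < n; i++)
            if (slot_free[i])
                (*nfree)++;
        return *nfree == n;
    }

    int
    main(void)
    {
        bool        slots[8] = {true, false, true, true, false, true, false, false};
        int         nfree;

        printf("have 3 free? %s (saw %d)\n",
               have_n_free(slots, 8, 3, &nfree) ? "yes" : "no", nfree);
        return 0;
    }
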
+
+/*
+ * Check if the current process is awaiting a lock.
+ */
+bool
+IsWaitingForLock(void)
+{
+ if (lockAwaited == NULL)
+ return false;
+
+ return true;
+}
+
+/*
+ * Cancel any pending wait for lock, when aborting a transaction, and revert
+ * any strong lock count acquisition for a lock being acquired.
+ *
+ * (Normally, this would only happen if we accept a cancel/die
+ * interrupt while waiting; but an ereport(ERROR) before or during the lock
+ * wait is within the realm of possibility, too.)
+ */
+void
+LockErrorCleanup(void)
+{
+ LWLock *partitionLock;
+ DisableTimeoutParams timeouts[2];
+
+ HOLD_INTERRUPTS();
+
+ AbortStrongLockAcquire();
+
+ /* Nothing to do if we weren't waiting for a lock */
+ if (lockAwaited == NULL)
+ {
+ RESUME_INTERRUPTS();
+ return;
+ }
+
+ /*
+ * Turn off the deadlock and lock timeout timers, if they are still
+ * running (see ProcSleep). Note we must preserve the LOCK_TIMEOUT
+ * indicator flag, since this function is executed before
+ * ProcessInterrupts when responding to SIGINT; else we'd lose the
+ * knowledge that the SIGINT came from a lock timeout and not an external
+ * source.
+ */
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+
+ /* Unlink myself from the wait queue, if on it (might not be anymore!) */
+ partitionLock = LockHashPartitionLock(lockAwaited->hashcode);
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ if (!dlist_node_is_detached(&MyProc->links))
+ {
+ /* We could not have been granted the lock yet */
+ RemoveFromWaitQueue(MyProc, lockAwaited->hashcode);
+ }
+ else
+ {
+ /*
+ * Somebody kicked us off the lock queue already. Perhaps they
+ * granted us the lock, or perhaps they detected a deadlock. If they
+ * did grant us the lock, we'd better remember it in our local lock
+ * table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+ }
+
+ lockAwaited = NULL;
+
+ LWLockRelease(partitionLock);
+
+ RESUME_INTERRUPTS();
+}
+
+
+/*
+ * ProcReleaseLocks() -- release locks associated with current transaction
+ * at main transaction commit or abort
+ *
+ * At main transaction commit, we release standard locks except session locks.
+ * At main transaction abort, we release all locks including session locks.
+ *
+ * Advisory locks are released only if they are transaction-level;
+ * session-level holds remain, whether this is a commit or not.
+ *
+ * At subtransaction commit, we don't release any locks (so this func is not
+ * needed at all); we will defer the releasing to the parent transaction.
+ * At subtransaction abort, we release all locks held by the subtransaction;
+ * this is implemented by retail releasing of the locks under control of
+ * the ResourceOwner mechanism.
+ */
+void
+ProcReleaseLocks(bool isCommit)
+{
+ if (!MyProc)
+ return;
+ /* If waiting, get off wait queue (should only be needed after error) */
+ LockErrorCleanup();
+ /* Release standard locks, including session-level if aborting */
+ LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit);
+ /* Release transaction-level advisory locks */
+ LockReleaseAll(USER_LOCKMETHOD, false);
+}
+
+
+/*
+ * RemoveProcFromArray() -- Remove this process from the shared ProcArray.
+ */
+static void
+RemoveProcFromArray(int code, Datum arg)
+{
+ Assert(MyProc != NULL);
+ ProcArrayRemove(MyProc, InvalidTransactionId);
+}
+
+/*
+ * ProcKill() -- Destroy the per-proc data structure for
+ * this process. Release any of its held LW locks.
+ */
+static void
+ProcKill(int code, Datum arg)
+{
+ PGPROC *proc;
+ dlist_head *procgloballist;
+
+ Assert(MyProc != NULL);
+
+ /* not safe if forked by system(), etc. */
+ if (MyProc->pid != (int) getpid())
+ elog(PANIC, "ProcKill() called in child process");
+
+ /* Make sure we're out of the sync rep lists */
+ SyncRepCleanupAtProcExit();
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ int i;
+
+ /* Last process should have released all locks. */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ Assert(dlist_is_empty(&(MyProc->myProcLocks[i])));
+ }
+#endif
+
+ /*
+ * Release any LW locks I am holding. There really shouldn't be any, but
+ * it's cheap to check again before we cut the knees off the LWLock
+ * facility by releasing our PGPROC ...
+ */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /*
+ * Detach from any lock group of which we are a member. If the leader
+ * exits before all other group members, its PGPROC will remain allocated
+ * until the last group process exits; that process must return the
+ * leader's PGPROC to the appropriate list.
+ */
+ if (MyProc->lockGroupLeader != NULL)
+ {
+ PGPROC *leader = MyProc->lockGroupLeader;
+ LWLock *leader_lwlock = LockHashPartitionLockByProc(leader);
+
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ Assert(!dlist_is_empty(&leader->lockGroupMembers));
+ dlist_delete(&MyProc->lockGroupLink);
+ if (dlist_is_empty(&leader->lockGroupMembers))
+ {
+ leader->lockGroupLeader = NULL;
+ if (leader != MyProc)
+ {
+ procgloballist = leader->procgloballist;
+
+ /* Leader exited first; return its PGPROC. */
+ SpinLockAcquire(ProcStructLock);
+ dlist_push_head(procgloballist, &leader->links);
+ SpinLockRelease(ProcStructLock);
+ }
+ }
+ else if (leader != MyProc)
+ MyProc->lockGroupLeader = NULL;
+ LWLockRelease(leader_lwlock);
+ }
+
+ /*
+ * Reset MyLatch to the process local one. This is so that signal
+ * handlers et al can continue using the latch after the shared latch
+ * isn't ours anymore.
+ *
+ * Similarly, stop reporting wait events to MyProc->wait_event_info.
+ *
+ * After that clear MyProc and disown the shared latch.
+ */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ procgloballist = proc->procgloballist;
+ SpinLockAcquire(ProcStructLock);
+
+ /*
+ * If we're still a member of a locking group, that means we're a leader
+ * which has somehow exited before its children. The last remaining child
+ * will release our PGPROC. Otherwise, release it now.
+ */
+ if (proc->lockGroupLeader == NULL)
+ {
+ /* Since lockGroupLeader is NULL, lockGroupMembers should be empty. */
+ Assert(dlist_is_empty(&proc->lockGroupMembers));
+
+ /* Return PGPROC structure (and semaphore) to appropriate freelist */
+ dlist_push_tail(procgloballist, &proc->links);
+ }
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+
+ /*
+ * This process is no longer present in shared memory in any meaningful
+ * way, so tell the postmaster we've cleaned up acceptably well. (XXX
+ * autovac launcher should be included here someday)
+ */
+ if (IsUnderPostmaster && !IsAutoVacuumLauncherProcess())
+ MarkPostmasterChildInactive();
+
+ /* wake autovac launcher if needed -- see comments in FreeWorkerInfo */
+ if (AutovacuumLauncherPid != 0)
+ kill(AutovacuumLauncherPid, SIGUSR2);
+}
+
+/*
+ * AuxiliaryProcKill() -- Cut-down version of ProcKill for auxiliary
+ * processes (bgwriter, etc). The PGPROC and sema are not released, only
+ * marked as not-in-use.
+ */
+static void
+AuxiliaryProcKill(int code, Datum arg)
+{
+ int proctype = DatumGetInt32(arg);
+ PGPROC *auxproc PG_USED_FOR_ASSERTS_ONLY;
+ PGPROC *proc;
+
+ Assert(proctype >= 0 && proctype < NUM_AUXILIARY_PROCS);
+
+ /* not safe if forked by system(), etc. */
+ if (MyProc->pid != (int) getpid())
+ elog(PANIC, "AuxiliaryProcKill() called in child process");
+
+ auxproc = &AuxiliaryProcs[proctype];
+
+ Assert(MyProc == auxproc);
+
+ /* Release any LW locks I am holding (see notes above) */
+ LWLockReleaseAll();
+
+ /* Cancel any pending condition variable sleep, too */
+ ConditionVariableCancelSleep();
+
+ /* look at the equivalent ProcKill() code for comments */
+ SwitchBackToLocalLatch();
+ pgstat_reset_wait_event_storage();
+
+ proc = MyProc;
+ MyProc = NULL;
+ DisownLatch(&proc->procLatch);
+
+ SpinLockAcquire(ProcStructLock);
+
+ /* Mark auxiliary proc no longer in use */
+ proc->pid = 0;
+
+ /* Update shared estimate of spins_per_delay */
+ ProcGlobal->spins_per_delay = update_spins_per_delay(ProcGlobal->spins_per_delay);
+
+ SpinLockRelease(ProcStructLock);
+}
+
+/*
+ * AuxiliaryPidGetProc -- get PGPROC for an auxiliary process
+ * given its PID
+ *
+ * Returns NULL if not found.
+ */
+PGPROC *
+AuxiliaryPidGetProc(int pid)
+{
+ PGPROC *result = NULL;
+ int index;
+
+ if (pid == 0) /* never match dummy PGPROCs */
+ return NULL;
+
+ for (index = 0; index < NUM_AUXILIARY_PROCS; index++)
+ {
+ PGPROC *proc = &AuxiliaryProcs[index];
+
+ if (proc->pid == pid)
+ {
+ result = proc;
+ break;
+ }
+ }
+ return result;
+}
+
+
+/*
+ * ProcSleep -- put a process to sleep on the specified lock
+ *
+ * Caller must have set MyProc->heldLocks to reflect locks already held
+ * on the lockable object by this process (under all XIDs).
+ *
+ * The lock table's partition lock must be held at entry, and will be held
+ * at exit.
+ *
+ * Result: PROC_WAIT_STATUS_OK if we acquired the lock, PROC_WAIT_STATUS_ERROR if not (deadlock).
+ *
+ * ASSUME: that no one will fiddle with the queue until after
+ * we release the partition lock.
+ *
+ * NOTES: The process queue is now a priority queue for locking.
+ */
+ProcWaitStatus
+ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable)
+{
+ LOCKMODE lockmode = locallock->tag.mode;
+ LOCK *lock = locallock->lock;
+ PROCLOCK *proclock = locallock->proclock;
+ uint32 hashcode = locallock->hashcode;
+ LWLock *partitionLock = LockHashPartitionLock(hashcode);
+ dclist_head *waitQueue = &lock->waitProcs;
+ PGPROC *insert_before = NULL;
+ LOCKMASK myHeldLocks = MyProc->heldLocks;
+ TimestampTz standbyWaitStart = 0;
+ bool early_deadlock = false;
+ bool allow_autovacuum_cancel = true;
+ bool logged_recovery_conflict = false;
+ ProcWaitStatus myWaitStatus;
+ PGPROC *leader = MyProc->lockGroupLeader;
+
+ /*
+ * If group locking is in use, locks held by members of my locking group
+ * need to be included in myHeldLocks. This is not required for relation
+ * extension locks, which do conflict among group members. However,
+ * including them in myHeldLocks gives group members priority over other
+ * backends that are also trying to acquire those locks. We could avoid
+ * giving group members priority for that kind of lock, but there doesn't
+ * appear to be any clear advantage in doing so.
+ */
+ if (leader != NULL)
+ {
+ dlist_iter iter;
+
+ dlist_foreach(iter, &lock->procLocks)
+ {
+ PROCLOCK *otherproclock;
+
+ otherproclock = dlist_container(PROCLOCK, lockLink, iter.cur);
+
+ if (otherproclock->groupLeader == leader)
+ myHeldLocks |= otherproclock->holdMask;
+ }
+ }
+
+ /*
+ * Determine where to add myself in the wait queue.
+ *
+ * Normally I should go at the end of the queue. However, if I already
+ * hold locks that conflict with the request of any previous waiter, put
+ * myself in the queue just in front of the first such waiter. This is not
+ * a necessary step, since deadlock detection would move me to before that
+ * waiter anyway; but it's relatively cheap to detect such a conflict
+ * immediately, and avoid delaying till deadlock timeout.
+ *
+ * Special case: if I find I should go in front of some waiter, check to
+ * see if I conflict with already-held locks or the requests before that
+ * waiter. If not, then just grant myself the requested lock immediately.
+ * This is the same as the test for immediate grant in LockAcquire, except
+ * we are only considering the part of the wait queue before my insertion
+ * point.
+ */
+ if (myHeldLocks != 0 && !dclist_is_empty(waitQueue))
+ {
+ LOCKMASK aheadRequests = 0;
+ dlist_iter iter;
+
+ dclist_foreach(iter, waitQueue)
+ {
+ PGPROC *proc = dlist_container(PGPROC, links, iter.cur);
+
+ /*
+ * If we're part of the same locking group as this waiter, its
+ * locks neither conflict with ours nor contribute to
+ * aheadRequests.
+ */
+ if (leader != NULL && leader == proc->lockGroupLeader)
+ continue;
+
+ /* Must he wait for me? */
+ if (lockMethodTable->conflictTab[proc->waitLockMode] & myHeldLocks)
+ {
+ /* Must I wait for him? */
+ if (lockMethodTable->conflictTab[lockmode] & proc->heldLocks)
+ {
+ /*
+ * Yes, so we have a deadlock. Easiest way to clean up
+ * correctly is to call RemoveFromWaitQueue(), but we
+ * can't do that until we are *on* the wait queue. So, set
+ * a flag to check below, and break out of loop. Also,
+ * record deadlock info for later message.
+ */
+ RememberSimpleDeadLock(MyProc, lockmode, lock, proc);
+ early_deadlock = true;
+ break;
+ }
+ /* I must go before this waiter. Check special case. */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proclock))
+ {
+ /* Skip the wait and just grant myself the lock. */
+ GrantLock(lock, proclock, lockmode);
+ GrantAwaitedLock();
+ return PROC_WAIT_STATUS_OK;
+ }
+
+ /* Put myself into wait queue before conflicting process */
+ insert_before = proc;
+ break;
+ }
+ /* Nope, so advance to next waiter */
+ aheadRequests |= LOCKBIT_ON(proc->waitLockMode);
+ }
+ }
+
+ /*
+ * Insert self into queue, at the position determined above.
+ */
+ if (insert_before)
+ dclist_insert_before(waitQueue, &insert_before->links, &MyProc->links);
+ else
+ dclist_push_tail(waitQueue, &MyProc->links);
+
+ lock->waitMask |= LOCKBIT_ON(lockmode);
+
+ /* Set up wait information in PGPROC object, too */
+ MyProc->waitLock = lock;
+ MyProc->waitProcLock = proclock;
+ MyProc->waitLockMode = lockmode;
+
+ MyProc->waitStatus = PROC_WAIT_STATUS_WAITING;
+
+ /*
+ * If we detected deadlock, give up without waiting. This must agree with
+ * CheckDeadLock's recovery code.
+ */
+ if (early_deadlock)
+ {
+ RemoveFromWaitQueue(MyProc, hashcode);
+ return PROC_WAIT_STATUS_ERROR;
+ }
+
+ /* mark that we are waiting for a lock */
+ lockAwaited = locallock;
+
+ /*
+ * Release the lock table's partition lock.
+ *
+ * NOTE: this may also cause us to exit critical-section state, possibly
+ * allowing a cancel/die interrupt to be accepted. This is OK because we
+ * have recorded the fact that we are waiting for a lock, and so
+ * LockErrorCleanup will clean up if cancel/die happens.
+ */
+ LWLockRelease(partitionLock);
+
+ /*
+ * Also, now that we will successfully clean up after an ereport, it's
+ * safe to check to see if there's a buffer pin deadlock against the
+ * Startup process. Of course, that's only necessary if we're doing Hot
+ * Standby and are not the Startup process ourselves.
+ */
+ if (RecoveryInProgress() && !InRecovery)
+ CheckRecoveryConflictDeadlock();
+
+ /* Reset deadlock_state before enabling the timeout handler */
+ deadlock_state = DS_NOT_YET_CHECKED;
+ got_deadlock_timeout = false;
+
+ /*
+ * Set a timer so we can wake up after a while and check for a deadlock. If a
+ * deadlock is detected, the handler sets MyProc->waitStatus =
+ * PROC_WAIT_STATUS_ERROR, allowing us to know that we must report failure
+ * rather than success.
+ *
+ * By delaying the check until we've waited for a bit, we can avoid
+ * running the rather expensive deadlock-check code in most cases.
+ *
+ * If LockTimeout is set, also enable the timeout for that. We can save a
+ * few cycles by enabling both timeout sources in one call.
+ *
+ * If InHotStandby we set lock waits slightly later for clarity with other
+ * code.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ EnableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].type = TMPARAM_AFTER;
+ timeouts[0].delay_ms = DeadlockTimeout;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].type = TMPARAM_AFTER;
+ timeouts[1].delay_ms = LockTimeout;
+ enable_timeouts(timeouts, 2);
+ }
+ else
+ enable_timeout_after(DEADLOCK_TIMEOUT, DeadlockTimeout);
+
+ /*
+ * Use the current time obtained for the deadlock timeout timer as
+ * waitStart (i.e., the time when this process started waiting for the
+ * lock). Fetching the current time again would add overhead, so we
+ * reuse the timestamp already obtained for the timer.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead of an additional lock
+ * acquisition. This can cause "waitstart" in pg_locks to become NULL
+ * for a very short period of time after the wait started even though
+ * "granted" is false. This is OK in practice because we can assume
+ * that users are likely to look at "waitstart" when waiting for the
+ * lock for a long time.
+ */
+ pg_atomic_write_u64(&MyProc->waitStart,
+ get_timeout_start_time(DEADLOCK_TIMEOUT));
+ }
+ else if (log_recovery_conflict_waits)
+ {
+ /*
+ * Set the wait start timestamp if logging is enabled and in hot
+ * standby.
+ */
+ standbyWaitStart = GetCurrentTimestamp();
+ }
+
+ /*
+ * If somebody wakes us between LWLockRelease and WaitLatch, the latch
+ * will not wait. But a set latch does not necessarily mean that the lock
+ * is free now, as there are many other sources for latch sets than
+ * somebody releasing the lock.
+ *
+ * We process interrupts whenever the latch has been set, so cancel/die
+ * interrupts are processed quickly. This means we must not mind losing
+ * control to a cancel/die interrupt here. We don't, because we have no
+ * shared-state-change work to do after being granted the lock (the
+ * grantor did it all). We do have to worry about canceling the deadlock
+ * timeout and updating the locallock table, but if we lose control to an
+ * error, LockErrorCleanup will fix that up.
+ */
+ do
+ {
+ if (InHotStandby)
+ {
+ bool maybe_log_conflict =
+ (standbyWaitStart != 0 && !logged_recovery_conflict);
+
+ /* Set a timer and wait for that or for the lock to be granted */
+ ResolveRecoveryConflictWithLock(locallock->tag.lock,
+ maybe_log_conflict);
+
+ /*
+ * Emit the log message if the startup process is waiting longer
+ * than deadlock_timeout for recovery conflict on lock.
+ */
+ if (maybe_log_conflict)
+ {
+ TimestampTz now = GetCurrentTimestamp();
+
+ if (TimestampDifferenceExceeds(standbyWaitStart, now,
+ DeadlockTimeout))
+ {
+ VirtualTransactionId *vxids;
+ int cnt;
+
+ vxids = GetLockConflicts(&locallock->tag.lock,
+ AccessExclusiveLock, &cnt);
+
+ /*
+ * Log the recovery conflict and the list of PIDs of
+ * backends holding the conflicting lock. Note that we do
+ * logging even if there are no such backends right now
+ * because the startup process here has already waited
+ * longer than deadlock_timeout.
+ */
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, now,
+ cnt > 0 ? vxids : NULL, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+ else
+ {
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ PG_WAIT_LOCK | locallock->tag.lock.locktag_type);
+ ResetLatch(MyLatch);
+ /* check for deadlocks first, as that's probably log-worthy */
+ if (got_deadlock_timeout)
+ {
+ CheckDeadLock();
+ got_deadlock_timeout = false;
+ }
+ CHECK_FOR_INTERRUPTS();
+ }
+
+ /*
+ * waitStatus could change from PROC_WAIT_STATUS_WAITING to something
+ * else asynchronously. Read it just once per loop to prevent
+ * surprising behavior (such as missing log messages).
+ */
+ myWaitStatus = *((volatile ProcWaitStatus *) &MyProc->waitStatus);
+
+ /*
+ * If we are not deadlocked, but are waiting on an autovacuum-induced
+ * task, send a signal to interrupt it.
+ */
+ if (deadlock_state == DS_BLOCKED_BY_AUTOVACUUM && allow_autovacuum_cancel)
+ {
+ PGPROC *autovac = GetBlockingAutoVacuumPgproc();
+ uint8 statusFlags;
+ uint8 lockmethod_copy;
+ LOCKTAG locktag_copy;
+
+ /*
+ * Grab info we need, then release lock immediately. Note this
+ * coding means that there is a tiny chance that the process
+ * terminates its current transaction and starts a different one
+ * before we have a chance to send the signal; the worst possible
+ * consequence is that a for-wraparound vacuum is cancelled. But
+ * that could happen in any case unless we were to do kill() with
+ * the lock held, which is much more undesirable.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ statusFlags = ProcGlobal->statusFlags[autovac->pgxactoff];
+ lockmethod_copy = lock->tag.locktag_lockmethodid;
+ locktag_copy = lock->tag;
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Only do it if the worker is not working to protect against Xid
+ * wraparound.
+ */
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ !(statusFlags & PROC_VACUUM_FOR_WRAPAROUND))
+ {
+ int pid = autovac->pid;
+
+ /* report the case, if configured to do so */
+ if (message_level_is_interesting(DEBUG1))
+ {
+ StringInfoData locktagbuf;
+ StringInfoData logbuf; /* errdetail for server log */
+
+ initStringInfo(&locktagbuf);
+ initStringInfo(&logbuf);
+ DescribeLockTag(&locktagbuf, &locktag_copy);
+ appendStringInfo(&logbuf,
+ "Process %d waits for %s on %s.",
+ MyProcPid,
+ GetLockmodeName(lockmethod_copy, lockmode),
+ locktagbuf.data);
+
+ ereport(DEBUG1,
+ (errmsg_internal("sending cancel to blocking autovacuum PID %d",
+ pid),
+ errdetail_log("%s", logbuf.data)));
+
+ pfree(locktagbuf.data);
+ pfree(logbuf.data);
+ }
+
+ /* send the autovacuum worker Back to Old Kent Road */
+ if (kill(pid, SIGINT) < 0)
+ {
+ /*
+ * There's a race condition here: once we release the
+ * ProcArrayLock, it's possible for the autovac worker to
+ * close up shop and exit before we can do the kill().
+ * Therefore, we do not whinge about no-such-process.
+ * Other errors such as EPERM could conceivably happen if
+ * the kernel recycles the PID fast enough, but such cases
+ * seem improbable enough that it's probably best to issue
+ * a warning if we see some other errno.
+ */
+ if (errno != ESRCH)
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m",
+ pid)));
+ }
+ }
+
+ /* prevent signal from being sent again more than once */
+ allow_autovacuum_cancel = false;
+ }
+
+ /*
+ * If awoken after the deadlock check interrupt has run, and
+ * log_lock_waits is on, then report about the wait.
+ */
+ if (log_lock_waits && deadlock_state != DS_NOT_YET_CHECKED)
+ {
+ StringInfoData buf,
+ lock_waiters_sbuf,
+ lock_holders_sbuf;
+ const char *modename;
+ long secs;
+ int usecs;
+ long msecs;
+ dlist_iter proc_iter;
+ PROCLOCK *curproclock;
+ bool first_holder = true,
+ first_waiter = true;
+ int lockHoldersNum = 0;
+
+ initStringInfo(&buf);
+ initStringInfo(&lock_waiters_sbuf);
+ initStringInfo(&lock_holders_sbuf);
+
+ DescribeLockTag(&buf, &locallock->tag.lock);
+ modename = GetLockmodeName(locallock->tag.lock.locktag_lockmethodid,
+ lockmode);
+ TimestampDifference(get_timeout_start_time(DEADLOCK_TIMEOUT),
+ GetCurrentTimestamp(),
+ &secs, &usecs);
+ msecs = secs * 1000 + usecs / 1000;
+ usecs = usecs % 1000;
+
+ /*
+ * we loop over the lock's procLocks to gather a list of all
+ * holders and waiters. Thus we will be able to provide more
+ * detailed information for lock debugging purposes.
+ *
+ * lock->procLocks contains all processes which hold or wait for
+ * this lock.
+ */
+
+ LWLockAcquire(partitionLock, LW_SHARED);
+
+ dlist_foreach(proc_iter, &lock->procLocks)
+ {
+ curproclock =
+ dlist_container(PROCLOCK, lockLink, proc_iter.cur);
+
+ /*
+ * we are a waiter if myProc->waitProcLock == curproclock; we
+ * are a holder if it is NULL or something different
+ */
+ if (curproclock->tag.myProc->waitProcLock == curproclock)
+ {
+ if (first_waiter)
+ {
+ appendStringInfo(&lock_waiters_sbuf, "%d",
+ curproclock->tag.myProc->pid);
+ first_waiter = false;
+ }
+ else
+ appendStringInfo(&lock_waiters_sbuf, ", %d",
+ curproclock->tag.myProc->pid);
+ }
+ else
+ {
+ if (first_holder)
+ {
+ appendStringInfo(&lock_holders_sbuf, "%d",
+ curproclock->tag.myProc->pid);
+ first_holder = false;
+ }
+ else
+ appendStringInfo(&lock_holders_sbuf, ", %d",
+ curproclock->tag.myProc->pid);
+
+ lockHoldersNum++;
+ }
+ }
+
+ LWLockRelease(partitionLock);
+
+ if (deadlock_state == DS_SOFT_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d avoided deadlock for %s on %s by rearranging queue order after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * This message is a bit redundant with the error that will be
+ * reported subsequently, but in some cases the error report
+ * might not make it to the log (eg, if it's caught by an
+ * exception handler), and we want to ensure all long-wait
+ * events get logged.
+ */
+ ereport(LOG,
+ (errmsg("process %d detected deadlock while waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ if (myWaitStatus == PROC_WAIT_STATUS_WAITING)
+ ereport(LOG,
+ (errmsg("process %d still waiting for %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ else if (myWaitStatus == PROC_WAIT_STATUS_OK)
+ ereport(LOG,
+ (errmsg("process %d acquired %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs)));
+ else
+ {
+ Assert(myWaitStatus == PROC_WAIT_STATUS_ERROR);
+
+ /*
+ * Currently, the deadlock checker always kicks its own
+ * process, which means that we'll only see
+ * PROC_WAIT_STATUS_ERROR when deadlock_state ==
+ * DS_HARD_DEADLOCK, and there's no need to print redundant
+ * messages. But for completeness and future-proofing, print
+ * a message if it looks like someone else kicked us off the
+ * lock.
+ */
+ if (deadlock_state != DS_HARD_DEADLOCK)
+ ereport(LOG,
+ (errmsg("process %d failed to acquire %s on %s after %ld.%03d ms",
+ MyProcPid, modename, buf.data, msecs, usecs),
+ (errdetail_log_plural("Process holding the lock: %s. Wait queue: %s.",
+ "Processes holding the lock: %s. Wait queue: %s.",
+ lockHoldersNum, lock_holders_sbuf.data, lock_waiters_sbuf.data))));
+ }
+
+ /*
+ * At this point we might still need to wait for the lock. Reset
+ * state so we don't print the above messages again.
+ */
+ deadlock_state = DS_NO_DEADLOCK;
+
+ pfree(buf.data);
+ pfree(lock_holders_sbuf.data);
+ pfree(lock_waiters_sbuf.data);
+ }
+ } while (myWaitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /*
+ * Disable the timers, if they are still running. As in LockErrorCleanup,
+ * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has
+ * already caused QueryCancelPending to become set, we want the cancel to
+ * be reported as a lock timeout, not a user cancel.
+ */
+ if (!InHotStandby)
+ {
+ if (LockTimeout > 0)
+ {
+ DisableTimeoutParams timeouts[2];
+
+ timeouts[0].id = DEADLOCK_TIMEOUT;
+ timeouts[0].keep_indicator = false;
+ timeouts[1].id = LOCK_TIMEOUT;
+ timeouts[1].keep_indicator = true;
+ disable_timeouts(timeouts, 2);
+ }
+ else
+ disable_timeout(DEADLOCK_TIMEOUT, false);
+ }
+
+ /*
+ * Emit the log message if recovery conflict on lock was resolved but the
+ * startup process waited longer than deadlock_timeout for it.
+ */
+ if (InHotStandby && logged_recovery_conflict)
+ LogRecoveryConflict(PROCSIG_RECOVERY_CONFLICT_LOCK,
+ standbyWaitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /*
+ * Re-acquire the lock table's partition lock. We have to do this to hold
+ * off cancel/die interrupts before we can mess with lockAwaited (else we
+ * might have a missed or duplicated locallock update).
+ */
+ LWLockAcquire(partitionLock, LW_EXCLUSIVE);
+
+ /*
+ * We no longer want LockErrorCleanup to do anything.
+ */
+ lockAwaited = NULL;
+
+ /*
+ * If we got the lock, be sure to remember it in the locallock table.
+ */
+ if (MyProc->waitStatus == PROC_WAIT_STATUS_OK)
+ GrantAwaitedLock();
+
+ /*
+ * We don't have to do anything else, because the awaker did all the
+ * necessary update of the lock table and MyProc.
+ */
+ return MyProc->waitStatus;
+}
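The wait-duration figures in the log messages above come from splitting the TimestampDifference() result (whole seconds plus a leftover microsecond count) into milliseconds and a three-digit microsecond remainder. A minimal standalone sketch of that arithmetic, using made-up sample values instead of a real timestamp pair:

#include <stdio.h>

int
main(void)
{
	/* pretend TimestampDifference() reported 1 s and 234567 us */
	long	secs = 1;
	int		usecs = 234567;
	long	msecs;

	msecs = secs * 1000 + usecs / 1000;	/* 1234 ms */
	usecs = usecs % 1000;			/* 567 us remainder */

	/* same "%ld.%03d ms" format as the ereport() calls above */
	printf("after %ld.%03d ms\n", msecs, usecs);
	return 0;
}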
+
+
+/*
+ * ProcWakeup -- wake up a process by setting its latch.
+ *
+ * Also remove the process from the wait queue and set its links invalid.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ *
+ * XXX: presently, this code is only used for the "success" case, and only
+ * works correctly for that case. To clean up in the failure case, we would need
+ * to twiddle the lock's request counts too --- see RemoveFromWaitQueue.
+ * Hence, in practice the waitStatus parameter must be PROC_WAIT_STATUS_OK.
+ */
+void
+ProcWakeup(PGPROC *proc, ProcWaitStatus waitStatus)
+{
+ if (dlist_node_is_detached(&proc->links))
+ return;
+
+ Assert(proc->waitStatus == PROC_WAIT_STATUS_WAITING);
+
+ /* Remove process from wait queue */
+ dclist_delete_from_thoroughly(&proc->waitLock->waitProcs, &proc->links);
+
+ /* Clean up process' state and pass it the ok/fail signal */
+ proc->waitLock = NULL;
+ proc->waitProcLock = NULL;
+ proc->waitStatus = waitStatus;
+ pg_atomic_write_u64(&MyProc->waitStart, 0);
+
+ /* And awaken it */
+ SetLatch(&proc->procLatch);
+}
+
+/*
+ * ProcLockWakeup -- routine for waking up processes when a lock is
+ * released (or a prior waiter is aborted). Scan all waiters
+ * for lock, waken any that are no longer blocked.
+ *
+ * The appropriate lock partition lock must be held by caller.
+ */
+void
+ProcLockWakeup(LockMethod lockMethodTable, LOCK *lock)
+{
+ dclist_head *waitQueue = &lock->waitProcs;
+ LOCKMASK aheadRequests = 0;
+ dlist_mutable_iter miter;
+
+ if (dclist_is_empty(waitQueue))
+ return;
+
+ dclist_foreach_modify(miter, waitQueue)
+ {
+ PGPROC *proc = dlist_container(PGPROC, links, miter.cur);
+ LOCKMODE lockmode = proc->waitLockMode;
+
+ /*
+ * Waken if (a) doesn't conflict with requests of earlier waiters, and
+ * (b) doesn't conflict with already-held locks.
+ */
+ if ((lockMethodTable->conflictTab[lockmode] & aheadRequests) == 0 &&
+ !LockCheckConflicts(lockMethodTable, lockmode, lock,
+ proc->waitProcLock))
+ {
+ /* OK to waken */
+ GrantLock(lock, proc->waitProcLock, lockmode);
+ /* removes proc from the lock's waiting process queue */
+ ProcWakeup(proc, PROC_WAIT_STATUS_OK);
+ }
+ else
+ {
+ /*
+ * Lock conflicts: Don't wake, but remember requested mode for
+ * later checks.
+ */
+ aheadRequests |= LOCKBIT_ON(lockmode);
+ }
+ }
+}
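To see why ProcLockWakeup() accumulates aheadRequests only for the waiters it skips, it helps to replay the loop with a toy conflict table in which mode 1 behaves like a shared lock and mode 2 like an exclusive lock. Everything in the sketch below is invented for illustration (the table, the mode numbers, the held-lock mask), and LockCheckConflicts() is reduced to a plain bitmask test:

#include <stdio.h>

#define LOCKBIT_ON(mode) (1 << (mode))

/* toy conflict table, NOT the real lock.c one */
static const int conflictTab[] = {
	0,
	LOCKBIT_ON(2),						/* mode 1: "shared" */
	LOCKBIT_ON(1) | LOCKBIT_ON(2),		/* mode 2: "exclusive" */
};

int
main(void)
{
	int		heldMask = LOCKBIT_ON(1);	/* someone already holds mode 1 */
	int		aheadRequests = 0;
	int		waiterModes[] = {2, 1};		/* queue order: exclusive, then shared */

	for (int i = 0; i < 2; i++)
	{
		int		mode = waiterModes[i];

		if ((conflictTab[mode] & aheadRequests) == 0 &&
			(conflictTab[mode] & heldMask) == 0)
			printf("waiter %d (mode %d): wake\n", i, mode);
		else
		{
			printf("waiter %d (mode %d): keep waiting\n", i, mode);
			aheadRequests |= LOCKBIT_ON(mode);
		}
	}
	return 0;
}

The shared waiter stays queued even though it does not conflict with the granted lock, because waking it would let it jump ahead of the exclusive waiter in front of it.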
+
+/*
+ * CheckDeadLock
+ *
+ * We only get to this routine if DEADLOCK_TIMEOUT fired while waiting for a
+ * lock to be released by some other process. Check if there's a deadlock; if
+ * not, just return. (But signal ProcSleep to log a message, if
+ * log_lock_waits is true.) If we have a real deadlock, remove ourselves from
+ * the lock's wait queue and signal an error to ProcSleep.
+ */
+static void
+CheckDeadLock(void)
+{
+ int i;
+
+ /*
+ * Acquire exclusive lock on the entire shared lock data structures. Must
+ * grab LWLocks in partition-number order to avoid LWLock deadlock.
+ *
+ * Note that the deadlock check interrupt had better not be enabled
+ * anywhere that this process itself holds lock partition locks, else this
+ * will wait forever. Also note that LWLockAcquire creates a critical
+ * section, so that this routine cannot be interrupted by cancel/die
+ * interrupts.
+ */
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ LWLockAcquire(LockHashPartitionLockByIndex(i), LW_EXCLUSIVE);
+
+ /*
+ * Check to see if we've been awoken by anyone in the interim.
+ *
+ * If we have, we can return and resume our transaction -- happy day.
+ * Before we are awoken the process releasing the lock grants it to us so
+ * we know that we don't have to wait anymore.
+ *
+ * We check by looking to see if we've been unlinked from the wait queue.
+ * This is safe because we hold the lock partition lock.
+ */
+ if (MyProc->links.prev == NULL ||
+ MyProc->links.next == NULL)
+ goto check_done;
+
+#ifdef LOCK_DEBUG
+ if (Debug_deadlocks)
+ DumpAllLocks();
+#endif
+
+ /* Run the deadlock check, and set deadlock_state for use by ProcSleep */
+ deadlock_state = DeadLockCheck(MyProc);
+
+ if (deadlock_state == DS_HARD_DEADLOCK)
+ {
+ /*
+ * Oops. We have a deadlock.
+ *
+ * Get this process out of wait state. (Note: we could do this more
+ * efficiently by relying on lockAwaited, but use this coding to
+ * preserve the flexibility to kill some other transaction than the
+ * one detecting the deadlock.)
+ *
+ * RemoveFromWaitQueue sets MyProc->waitStatus to
+ * PROC_WAIT_STATUS_ERROR, so ProcSleep will report an error after we
+ * return from the signal handler.
+ */
+ Assert(MyProc->waitLock != NULL);
+ RemoveFromWaitQueue(MyProc, LockTagHashCode(&(MyProc->waitLock->tag)));
+
+ /*
+ * We're done here. Transaction abort caused by the error that
+ * ProcSleep will raise will cause any other locks we hold to be
+ * released, thus allowing other processes to wake up; we don't need
+ * to do that here. NOTE: an exception is that releasing locks we
+ * hold doesn't consider the possibility of waiters that were blocked
+ * behind us on the lock we just failed to get, and might now be
+ * wakable because we're not in front of them anymore. However,
+ * RemoveFromWaitQueue took care of waking up any such processes.
+ */
+ }
+
+ /*
+ * And release locks. We do this in reverse order for two reasons: (1)
+ * Anyone else who needs more than one of the locks will be trying to lock
+ * them in increasing order; we don't want to release the other process
+ * until it can get all the locks it needs. (2) This avoids O(N^2)
+ * behavior inside LWLockRelease.
+ */
+check_done:
+ for (i = NUM_LOCK_PARTITIONS; --i >= 0;)
+ LWLockRelease(LockHashPartitionLockByIndex(i));
+}
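The partition-lock discipline used here, acquiring every partition in ascending index order and releasing in descending order, is a general recipe for taking a whole set of ordered locks without risking lock-order deadlock against another process doing the same thing. A minimal sketch of just that loop structure, with POSIX mutexes standing in for LWLocks (the second rationale above, about LWLockRelease's held-locks array, is specific to LWLocks and does not carry over):

#include <pthread.h>

#define NUM_PARTITIONS 16

static pthread_mutex_t partition_lock[NUM_PARTITIONS];

/* take every partition lock in ascending index order... */
static void
lock_all_partitions(void)
{
	for (int i = 0; i < NUM_PARTITIONS; i++)
		pthread_mutex_lock(&partition_lock[i]);
}

/* ...and release them in descending order, mirroring the loops above */
static void
unlock_all_partitions(void)
{
	for (int i = NUM_PARTITIONS; --i >= 0;)
		pthread_mutex_unlock(&partition_lock[i]);
}

int
main(void)
{
	for (int i = 0; i < NUM_PARTITIONS; i++)
		pthread_mutex_init(&partition_lock[i], NULL);

	lock_all_partitions();
	unlock_all_partitions();
	return 0;
}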
+
+/*
+ * CheckDeadLockAlert - Handle the expiry of deadlock_timeout.
+ *
+ * NB: Runs inside a signal handler, be careful.
+ */
+void
+CheckDeadLockAlert(void)
+{
+ int save_errno = errno;
+
+ got_deadlock_timeout = true;
+
+ /*
+ * Have to set the latch again, even if handle_sig_alarm already did. Back
+ * then got_deadlock_timeout wasn't yet set... It's unlikely that this
+ * ever would be a problem, but setting a set latch again is cheap.
+ *
+ * Note that, when this function runs inside procsignal_sigusr1_handler(),
+ * the handler function sets the latch again after the latch is set here.
+ */
+ SetLatch(MyLatch);
+ errno = save_errno;
+}
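CheckDeadLockAlert() follows the standard signal-handler recipe: set a flag, wake the main loop, and restore errno because the wakeup primitive may clobber it. A self-contained sketch of the same shape, with a self-pipe standing in for the process latch and SIGALRM standing in for the timeout machinery (both substitutions are assumptions made for the example):

#include <errno.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t got_timeout = 0;
static int	wakeup_pipe[2];			/* self-pipe standing in for MyLatch */

static void
timeout_handler(int signo)
{
	int		save_errno = errno;		/* write() below may clobber errno */

	(void) signo;
	got_timeout = 1;
	(void) write(wakeup_pipe[1], "x", 1);	/* async-signal-safe wakeup */

	errno = save_errno;
}

int
main(void)
{
	struct sigaction sa;
	char	c;

	(void) pipe(wakeup_pipe);

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = timeout_handler;
	sigaction(SIGALRM, &sa, NULL);

	alarm(1);					/* arm a one-second "deadlock timeout" */

	while (!got_timeout)
		(void) read(wakeup_pipe[0], &c, 1);	/* sleep until the handler pokes us */

	return 0;
}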
+
+/*
+ * ProcWaitForSignal - wait for a signal from another backend.
+ *
+ * As this uses the generic process latch the caller has to be robust against
+ * unrelated wakeups: Always check that the desired state has occurred, and
+ * wait again if not.
+ */
+void
+ProcWaitForSignal(uint32 wait_event_info)
+{
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ wait_event_info);
+ ResetLatch(MyLatch);
+ CHECK_FOR_INTERRUPTS();
+}
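A caller therefore has to treat every return from ProcWaitForSignal() as potentially spurious and re-test its own condition. A hedged sketch of the expected calling pattern; my_condition_is_met() and WAIT_EVENT_MY_WAIT are placeholders invented for the example, not real backend symbols:

static void
WaitForMyCondition(void)
{
	for (;;)
	{
		/* re-check the shared state every time we wake up */
		if (my_condition_is_met())
			break;

		/* may return because some unrelated event set our latch */
		ProcWaitForSignal(WAIT_EVENT_MY_WAIT);
	}
}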
+
+/*
+ * ProcSendSignal - set the latch of a backend identified by pgprocno
+ */
+void
+ProcSendSignal(int pgprocno)
+{
+ if (pgprocno < 0 || pgprocno >= ProcGlobal->allProcCount)
+ elog(ERROR, "pgprocno out of range");
+
+ SetLatch(&ProcGlobal->allProcs[pgprocno].procLatch);
+}
+
+/*
+ * BecomeLockGroupLeader - designate process as lock group leader
+ *
+ * Once this function has returned, other processes can join the lock group
+ * by calling BecomeLockGroupMember.
+ */
+void
+BecomeLockGroupLeader(void)
+{
+ LWLock *leader_lwlock;
+
+ /* If we already did it, we don't need to do it again. */
+ if (MyProc->lockGroupLeader == MyProc)
+ return;
+
+ /* We had better not be a follower. */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* Create single-member group, containing only ourselves. */
+ leader_lwlock = LockHashPartitionLockByProc(MyProc);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+ MyProc->lockGroupLeader = MyProc;
+ dlist_push_head(&MyProc->lockGroupMembers, &MyProc->lockGroupLink);
+ LWLockRelease(leader_lwlock);
+}
+
+/*
+ * BecomeLockGroupMember - designate process as lock group member
+ *
+ * This is pretty straightforward except for the possibility that the leader
+ * whose group we're trying to join might exit before we manage to do so;
+ * and the PGPROC might get recycled for an unrelated process. To avoid
+ * that, we require the caller to pass the PID of the intended PGPROC as
+ * an interlock. Returns true if we successfully join the intended lock
+ * group, and false if not.
+ */
+bool
+BecomeLockGroupMember(PGPROC *leader, int pid)
+{
+ LWLock *leader_lwlock;
+ bool ok = false;
+
+ /* Group leader can't become member of group */
+ Assert(MyProc != leader);
+
+ /* Can't already be a member of a group */
+ Assert(MyProc->lockGroupLeader == NULL);
+
+ /* PID must be valid. */
+ Assert(pid != 0);
+
+ /*
+ * Get lock protecting the group fields. Note LockHashPartitionLockByProc
+ * accesses leader->pgprocno in a PGPROC that might be free. This is safe
+ * because all PGPROCs' pgprocno fields are set during shared memory
+ * initialization and never change thereafter; so we will acquire the
+ * correct lock even if the leader PGPROC is in process of being recycled.
+ */
+ leader_lwlock = LockHashPartitionLockByProc(leader);
+ LWLockAcquire(leader_lwlock, LW_EXCLUSIVE);
+
+ /* Is this the leader we're looking for? */
+ if (leader->pid == pid && leader->lockGroupLeader == leader)
+ {
+ /* OK, join the group */
+ ok = true;
+ MyProc->lockGroupLeader = leader;
+ dlist_push_tail(&leader->lockGroupMembers, &MyProc->lockGroupLink);
+ }
+ LWLockRelease(leader_lwlock);
+
+ return ok;
+}
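Putting the two lock-group calls together: the leader registers itself before launching any workers, then passes its PGPROC pointer and PID to each worker, which uses the PID as the interlock when joining. The sketch below is modeled on the parallel-query usage but invents a SharedHandshake struct; how that struct reaches the worker (shared memory, in practice) is outside the sketch:

/* hypothetical handshake data the leader publishes for its workers */
typedef struct SharedHandshake
{
	PGPROC	   *leader_pgproc;
	int			leader_pid;
} SharedHandshake;

/* in the leader, before any worker is launched */
static void
leader_setup(SharedHandshake *hs)
{
	BecomeLockGroupLeader();
	hs->leader_pgproc = MyProc;
	hs->leader_pid = MyProcPid;
}

/* in each worker, early in its startup */
static void
worker_setup(SharedHandshake *hs)
{
	if (!BecomeLockGroupMember(hs->leader_pgproc, hs->leader_pid))
		ereport(ERROR,
				(errmsg("lock group leader exited before this worker could join")));
}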
diff --git a/src/backend/storage/lmgr/s_lock.c b/src/backend/storage/lmgr/s_lock.c
new file mode 100644
index 0000000..327ac64
--- /dev/null
+++ b/src/backend/storage/lmgr/s_lock.c
@@ -0,0 +1,324 @@
+/*-------------------------------------------------------------------------
+ *
+ * s_lock.c
+ * Hardware-dependent implementation of spinlocks.
+ *
+ * When waiting for a contended spinlock we loop tightly for a while, then
+ * delay using pg_usleep() and try again. Preferably, "awhile" should be a
+ * small multiple of the maximum time we expect a spinlock to be held. 100
+ * iterations seems about right as an initial guess. However, on a
+ * uniprocessor the loop is a waste of cycles, while in a multi-CPU scenario
+ * it's usually better to spin a bit longer than to call the kernel, so we try
+ * to adapt the spin loop count depending on whether we seem to be in a
+ * uniprocessor or multiprocessor.
+ *
+ * Note: you might think MIN_SPINS_PER_DELAY should be just 1, but you'd
+ * be wrong; there are platforms where that can result in a "stuck
+ * spinlock" failure. This has been seen particularly on Alphas; it seems
+ * that the first TAS after returning from kernel space will always fail
+ * on that hardware.
+ *
+ * Once we do decide to block, we use randomly increasing pg_usleep()
+ * delays. The first delay is 1 msec, then the delay randomly increases to
+ * about one second, after which we reset to 1 msec and start again. The
+ * idea here is that in the presence of heavy contention we need to
+ * increase the delay, else the spinlock holder may never get to run and
+ * release the lock. (Consider situation where spinlock holder has been
+ * nice'd down in priority by the scheduler --- it will not get scheduled
+ * until all would-be acquirers are sleeping, so if we always use a 1-msec
+ * sleep, there is a real possibility of starvation.) But we can't just
+ * clamp the delay to an upper bound, else it would take a long time to
+ * make a reasonable number of tries.
+ *
+ * We time out and declare error after NUM_DELAYS delays (thus, exactly
+ * that many tries). With the given settings, this will usually take 2 or
+ * so minutes. It seems better to fix the total number of tries (and thus
+ * the probability of unintended failure) than to fix the total time
+ * spent.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/s_lock.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <time.h>
+#include <unistd.h>
+
+#include "common/pg_prng.h"
+#include "port/atomics.h"
+#include "storage/s_lock.h"
+#include "utils/wait_event.h"
+
+#define MIN_SPINS_PER_DELAY 10
+#define MAX_SPINS_PER_DELAY 1000
+#define NUM_DELAYS 1000
+#define MIN_DELAY_USEC 1000L
+#define MAX_DELAY_USEC 1000000L
+
+
+slock_t dummy_spinlock;
+
+static int spins_per_delay = DEFAULT_SPINS_PER_DELAY;
+
+
+/*
+ * s_lock_stuck() - complain about a stuck spinlock
+ */
+static void
+s_lock_stuck(const char *file, int line, const char *func)
+{
+ if (!func)
+ func = "(unknown)";
+#if defined(S_LOCK_TEST)
+ fprintf(stderr,
+ "\nStuck spinlock detected at %s, %s:%d.\n",
+ func, file, line);
+ exit(1);
+#else
+ elog(PANIC, "stuck spinlock detected at %s, %s:%d",
+ func, file, line);
+#endif
+}
+
+/*
+ * s_lock(lock) - platform-independent portion of waiting for a spinlock.
+ */
+int
+s_lock(volatile slock_t *lock, const char *file, int line, const char *func)
+{
+ SpinDelayStatus delayStatus;
+
+ init_spin_delay(&delayStatus, file, line, func);
+
+ while (TAS_SPIN(lock))
+ {
+ perform_spin_delay(&delayStatus);
+ }
+
+ finish_spin_delay(&delayStatus);
+
+ return delayStatus.delays;
+}
+
+#ifdef USE_DEFAULT_S_UNLOCK
+void
+s_unlock(volatile slock_t *lock)
+{
+#ifdef TAS_ACTIVE_WORD
+ /* HP's PA-RISC */
+ *TAS_ACTIVE_WORD(lock) = -1;
+#else
+ *lock = 0;
+#endif
+}
+#endif
+
+/*
+ * Wait while spinning on a contended spinlock.
+ */
+void
+perform_spin_delay(SpinDelayStatus *status)
+{
+ /* CPU-specific delay each time through the loop */
+ SPIN_DELAY();
+
+ /* Block the process every spins_per_delay tries */
+ if (++(status->spins) >= spins_per_delay)
+ {
+ if (++(status->delays) > NUM_DELAYS)
+ s_lock_stuck(status->file, status->line, status->func);
+
+ if (status->cur_delay == 0) /* first time to delay? */
+ status->cur_delay = MIN_DELAY_USEC;
+
+ /*
+ * Once we start sleeping, the overhead of reporting a wait event is
+ * justified. Actively spinning easily stands out in profilers, but
+ * sleeping with an exponential backoff is harder to spot...
+ *
+ * We might want to report something more granular at some point, but
+ * this is better than nothing.
+ */
+ pgstat_report_wait_start(WAIT_EVENT_SPIN_DELAY);
+ pg_usleep(status->cur_delay);
+ pgstat_report_wait_end();
+
+#if defined(S_LOCK_TEST)
+ fprintf(stdout, "*");
+ fflush(stdout);
+#endif
+
+ /* increase delay by a random fraction between 1X and 2X */
+ status->cur_delay += (int) (status->cur_delay *
+ pg_prng_double(&pg_global_prng_state) + 0.5);
+ /* wrap back to minimum delay when max is exceeded */
+ if (status->cur_delay > MAX_DELAY_USEC)
+ status->cur_delay = MIN_DELAY_USEC;
+
+ status->spins = 0;
+ }
+}
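The backoff arithmetic at the bottom of perform_spin_delay() can be watched on its own: each sleep grows by a random factor between 1x and 2x and wraps back to the minimum once it would exceed the cap. A standalone sketch that substitutes rand() for pg_prng (seed and iteration count are arbitrary):

#include <stdio.h>
#include <stdlib.h>

#define MIN_DELAY_USEC	1000L
#define MAX_DELAY_USEC	1000000L

int
main(void)
{
	long	cur_delay = MIN_DELAY_USEC;

	srand(42);
	for (int i = 0; i < 15; i++)
	{
		printf("sleep %ld us\n", cur_delay);

		/* grow by a random fraction between 1x and 2x, as above */
		cur_delay += (long) (cur_delay * ((double) rand() / RAND_MAX) + 0.5);
		if (cur_delay > MAX_DELAY_USEC)
			cur_delay = MIN_DELAY_USEC;		/* wrap back to the minimum */
	}
	return 0;
}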
+
+/*
+ * After acquiring a spinlock, update estimates about how long to loop.
+ *
+ * If we were able to acquire the lock without delaying, it's a good
+ * indication we are in a multiprocessor. If we had to delay, it's a sign
+ * (but not a sure thing) that we are in a uniprocessor. Hence, we
+ * decrement spins_per_delay slowly when we had to delay, and increase it
+ * rapidly when we didn't. It's expected that spins_per_delay will
+ * converge to the minimum value on a uniprocessor and to the maximum
+ * value on a multiprocessor.
+ *
+ * Note: spins_per_delay is local within our current process. We want to
+ * average these observations across multiple backends, since it's
+ * relatively rare for this function to even get entered, and so a single
+ * backend might not live long enough to converge on a good value. That
+ * is handled by the two routines below.
+ */
+void
+finish_spin_delay(SpinDelayStatus *status)
+{
+ if (status->cur_delay == 0)
+ {
+ /* we never had to delay */
+ if (spins_per_delay < MAX_SPINS_PER_DELAY)
+ spins_per_delay = Min(spins_per_delay + 100, MAX_SPINS_PER_DELAY);
+ }
+ else
+ {
+ if (spins_per_delay > MIN_SPINS_PER_DELAY)
+ spins_per_delay = Max(spins_per_delay - 1, MIN_SPINS_PER_DELAY);
+ }
+}
+
+/*
+ * Set local copy of spins_per_delay during backend startup.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+void
+set_spins_per_delay(int shared_spins_per_delay)
+{
+ spins_per_delay = shared_spins_per_delay;
+}
+
+/*
+ * Update shared estimate of spins_per_delay during backend exit.
+ *
+ * NB: this has to be pretty fast as it is called while holding a spinlock
+ */
+int
+update_spins_per_delay(int shared_spins_per_delay)
+{
+ /*
+ * We use an exponential moving average with a relatively slow adaptation
+ * rate, so that noise in any one backend's result won't affect the shared
+ * value too much. As long as both inputs are within the allowed range,
+ * the result must be too, so we need not worry about clamping the result.
+ *
+ * We deliberately truncate rather than rounding; this is so that single
+ * adjustments inside a backend can affect the shared estimate (see the
+ * asymmetric adjustment rules above).
+ */
+ return (shared_spins_per_delay * 15 + spins_per_delay) / 16;
+}
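The (shared * 15 + local) / 16 expression is a plain exponential moving average: every exiting backend pulls the shared estimate one-sixteenth of the way toward its own converged value. A tiny worked example with invented starting numbers:

#include <stdio.h>

int
main(void)
{
	int		shared = 500;		/* current shared estimate */
	int		local = 1000;		/* this backend converged to the maximum */

	/* each simulated backend exit nudges the shared value toward 'local' */
	for (int i = 0; i < 5; i++)
	{
		shared = (shared * 15 + local) / 16;
		printf("after exit %d: %d\n", i + 1, shared);
	}
	return 0;
}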
+
+
+/*****************************************************************************/
+#if defined(S_LOCK_TEST)
+
+/*
+ * test program for verifying a port's spinlock support.
+ */
+
+struct test_lock_struct
+{
+ char pad1;
+ slock_t lock;
+ char pad2;
+};
+
+volatile struct test_lock_struct test_lock;
+
+int
+main()
+{
+ pg_prng_seed(&pg_global_prng_state, (uint64) time(NULL));
+
+ test_lock.pad1 = test_lock.pad2 = 0x44;
+
+ S_INIT_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not initialized\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+ }
+
+ S_UNLOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (!S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not unlocked\n");
+ return 1;
+ }
+
+ S_LOCK(&test_lock.lock);
+
+ if (test_lock.pad1 != 0x44 || test_lock.pad2 != 0x44)
+ {
+ printf("S_LOCK_TEST: failed, declared datatype is wrong size\n");
+ return 1;
+ }
+
+ if (S_LOCK_FREE(&test_lock.lock))
+ {
+ printf("S_LOCK_TEST: failed, lock not re-locked\n");
+ return 1;
+ }
+
+ printf("S_LOCK_TEST: this will print %d stars and then\n", NUM_DELAYS);
+ printf(" exit with a 'stuck spinlock' message\n");
+ printf(" if S_LOCK() and TAS() are working.\n");
+ fflush(stdout);
+
+ s_lock(&test_lock.lock, __FILE__, __LINE__, __func__);
+
+ printf("S_LOCK_TEST: failed, lock not locked\n");
+ return 1;
+}
+
+#endif /* S_LOCK_TEST */
diff --git a/src/backend/storage/lmgr/spin.c b/src/backend/storage/lmgr/spin.c
new file mode 100644
index 0000000..6052779
--- /dev/null
+++ b/src/backend/storage/lmgr/spin.c
@@ -0,0 +1,180 @@
+/*-------------------------------------------------------------------------
+ *
+ * spin.c
+ * Hardware-independent implementation of spinlocks.
+ *
+ *
+ * For machines that have test-and-set (TAS) instructions, s_lock.h/.c
+ * define the spinlock implementation. This file contains only a stub
+ * implementation for spinlocks using PGSemaphores. Unless semaphores
+ * are implemented in a way that doesn't involve a kernel call, this
+ * is too slow to be very useful :-(
+ *
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/spin.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/pg_sema.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * No TAS, so spinlocks are implemented as PGSemaphores.
+ */
+
+#ifndef HAVE_ATOMICS
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES + NUM_ATOMICS_SEMAPHORES)
+#else
+#define NUM_EMULATION_SEMAPHORES (NUM_SPINLOCK_SEMAPHORES)
+#endif /* HAVE_ATOMICS */
+
+PGSemaphore *SpinlockSemaArray;
+
+#else /* !HAVE_SPINLOCKS */
+
+#define NUM_EMULATION_SEMAPHORES 0
+
+#endif /* HAVE_SPINLOCKS */
+
+/*
+ * Report the amount of shared memory needed to store semaphores for spinlock
+ * support.
+ */
+Size
+SpinlockSemaSize(void)
+{
+ return NUM_EMULATION_SEMAPHORES * sizeof(PGSemaphore);
+}
+
+/*
+ * Report number of semaphores needed to support spinlocks.
+ */
+int
+SpinlockSemas(void)
+{
+ return NUM_EMULATION_SEMAPHORES;
+}
+
+#ifndef HAVE_SPINLOCKS
+
+/*
+ * Initialize spinlock emulation.
+ *
+ * This must be called after PGReserveSemaphores().
+ */
+void
+SpinlockSemaInit(void)
+{
+ PGSemaphore *spinsemas;
+ int nsemas = SpinlockSemas();
+ int i;
+
+ /*
+ * We must use ShmemAllocUnlocked(), since the spinlock protecting
+ * ShmemAlloc() obviously can't be ready yet.
+ */
+ spinsemas = (PGSemaphore *) ShmemAllocUnlocked(SpinlockSemaSize());
+ for (i = 0; i < nsemas; ++i)
+ spinsemas[i] = PGSemaphoreCreate();
+ SpinlockSemaArray = spinsemas;
+}
+
+/*
+ * s_lock.h hardware-spinlock emulation using semaphores
+ *
+ * We map all spinlocks onto NUM_EMULATION_SEMAPHORES semaphores. It's okay to
+ * map multiple spinlocks onto one semaphore because no process should ever
+ * hold more than one at a time. We just need enough semaphores so that we
+ * aren't adding too much extra contention from that.
+ *
+ * There is one exception to the restriction of only holding one spinlock at a
+ * time, which is that it's ok if emulated atomic operations are nested inside
+ * spinlocks. To avoid the danger of spinlocks and atomics using the same sema,
+ * we make sure "normal" spinlocks and atomics backed by spinlocks use
+ * distinct semaphores (see the nested argument to s_init_lock_sema).
+ *
+ * slock_t is just an int for this implementation; it holds the spinlock
+ * number from 1..NUM_EMULATION_SEMAPHORES. We intentionally ensure that 0
+ * is not a valid value, so that testing with this code can help find
+ * failures to initialize spinlocks.
+ */
+
+static inline void
+s_check_valid(int lockndx)
+{
+ if (unlikely(lockndx <= 0 || lockndx > NUM_EMULATION_SEMAPHORES))
+ elog(ERROR, "invalid spinlock number: %d", lockndx);
+}
+
+void
+s_init_lock_sema(volatile slock_t *lock, bool nested)
+{
+ static uint32 counter = 0;
+ uint32 offset;
+ uint32 sema_total;
+ uint32 idx;
+
+ if (nested)
+ {
+ /*
+ * To allow nesting atomics inside spinlocked sections, use a
+ * different spinlock. See comment above.
+ */
+ offset = 1 + NUM_SPINLOCK_SEMAPHORES;
+ sema_total = NUM_ATOMICS_SEMAPHORES;
+ }
+ else
+ {
+ offset = 1;
+ sema_total = NUM_SPINLOCK_SEMAPHORES;
+ }
+
+ idx = (counter++ % sema_total) + offset;
+
+ /* double check we did things correctly */
+ s_check_valid(idx);
+
+ *lock = idx;
+}
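The index arithmetic above simply deals out semaphore numbers round-robin within two disjoint ranges, one for ordinary spinlocks and one for spinlock-backed atomics. A standalone sketch of that mapping; the 128/64 split assumes the stock NUM_SPINLOCK_SEMAPHORES and NUM_ATOMICS_SEMAPHORES values and may differ in a modified build:

#include <stdio.h>

#define NUM_SPINLOCK_SEMAPHORES		128
#define NUM_ATOMICS_SEMAPHORES		64

int
main(void)
{
	unsigned int counter = 0;

	/* ordinary spinlocks land on semaphores 1..128, round robin */
	for (int i = 0; i < 4; i++)
		printf("spinlock %d -> sema %u\n", i,
			   (counter++ % NUM_SPINLOCK_SEMAPHORES) + 1);

	/* nested (atomics) spinlocks use the disjoint range 129..192 */
	counter = 0;
	for (int i = 0; i < 4; i++)
		printf("atomic %d -> sema %u\n", i,
			   (counter++ % NUM_ATOMICS_SEMAPHORES) + 1 + NUM_SPINLOCK_SEMAPHORES);

	return 0;
}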
+
+void
+s_unlock_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ PGSemaphoreUnlock(SpinlockSemaArray[lockndx - 1]);
+}
+
+bool
+s_lock_free_sema(volatile slock_t *lock)
+{
+ /* We don't currently use S_LOCK_FREE anyway */
+ elog(ERROR, "spin.c does not support S_LOCK_FREE()");
+ return false;
+}
+
+int
+tas_sema(volatile slock_t *lock)
+{
+ int lockndx = *lock;
+
+ s_check_valid(lockndx);
+
+ /* Note that TAS macros return 0 if *success* */
+ return !PGSemaphoreTryLock(SpinlockSemaArray[lockndx - 1]);
+}
+
+#endif /* !HAVE_SPINLOCKS */
diff --git a/src/backend/storage/meson.build b/src/backend/storage/meson.build
new file mode 100644
index 0000000..6ea9faa
--- /dev/null
+++ b/src/backend/storage/meson.build
@@ -0,0 +1,11 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+subdir('buffer')
+subdir('file')
+subdir('freespace')
+subdir('ipc')
+subdir('large_object')
+subdir('lmgr')
+subdir('page')
+subdir('smgr')
+subdir('sync')
diff --git a/src/backend/storage/page/Makefile b/src/backend/storage/page/Makefile
new file mode 100644
index 0000000..da539b1
--- /dev/null
+++ b/src/backend/storage/page/Makefile
@@ -0,0 +1,23 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/page
+#
+# IDENTIFICATION
+# src/backend/storage/page/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/page
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ bufpage.o \
+ checksum.o \
+ itemptr.o
+
+include $(top_srcdir)/src/backend/common.mk
+
+# Provide special optimization flags for checksum.c
+checksum.o: CFLAGS += ${CFLAGS_UNROLL_LOOPS} ${CFLAGS_VECTORIZE}
diff --git a/src/backend/storage/page/README b/src/backend/storage/page/README
new file mode 100644
index 0000000..e30d7ac
--- /dev/null
+++ b/src/backend/storage/page/README
@@ -0,0 +1,64 @@
+src/backend/storage/page/README
+
+Checksums
+---------
+
+Checksums on data pages are designed to detect corruption by the I/O system.
+We do not protect buffers against uncorrectable memory errors, since these
+have a very low measured incidence according to research on large server farms,
+http://www.cs.toronto.edu/~bianca/papers/sigmetrics09.pdf, discussed
+2010/12/22 on -hackers list.
+
+The current implementation requires that checksums be enabled system-wide at
+initdb time, or afterwards with the pg_checksums tool on an offline cluster.
+
+The checksum is not valid at all times on a data page!!
+The checksum is valid when the page leaves the shared pool and is checked
+when it later re-enters the shared pool as a result of I/O.
+We set the checksum on a buffer in the shared pool immediately before we
+flush the buffer. As a result we implicitly invalidate the page's checksum
+when we modify the page for a data change or even a hint. This means that
+many or even most pages in shared buffers have invalid page checksums,
+so be careful how you interpret the pd_checksum field.
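In code terms the lifecycle above reduces to two bufpage.c entry points: stamp the checksum immediately before the page is written, and verify it when the block is read back in. A condensed sketch (the buffer-manager and smgr plumbing around these calls is omitted, so treat it as illustrative rather than compilable as-is):

/* just before the page image goes to disk: */
PageSetChecksumInplace(page, blkno);	/* recompute pd_checksum for this write */

/* when the block is read back from disk: */
if (!PageIsVerifiedExtended(page, blkno,
							PIV_LOG_WARNING | PIV_REPORT_STAT))
	ereport(ERROR,
			(errcode(ERRCODE_DATA_CORRUPTED),
			 errmsg("invalid page in block %u", blkno)));

For dirty shared buffers the checksum is actually computed on a copy of the page (PageSetChecksumCopy()), so that hint bits being set concurrently cannot change the bytes while the checksum is being calculated.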
+
+That means that WAL-logged changes to a page do NOT update the page checksum,
+so full page images may not have a valid checksum. But those page images have
+the WAL CRC covering them and so are verified separately from this
+mechanism. WAL replay should not test the checksum of a full-page image.
+
+The best way to understand this is that WAL CRCs protect records entering the
+WAL stream, and data page verification protects blocks entering the shared
+buffer pool. They are similar in purpose, yet completely separate. Together
+they ensure we are able to detect errors in data re-entering
+PostgreSQL-controlled memory. Note also that the WAL checksum is a 32-bit CRC,
+whereas the page checksum is only 16-bits.
+
+Any write of a data block can cause a torn page if the write is unsuccessful.
+Full page writes, which are stored in WAL, protect us from that. Setting hint
+bits when a page is already dirty is OK because a full page write must already
+have been written for it since the last checkpoint. Setting hint bits on an
+otherwise clean page can allow torn pages; this doesn't normally matter since
+they are just hints, but when the page has checksums, then losing a few bits
+would cause the checksum to be invalid. So if we have full_page_writes = on
+and checksums enabled then we must write a WAL record specifically so that we
+record a full page image in WAL. Hint bits updates should be protected using
+MarkBufferDirtyHint(), which is responsible for writing the full-page image
+when necessary.
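At a call site the MarkBufferDirtyHint() contract looks roughly like the snippet below, a simplified version of how tuple hint bits are set; the tuple and buffer variables and the surrounding buffer-content lock are assumed to be in place:

/* set the hint bit in the shared buffer... */
tuple->t_infomask |= HEAP_XMIN_COMMITTED;

/*
 * ...then tell the buffer manager.  With checksums (or wal_log_hints)
 * enabled this may write a full-page image to WAL before it marks the
 * buffer dirty.
 */
MarkBufferDirtyHint(buffer, true);	/* true = standard page layout */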
+
+Note that when we write a page checksum we include the hopefully zeroed bytes
+that form the hole in the centre of a standard page. Thus, when we read the
+block back from storage we implicitly check that the hole is still all zeroes.
+We do this to ensure that we spot errors that could have destroyed data even
+if they haven't actually done so. Full page images stored in WAL do *not*
+check that the hole is all zero; the data in the hole is simply skipped and
+re-zeroed if the backup block is reapplied. We do this because a failure in
+WAL is a fatal error and prevents further recovery, whereas a checksum failure
+on a normal data block is a hard error but not a critical one for the server,
+even if it is a very bad thing for the user.
+
+New WAL records cannot be written during recovery, so hint bits set during
+recovery must not dirty the page if the buffer is not already dirty, when
+checksums are enabled. Systems in Hot-Standby mode may benefit from hint bits
+being set, but with checksums enabled, a page cannot be dirtied after setting a
+hint bit (due to the torn page risk). So, it must wait for full-page images
+containing the hint bit updates to arrive from the primary.
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
new file mode 100644
index 0000000..9a302dd
--- /dev/null
+++ b/src/backend/storage/page/bufpage.c
@@ -0,0 +1,1549 @@
+/*-------------------------------------------------------------------------
+ *
+ * bufpage.c
+ * POSTGRES standard buffer page code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/bufpage.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/itup.h"
+#include "access/xlog.h"
+#include "pgstat.h"
+#include "storage/checksum.h"
+#include "utils/memdebug.h"
+#include "utils/memutils.h"
+
+
+/* GUC variable */
+bool ignore_checksum_failure = false;
+
+
+/* ----------------------------------------------------------------
+ * Page support functions
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * PageInit
+ * Initializes the contents of a page.
+ * Note that we don't calculate an initial checksum here; that's not done
+ * until it's time to write.
+ */
+void
+PageInit(Page page, Size pageSize, Size specialSize)
+{
+ PageHeader p = (PageHeader) page;
+
+ specialSize = MAXALIGN(specialSize);
+
+ Assert(pageSize == BLCKSZ);
+ Assert(pageSize > specialSize + SizeOfPageHeaderData);
+
+ /* Make sure all fields of page are zero, as well as unused space */
+ MemSet(p, 0, pageSize);
+
+ p->pd_flags = 0;
+ p->pd_lower = SizeOfPageHeaderData;
+ p->pd_upper = pageSize - specialSize;
+ p->pd_special = pageSize - specialSize;
+ PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
+ /* p->pd_prune_xid = InvalidTransactionId; done by above MemSet */
+}
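A typical PageInit() caller is an access method preparing a fresh page with room for its own opaque struct in the special space; B-tree's _bt_pageinit() does this with BTPageOpaqueData. A hedged sketch using an invented opaque type:

/* hypothetical AM-specific data kept in the special space */
typedef struct MyOpaqueData
{
	uint16		flags;
	uint16		level;
} MyOpaqueData;

static void
my_page_init(Page page, Size pagesize)
{
	/* zeroes the page and reserves MAXALIGN(sizeof(MyOpaqueData)) at the end */
	PageInit(page, pagesize, sizeof(MyOpaqueData));
}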
+
+
+/*
+ * PageIsVerifiedExtended
+ * Check that the page header and checksum (if any) appear valid.
+ *
+ * This is called when a page has just been read in from disk. The idea is
+ * to cheaply detect trashed pages before we go nuts following bogus line
+ * pointers, testing invalid transaction identifiers, etc.
+ *
+ * It turns out to be necessary to allow zeroed pages here too. Even though
+ * this routine is *not* called when deliberately adding a page to a relation,
+ * there are scenarios in which a zeroed page might be found in a table.
+ * (Example: a backend extends a relation, then crashes before it can write
+ * any WAL entry about the new page. The kernel will already have the
+ * zeroed page in the file, and it will stay that way after restart.) So we
+ * allow zeroed pages here, and are careful that the page access macros
+ * treat such a page as empty and without free space. Eventually, VACUUM
+ * will clean up such a page and make it usable.
+ *
+ * If flag PIV_LOG_WARNING is set, a WARNING is logged in the event of
+ * a checksum failure.
+ *
+ * If flag PIV_REPORT_STAT is set, a checksum failure is reported directly
+ * to pgstat.
+ */
+bool
+PageIsVerifiedExtended(Page page, BlockNumber blkno, int flags)
+{
+ PageHeader p = (PageHeader) page;
+ size_t *pagebytes;
+ int i;
+ bool checksum_failure = false;
+ bool header_sane = false;
+ bool all_zeroes = false;
+ uint16 checksum = 0;
+
+ /*
+ * Don't verify page data unless the page passes basic non-zero test
+ */
+ if (!PageIsNew(page))
+ {
+ if (DataChecksumsEnabled())
+ {
+ checksum = pg_checksum_page((char *) page, blkno);
+
+ if (checksum != p->pd_checksum)
+ checksum_failure = true;
+ }
+
+ /*
+ * The following checks don't prove the header is correct, only that
+ * it looks sane enough to allow into the buffer pool. Later usage of
+ * the block can still reveal problems, which is why we offer the
+ * checksum option.
+ */
+ if ((p->pd_flags & ~PD_VALID_FLAG_BITS) == 0 &&
+ p->pd_lower <= p->pd_upper &&
+ p->pd_upper <= p->pd_special &&
+ p->pd_special <= BLCKSZ &&
+ p->pd_special == MAXALIGN(p->pd_special))
+ header_sane = true;
+
+ if (header_sane && !checksum_failure)
+ return true;
+ }
+
+ /* Check all-zeroes case */
+ all_zeroes = true;
+ pagebytes = (size_t *) page;
+ for (i = 0; i < (BLCKSZ / sizeof(size_t)); i++)
+ {
+ if (pagebytes[i] != 0)
+ {
+ all_zeroes = false;
+ break;
+ }
+ }
+
+ if (all_zeroes)
+ return true;
+
+ /*
+ * Throw a WARNING if the checksum fails, but only after we've checked for
+ * the all-zeroes case.
+ */
+ if (checksum_failure)
+ {
+ if ((flags & PIV_LOG_WARNING) != 0)
+ ereport(WARNING,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("page verification failed, calculated checksum %u but expected %u",
+ checksum, p->pd_checksum)));
+
+ if ((flags & PIV_REPORT_STAT) != 0)
+ pgstat_report_checksum_failure();
+
+ if (header_sane && ignore_checksum_failure)
+ return true;
+ }
+
+ return false;
+}
+
+
+/*
+ * PageAddItemExtended
+ *
+ * Add an item to a page. Return value is the offset at which it was
+ * inserted, or InvalidOffsetNumber if the item is not inserted for any
+ * reason. A WARNING is issued indicating the reason for the refusal.
+ *
+ * offsetNumber must be either InvalidOffsetNumber to specify finding a
+ * free line pointer, or a value between FirstOffsetNumber and one past
+ * the last existing item, to specify using that particular line pointer.
+ *
+ * If offsetNumber is valid and flag PAI_OVERWRITE is set, we just store
+ * the item at the specified offsetNumber, which must be either a
+ * currently-unused line pointer, or one past the last existing item.
+ *
+ * If offsetNumber is valid and flag PAI_OVERWRITE is not set, insert
+ * the item at the specified offsetNumber, moving existing items later
+ * in the array to make room.
+ *
+ * If offsetNumber is not valid, then assign a slot by finding the first
+ * one that is both unused and deallocated.
+ *
+ * If flag PAI_IS_HEAP is set, we enforce that there can't be more than
+ * MaxHeapTuplesPerPage line pointers on the page.
+ *
+ * !!! EREPORT(ERROR) IS DISALLOWED HERE !!!
+ */
+OffsetNumber
+PageAddItemExtended(Page page,
+ Item item,
+ Size size,
+ OffsetNumber offsetNumber,
+ int flags)
+{
+ PageHeader phdr = (PageHeader) page;
+ Size alignedSize;
+ int lower;
+ int upper;
+ ItemId itemId;
+ OffsetNumber limit;
+ bool needshuffle = false;
+
+ /*
+ * Be wary about corrupted page pointers
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ)
+ ereport(PANIC,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ /*
+ * Select offsetNumber to place the new item at
+ */
+ limit = OffsetNumberNext(PageGetMaxOffsetNumber(page));
+
+ /* was offsetNumber passed in? */
+ if (OffsetNumberIsValid(offsetNumber))
+ {
+ /* yes, check it */
+ if ((flags & PAI_OVERWRITE) != 0)
+ {
+ if (offsetNumber < limit)
+ {
+ itemId = PageGetItemId(page, offsetNumber);
+ if (ItemIdIsUsed(itemId) || ItemIdHasStorage(itemId))
+ {
+ elog(WARNING, "will not overwrite a used ItemId");
+ return InvalidOffsetNumber;
+ }
+ }
+ }
+ else
+ {
+ if (offsetNumber < limit)
+ needshuffle = true; /* need to move existing linp's */
+ }
+ }
+ else
+ {
+ /* offsetNumber was not passed in, so find a free slot */
+ /* if no free slot, we'll put it at limit (1st open slot) */
+ if (PageHasFreeLinePointers(page))
+ {
+ /*
+ * Scan line pointer array to locate a "recyclable" (unused)
+ * ItemId.
+ *
+ * Always use earlier items first. PageTruncateLinePointerArray
+ * can only truncate unused items when they appear as a contiguous
+ * group at the end of the line pointer array.
+ */
+ for (offsetNumber = FirstOffsetNumber;
+ offsetNumber < limit; /* limit is maxoff+1 */
+ offsetNumber++)
+ {
+ itemId = PageGetItemId(page, offsetNumber);
+
+ /*
+ * We check for no storage as well, just to be paranoid;
+ * unused items should never have storage. Assert() that the
+ * invariant is respected too.
+ */
+ Assert(ItemIdIsUsed(itemId) || !ItemIdHasStorage(itemId));
+
+ if (!ItemIdIsUsed(itemId) && !ItemIdHasStorage(itemId))
+ break;
+ }
+ if (offsetNumber >= limit)
+ {
+ /* the hint is wrong, so reset it */
+ PageClearHasFreeLinePointers(page);
+ }
+ }
+ else
+ {
+ /* don't bother searching if hint says there's no free slot */
+ offsetNumber = limit;
+ }
+ }
+
+ /* Reject placing items beyond the first unused line pointer */
+ if (offsetNumber > limit)
+ {
+ elog(WARNING, "specified item offset is too large");
+ return InvalidOffsetNumber;
+ }
+
+ /* Reject placing items beyond heap boundary, if heap */
+ if ((flags & PAI_IS_HEAP) != 0 && offsetNumber > MaxHeapTuplesPerPage)
+ {
+ elog(WARNING, "can't put more than MaxHeapTuplesPerPage items in a heap page");
+ return InvalidOffsetNumber;
+ }
+
+ /*
+ * Compute new lower and upper pointers for page, see if it'll fit.
+ *
+ * Note: do arithmetic as signed ints, to avoid mistakes if, say,
+ * alignedSize > pd_upper.
+ */
+ if (offsetNumber == limit || needshuffle)
+ lower = phdr->pd_lower + sizeof(ItemIdData);
+ else
+ lower = phdr->pd_lower;
+
+ alignedSize = MAXALIGN(size);
+
+ upper = (int) phdr->pd_upper - (int) alignedSize;
+
+ if (lower > upper)
+ return InvalidOffsetNumber;
+
+ /*
+ * OK to insert the item. First, shuffle the existing pointers if needed.
+ */
+ itemId = PageGetItemId(page, offsetNumber);
+
+ if (needshuffle)
+ memmove(itemId + 1, itemId,
+ (limit - offsetNumber) * sizeof(ItemIdData));
+
+ /* set the line pointer */
+ ItemIdSetNormal(itemId, upper, size);
+
+ /*
+ * Items normally contain no uninitialized bytes. Core bufpage consumers
+ * conform, but this is not a necessary coding rule; a new index AM could
+ * opt to depart from it. However, data type input functions and other
+ * C-language functions that synthesize datums should initialize all
+ * bytes; datumIsEqual() relies on this. Testing here, along with the
+ * similar check in printtup(), helps to catch such mistakes.
+ *
+ * Values of the "name" type retrieved via index-only scans may contain
+ * uninitialized bytes; see comment in btrescan(). Valgrind will report
+ * this as an error, but it is safe to ignore.
+ */
+ VALGRIND_CHECK_MEM_IS_DEFINED(item, size);
+
+ /* copy the item's data onto the page */
+ memcpy((char *) page + upper, item, size);
+
+ /* adjust page header */
+ phdr->pd_lower = (LocationIndex) lower;
+ phdr->pd_upper = (LocationIndex) upper;
+
+ return offsetNumber;
+}
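A representative caller, patterned after the way heap insertion places a tuple, passes InvalidOffsetNumber so the page picks a free line pointer and treats a failure return as should-not-happen. The page and tuple variables are assumed to be in scope, and the buffer locking and WAL logging a real caller needs are omitted:

OffsetNumber offnum;

offnum = PageAddItemExtended(page,
							 (Item) tuple->t_data,
							 tuple->t_len,
							 InvalidOffsetNumber,	/* let the page choose a slot */
							 PAI_IS_HEAP);
if (offnum == InvalidOffsetNumber)
	elog(PANIC, "failed to add tuple to page");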
+
+
+/*
+ * PageGetTempPage
+ * Get a temporary page in local memory for special processing.
+ * The returned page is not initialized at all; caller must do that.
+ */
+Page
+PageGetTempPage(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ return temp;
+}
+
+/*
+ * PageGetTempPageCopy
+ * Get a temporary page in local memory for special processing.
+ * The page is initialized by copying the contents of the given page.
+ */
+Page
+PageGetTempPageCopy(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ memcpy(temp, page, pageSize);
+
+ return temp;
+}
+
+/*
+ * PageGetTempPageCopySpecial
+ * Get a temporary page in local memory for special processing.
+ * The page is PageInit'd with the same special-space size as the
+ * given page, and the special space is copied from the given page.
+ */
+Page
+PageGetTempPageCopySpecial(Page page)
+{
+ Size pageSize;
+ Page temp;
+
+ pageSize = PageGetPageSize(page);
+ temp = (Page) palloc(pageSize);
+
+ PageInit(temp, pageSize, PageGetSpecialSize(page));
+ memcpy(PageGetSpecialPointer(temp),
+ PageGetSpecialPointer(page),
+ PageGetSpecialSize(page));
+
+ return temp;
+}
+
+/*
+ * PageRestoreTempPage
+ * Copy temporary page back to permanent page after special processing
+ * and release the temporary page.
+ */
+void
+PageRestoreTempPage(Page tempPage, Page oldPage)
+{
+ Size pageSize;
+
+ pageSize = PageGetPageSize(tempPage);
+ memcpy((char *) oldPage, (char *) tempPage, pageSize);
+
+ pfree(tempPage);
+}
+
+/*
+ * Tuple defrag support for PageRepairFragmentation and PageIndexMultiDelete
+ */
+typedef struct itemIdCompactData
+{
+ uint16 offsetindex; /* linp array index */
+ int16 itemoff; /* page offset of item data */
+ uint16 alignedlen; /* MAXALIGN(item data len) */
+} itemIdCompactData;
+typedef itemIdCompactData *itemIdCompact;
+
+/*
+ * After removing or marking some line pointers unused, move the tuples to
+ * remove the gaps caused by the removed items and reorder them back into
+ * reverse line pointer order in the page.
+ *
+ * This function can often be fairly hot, so it pays to take some measures to
+ * make it as optimal as possible.
+ *
+ * Callers may pass 'presorted' as true if the 'itemidbase' array is sorted in
+ * descending order of itemoff. When this is true we can just memmove()
+ * tuples towards the end of the page. This is quite a common case as it's
+ * the order that tuples are initially inserted into pages. When we call this
+ * function to defragment the tuples in the page then any new line pointers
+ * added to the page will keep that presorted order, so hitting this case is
+ * still very common for tables that are commonly updated.
+ *
+ * When the 'itemidbase' array is not presorted then we're unable to just
+ * memmove() tuples around freely. Doing so could cause us to overwrite the
+ * memory belonging to a tuple we've not moved yet. In this case, we copy all
+ * the tuples that need to be moved into a temporary buffer. We can then
+ * simply memcpy() out of that temp buffer back into the page at the correct
+ * location. Tuples are copied back into the page in the same order as the
+ * 'itemidbase' array, so we end up reordering the tuples back into reverse
+ * line pointer order. This will increase the chances of hitting the
+ * presorted case the next time around.
+ *
+ * Callers must ensure that nitems is > 0
+ */
+static void
+compactify_tuples(itemIdCompact itemidbase, int nitems, Page page, bool presorted)
+{
+ PageHeader phdr = (PageHeader) page;
+ Offset upper;
+ Offset copy_tail;
+ Offset copy_head;
+ itemIdCompact itemidptr;
+ int i;
+
+ /* Code within will not work correctly if nitems == 0 */
+ Assert(nitems > 0);
+
+ if (presorted)
+ {
+
+#ifdef USE_ASSERT_CHECKING
+ {
+ /*
+ * Verify we've not gotten any new callers that are incorrectly
+ * passing a true presorted value.
+ */
+ Offset lastoff = phdr->pd_special;
+
+ for (i = 0; i < nitems; i++)
+ {
+ itemidptr = &itemidbase[i];
+
+ Assert(lastoff > itemidptr->itemoff);
+
+ lastoff = itemidptr->itemoff;
+ }
+ }
+#endif /* USE_ASSERT_CHECKING */
+
+ /*
+ * 'itemidbase' is already in the optimal order, i.e, lower item
+ * pointers have a higher offset. This allows us to memmove() the
+ * tuples up to the end of the page without having to worry about
+ * overwriting other tuples that have not been moved yet.
+ *
+ * There's a good chance that there are tuples already right at the
+ * end of the page that we can simply skip over because they're
+ * already in the correct location within the page. We'll do that
+ * first...
+ */
+ upper = phdr->pd_special;
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ if (upper != itemidptr->itemoff + itemidptr->alignedlen)
+ break;
+ upper -= itemidptr->alignedlen;
+
+ i++;
+ } while (i < nitems);
+
+ /*
+ * Now that we've found the first tuple that needs to be moved, we can
+ * do the tuple compactification. We try to make as few memmove() calls
+ * as possible and only call memmove() when there's a gap. When
+ * we see a gap we just move all tuples after the gap up until the
+ * point of the last move operation.
+ */
+ copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen;
+ for (; i < nitems; i++)
+ {
+ ItemId lp;
+
+ itemidptr = &itemidbase[i];
+ lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+
+ if (copy_head != itemidptr->itemoff + itemidptr->alignedlen)
+ {
+ memmove((char *) page + upper,
+ page + copy_head,
+ copy_tail - copy_head);
+
+ /*
+ * We've now moved all tuples already seen, but not the
+ * current tuple, so we set the copy_tail to the end of this
+ * tuple so it can be moved in another iteration of the loop.
+ */
+ copy_tail = itemidptr->itemoff + itemidptr->alignedlen;
+ }
+ /* shift the target offset down by the length of this tuple */
+ upper -= itemidptr->alignedlen;
+ /* point the copy_head to the start of this tuple */
+ copy_head = itemidptr->itemoff;
+
+ /* update the line pointer to reference the new offset */
+ lp->lp_off = upper;
+ }
+
+ /* move the remaining tuples. */
+ memmove((char *) page + upper,
+ page + copy_head,
+ copy_tail - copy_head);
+ }
+ else
+ {
+ PGAlignedBlock scratch;
+ char *scratchptr = scratch.data;
+
+ /*
+ * Non-presorted case: The tuples in the itemidbase array may be in
+ * any order. So, in order to move these to the end of the page we
+ * must make a temp copy of each tuple that needs to be moved before
+ * we copy them back into the page at the new offset.
+ *
+ * If a large percentage of tuples have been pruned (>75%) then we'll
+ * copy these into the temp buffer tuple by tuple; otherwise, we'll
+ * just do a single memcpy() for all tuples that need to be moved.
+ * When so many tuples have been removed there's likely to be a lot of
+ * gaps and it's unlikely that many non-movable tuples remain at the
+ * end of the page.
+ */
+ if (nitems < PageGetMaxOffsetNumber(page) / 4)
+ {
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ memcpy(scratchptr + itemidptr->itemoff, page + itemidptr->itemoff,
+ itemidptr->alignedlen);
+ i++;
+ } while (i < nitems);
+
+ /* Set things up for the compactification code below */
+ i = 0;
+ itemidptr = &itemidbase[0];
+ upper = phdr->pd_special;
+ }
+ else
+ {
+ upper = phdr->pd_special;
+
+ /*
+ * Many tuples are likely to already be in the correct location.
+ * There's no need to copy these into the temp buffer. Instead
+ * we'll just skip forward in the itemidbase array to the position
+ * that we do need to move tuples from so that the code below just
+ * leaves these ones alone.
+ */
+ i = 0;
+ do
+ {
+ itemidptr = &itemidbase[i];
+ if (upper != itemidptr->itemoff + itemidptr->alignedlen)
+ break;
+ upper -= itemidptr->alignedlen;
+
+ i++;
+ } while (i < nitems);
+
+ /* Copy all tuples that need to be moved into the temp buffer */
+ memcpy(scratchptr + phdr->pd_upper,
+ page + phdr->pd_upper,
+ upper - phdr->pd_upper);
+ }
+
+ /*
+ * Do the tuple compactification. itemidptr is already pointing to
+ * the first tuple that we're going to move. Here we collapse the
+ * memcpy calls for adjacent tuples into a single call. This is done
+ * by delaying the memcpy call until we find a gap that needs to be
+ * closed.
+ */
+ copy_tail = copy_head = itemidptr->itemoff + itemidptr->alignedlen;
+ for (; i < nitems; i++)
+ {
+ ItemId lp;
+
+ itemidptr = &itemidbase[i];
+ lp = PageGetItemId(page, itemidptr->offsetindex + 1);
+
+ /* copy pending tuples when we detect a gap */
+ if (copy_head != itemidptr->itemoff + itemidptr->alignedlen)
+ {
+ memcpy((char *) page + upper,
+ scratchptr + copy_head,
+ copy_tail - copy_head);
+
+ /*
+ * We've now copied all tuples already seen, but not the
+ * current tuple, so we set the copy_tail to the end of this
+ * tuple.
+ */
+ copy_tail = itemidptr->itemoff + itemidptr->alignedlen;
+ }
+ /* shift the target offset down by the length of this tuple */
+ upper -= itemidptr->alignedlen;
+ /* point the copy_head to the start of this tuple */
+ copy_head = itemidptr->itemoff;
+
+ /* update the line pointer to reference the new offset */
+ lp->lp_off = upper;
+ }
+
+ /* Copy the remaining chunk */
+ memcpy((char *) page + upper,
+ scratchptr + copy_head,
+ copy_tail - copy_head);
+ }
+
+ phdr->pd_upper = upper;
+}
+
+/*
+ * PageRepairFragmentation
+ *
+ * Frees fragmented space on a heap page following pruning.
+ *
+ * This routine is usable for heap pages only, but see PageIndexMultiDelete.
+ *
+ * This routine removes unused line pointers from the end of the line pointer
+ * array. This is possible when dead heap-only tuples get removed by pruning,
+ * especially when there were HOT chains with several tuples each beforehand.
+ *
+ * Caller had better have a full cleanup lock on page's buffer. As a side
+ * effect the page's PD_HAS_FREE_LINES hint bit will be set or unset as
+ * needed. Caller might also need to account for a reduction in the length of
+ * the line pointer array following array truncation.
+ */
+void
+PageRepairFragmentation(Page page)
+{
+ Offset pd_lower = ((PageHeader) page)->pd_lower;
+ Offset pd_upper = ((PageHeader) page)->pd_upper;
+ Offset pd_special = ((PageHeader) page)->pd_special;
+ Offset last_offset;
+ itemIdCompactData itemidbase[MaxHeapTuplesPerPage];
+ itemIdCompact itemidptr;
+ ItemId lp;
+ int nline,
+ nstorage,
+ nunused;
+ OffsetNumber finalusedlp = InvalidOffsetNumber;
+ int i;
+ Size totallen;
+ bool presorted = true; /* For now */
+
+ /*
+ * It's worth the trouble to be more paranoid here than in most places,
+ * because we are about to reshuffle data in (what is usually) a shared
+ * disk buffer. If we aren't careful then corrupted pointers, lengths,
+ * etc could cause us to clobber adjacent disk buffers, spreading the data
+ * loss further. So, check everything.
+ */
+ if (pd_lower < SizeOfPageHeaderData ||
+ pd_lower > pd_upper ||
+ pd_upper > pd_special ||
+ pd_special > BLCKSZ ||
+ pd_special != MAXALIGN(pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ pd_lower, pd_upper, pd_special)));
+
+ /*
+ * Run through the line pointer array and collect data about live items.
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ itemidptr = itemidbase;
+ nunused = totallen = 0;
+ last_offset = pd_special;
+ for (i = FirstOffsetNumber; i <= nline; i++)
+ {
+ lp = PageGetItemId(page, i);
+ if (ItemIdIsUsed(lp))
+ {
+ if (ItemIdHasStorage(lp))
+ {
+ itemidptr->offsetindex = i - 1;
+ itemidptr->itemoff = ItemIdGetOffset(lp);
+
+ if (last_offset > itemidptr->itemoff)
+ last_offset = itemidptr->itemoff;
+ else
+ presorted = false;
+
+ if (unlikely(itemidptr->itemoff < (int) pd_upper ||
+ itemidptr->itemoff >= (int) pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: %u",
+ itemidptr->itemoff)));
+ itemidptr->alignedlen = MAXALIGN(ItemIdGetLength(lp));
+ totallen += itemidptr->alignedlen;
+ itemidptr++;
+ }
+
+ finalusedlp = i; /* Could be the final non-LP_UNUSED item */
+ }
+ else
+ {
+ /* Unused entries should have lp_len = 0, but make sure */
+ Assert(!ItemIdHasStorage(lp));
+ ItemIdSetUnused(lp);
+ nunused++;
+ }
+ }
+
+ nstorage = itemidptr - itemidbase;
+ if (nstorage == 0)
+ {
+ /* Page is completely empty, so just reset it quickly */
+ ((PageHeader) page)->pd_upper = pd_special;
+ }
+ else
+ {
+ /* Need to compact the page the hard way */
+ if (totallen > (Size) (pd_special - pd_lower))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item lengths: total %u, available space %u",
+ (unsigned int) totallen, pd_special - pd_lower)));
+
+ compactify_tuples(itemidbase, nstorage, page, presorted);
+ }
+
+ if (finalusedlp != nline)
+ {
+ /* The last line pointer is not the last used line pointer */
+ int nunusedend = nline - finalusedlp;
+
+ Assert(nunused >= nunusedend && nunusedend > 0);
+
+ /* remove trailing unused line pointers from the count */
+ nunused -= nunusedend;
+ /* truncate the line pointer array */
+ ((PageHeader) page)->pd_lower -= (sizeof(ItemIdData) * nunusedend);
+ }
+
+ /* Set hint bit for PageAddItemExtended */
+ if (nunused > 0)
+ PageSetHasFreeLinePointers(page);
+ else
+ PageClearHasFreeLinePointers(page);
+}
+
+/*
+ * PageTruncateLinePointerArray
+ *
+ * Removes unused line pointers at the end of the line pointer array.
+ *
+ * This routine is usable for heap pages only. It is called by VACUUM during
+ * its second pass over the heap. We expect at least one LP_UNUSED line
+ * pointer on the page (if VACUUM didn't have an LP_DEAD item on the page that
+ * it just set to LP_UNUSED then it should not call here).
+ *
+ * We avoid truncating the line pointer array to 0 items; if necessary we
+ * leave behind a single remaining LP_UNUSED item. This is a little
+ * arbitrary, but it seems like a good idea to avoid leaving a PageIsEmpty()
+ * page behind.
+ *
+ * Caller can have either an exclusive lock or a full cleanup lock on page's
+ * buffer. The page's PD_HAS_FREE_LINES hint bit will be set or unset based
+ * on whether or not we leave behind any remaining LP_UNUSED items.
+ */
+void
+PageTruncateLinePointerArray(Page page)
+{
+ PageHeader phdr = (PageHeader) page;
+ bool countdone = false,
+ sethint = false;
+ int nunusedend = 0;
+
+ /* Scan line pointer array back-to-front */
+ for (int i = PageGetMaxOffsetNumber(page); i >= FirstOffsetNumber; i--)
+ {
+ ItemId lp = PageGetItemId(page, i);
+
+ if (!countdone && i > FirstOffsetNumber)
+ {
+ /*
+ * Still determining which line pointers from the end of the array
+ * will be truncated away. Either count another line pointer as
+ * safe to truncate, or notice that it's not safe to truncate
+ * additional line pointers (stop counting line pointers).
+ */
+ if (!ItemIdIsUsed(lp))
+ nunusedend++;
+ else
+ countdone = true;
+ }
+ else
+ {
+ /*
+ * Once we've stopped counting we still need to figure out if
+ * there are any remaining LP_UNUSED line pointers somewhere more
+ * towards the front of the array.
+ */
+ if (!ItemIdIsUsed(lp))
+ {
+ /*
+ * This is an unused line pointer that we won't be truncating
+ * away -- so there is at least one. Set hint on page.
+ */
+ sethint = true;
+ break;
+ }
+ }
+ }
+
+ if (nunusedend > 0)
+ {
+ phdr->pd_lower -= sizeof(ItemIdData) * nunusedend;
+
+#ifdef CLOBBER_FREED_MEMORY
+ memset((char *) page + phdr->pd_lower, 0x7F,
+ sizeof(ItemIdData) * nunusedend);
+#endif
+ }
+ else
+ Assert(sethint);
+
+ /* Set hint bit for PageAddItemExtended */
+ if (sethint)
+ PageSetHasFreeLinePointers(page);
+ else
+ PageClearHasFreeLinePointers(page);
+}
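+
+/*
+ * Illustrative sketch of the arithmetic used above (the helper name and the
+ * concrete numbers are made up for explanation only): the number of line
+ * pointers on a page is implied entirely by pd_lower, so truncating N
+ * trailing LP_UNUSED entries just subtracts N * sizeof(ItemIdData) from
+ * pd_lower.  For example, with SizeOfPageHeaderData = 24 and
+ * sizeof(ItemIdData) = 4, a pd_lower of 64 implies (64 - 24) / 4 = 10 line
+ * pointers, and dropping 3 trailing unused entries lowers pd_lower to 52.
+ */
+static inline int
+line_pointer_count_after_truncate(uint16 pd_lower, int nunusedend)
+{
+	int			nbefore = (pd_lower - SizeOfPageHeaderData) / sizeof(ItemIdData);
+
+	Assert(nunusedend >= 0 && nunusedend < nbefore);
+	return nbefore - nunusedend;
+}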
+
+/*
+ * PageGetFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for a new line pointer.
+ *
+ * Note: this should usually only be used on index pages. Use
+ * PageGetHeapFreeSpace on heap pages.
+ */
+Size
+PageGetFreeSpace(Page page)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < (int) sizeof(ItemIdData))
+ return 0;
+ space -= sizeof(ItemIdData);
+
+ return (Size) space;
+}
+
+/*
+ * PageGetFreeSpaceForMultipleTuples
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for multiple new line pointers.
+ *
+ * Note: this should usually only be used on index pages. Use
+ * PageGetHeapFreeSpace on heap pages.
+ */
+Size
+PageGetFreeSpaceForMultipleTuples(Page page, int ntups)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < (int) (ntups * sizeof(ItemIdData)))
+ return 0;
+ space -= ntups * sizeof(ItemIdData);
+
+ return (Size) space;
+}
+
+/*
+ * PageGetExactFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * without any consideration for adding/removing line pointers.
+ */
+Size
+PageGetExactFreeSpace(Page page)
+{
+ int space;
+
+ /*
+ * Use signed arithmetic here so that we behave sensibly if pd_lower >
+ * pd_upper.
+ */
+ space = (int) ((PageHeader) page)->pd_upper -
+ (int) ((PageHeader) page)->pd_lower;
+
+ if (space < 0)
+ return 0;
+
+ return (Size) space;
+}
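+
+/*
+ * Worked example contrasting the three free-space functions above (a sketch
+ * only; the helper below exists purely for illustration): with pd_lower = 64
+ * and pd_upper = 8064 the raw gap is 8000 bytes, so PageGetExactFreeSpace()
+ * reports 8000, PageGetFreeSpace() reports 8000 - sizeof(ItemIdData) = 7996,
+ * and PageGetFreeSpaceForMultipleTuples(page, 3) reports 7988.
+ */
+static inline void
+free_space_relationship(Page page)
+{
+	Size		exact = PageGetExactFreeSpace(page);
+
+	/* holds whenever at least one line pointer's worth of space remains */
+	if (exact >= sizeof(ItemIdData))
+		Assert(PageGetFreeSpace(page) == exact - sizeof(ItemIdData));
+}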
+
+
+/*
+ * PageGetHeapFreeSpace
+ * Returns the size of the free (allocatable) space on a page,
+ * reduced by the space needed for a new line pointer.
+ *
+ * The difference between this and PageGetFreeSpace is that this will return
+ * zero if there are already MaxHeapTuplesPerPage line pointers in the page
+ * and none are free. We use this to enforce that no more than
+ * MaxHeapTuplesPerPage line pointers are created on a heap page. (Although
+ * no more tuples than that could fit anyway, in the presence of redirected
+ * or dead line pointers it'd be possible to have too many line pointers.
+ * To avoid breaking code that assumes MaxHeapTuplesPerPage is a hard limit
+ * on the number of line pointers, we make this extra check.)
+ */
+Size
+PageGetHeapFreeSpace(Page page)
+{
+ Size space;
+
+ space = PageGetFreeSpace(page);
+ if (space > 0)
+ {
+ OffsetNumber offnum,
+ nline;
+
+ /*
+ * Are there already MaxHeapTuplesPerPage line pointers in the page?
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ if (nline >= MaxHeapTuplesPerPage)
+ {
+ if (PageHasFreeLinePointers(page))
+ {
+ /*
+ * Since this is just a hint, we must confirm that there is
+ * indeed a free line pointer
+ */
+ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ {
+ ItemId lp = PageGetItemId(page, offnum);
+
+ if (!ItemIdIsUsed(lp))
+ break;
+ }
+
+ if (offnum > nline)
+ {
+ /*
+ * The hint is wrong, but we can't clear it here since we
+ * don't have the ability to mark the page dirty.
+ */
+ space = 0;
+ }
+ }
+ else
+ {
+ /*
+ * Although the hint might be wrong, PageAddItem will believe
+ * it anyway, so we must believe it too.
+ */
+ space = 0;
+ }
+ }
+ }
+ return space;
+}
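+
+/*
+ * Minimal usage sketch for PageGetHeapFreeSpace(); page_has_room_for_tuple
+ * is a hypothetical helper shown only to illustrate the caller pattern
+ * (compare RelationGetBufferForTuple() in hio.c): the required length must
+ * be MAXALIGN'd before comparing it against the reported free space.
+ */
+static inline bool
+page_has_room_for_tuple(Page page, Size tuplen)
+{
+	return PageGetHeapFreeSpace(page) >= MAXALIGN(tuplen);
+}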
+
+
+/*
+ * PageIndexTupleDelete
+ *
+ * This routine does the work of removing a tuple from an index page.
+ *
+ * Unlike heap pages, we compact out the line pointer for the removed tuple.
+ */
+void
+PageIndexTupleDelete(Page page, OffsetNumber offnum)
+{
+ PageHeader phdr = (PageHeader) page;
+ char *addr;
+ ItemId tup;
+ Size size;
+ unsigned offset;
+ int nbytes;
+ int offidx;
+ int nline;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ nline = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > nline)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ /* change offset number to offset index */
+ offidx = offnum - 1;
+
+ tup = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tup));
+ size = ItemIdGetLength(tup);
+ offset = ItemIdGetOffset(tup);
+
+ if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ /* Amount of space to actually be deleted */
+ size = MAXALIGN(size);
+
+ /*
+ * First, we want to get rid of the pd_linp entry for the index tuple. We
+ * copy all subsequent linp's back one slot in the array. We don't use
+ * PageGetItemId, because we are manipulating the _array_, not individual
+ * linp's.
+ */
+ nbytes = phdr->pd_lower -
+ ((char *) &phdr->pd_linp[offidx + 1] - (char *) phdr);
+
+ if (nbytes > 0)
+ memmove((char *) &(phdr->pd_linp[offidx]),
+ (char *) &(phdr->pd_linp[offidx + 1]),
+ nbytes);
+
+ /*
+ * Now move everything between the old upper bound (beginning of tuple
+ * space) and the beginning of the deleted tuple forward, so that space in
+ * the middle of the page is left free. If we've just deleted the tuple
+ * at the beginning of tuple space, then there's no need to do the copy.
+ */
+
+ /* beginning of tuple space */
+ addr = (char *) page + phdr->pd_upper;
+
+ if (offset > phdr->pd_upper)
+ memmove(addr + size, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointers */
+ phdr->pd_upper += size;
+ phdr->pd_lower -= sizeof(ItemIdData);
+
+ /*
+ * Finally, we need to adjust the linp entries that remain.
+ *
+ * Anything that used to be before the deleted tuple's data was moved
+ * forward by the size of the deleted tuple.
+ */
+ if (!PageIsEmpty(page))
+ {
+ int i;
+
+ nline--; /* there's one less than when we started */
+ for (i = 1; i <= nline; i++)
+ {
+ ItemId ii = PageGetItemId(page, i);
+
+ Assert(ItemIdHasStorage(ii));
+ if (ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size;
+ }
+ }
+}
+
+
+/*
+ * PageIndexMultiDelete
+ *
+ * This routine handles the case of deleting multiple tuples from an
+ * index page at once. It is considerably faster than a loop around
+ * PageIndexTupleDelete ... however, the caller *must* supply the array
+ * of item numbers to be deleted in item number order!
+ */
+void
+PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
+{
+ PageHeader phdr = (PageHeader) page;
+ Offset pd_lower = phdr->pd_lower;
+ Offset pd_upper = phdr->pd_upper;
+ Offset pd_special = phdr->pd_special;
+ Offset last_offset;
+ itemIdCompactData itemidbase[MaxIndexTuplesPerPage];
+ ItemIdData newitemids[MaxIndexTuplesPerPage];
+ itemIdCompact itemidptr;
+ ItemId lp;
+ int nline,
+ nused;
+ Size totallen;
+ Size size;
+ unsigned offset;
+ int nextitm;
+ OffsetNumber offnum;
+ bool presorted = true; /* For now */
+
+ Assert(nitems <= MaxIndexTuplesPerPage);
+
+ /*
+ * If there aren't very many items to delete, then retail
+ * PageIndexTupleDelete is the best way. Delete the items in reverse
+ * order so we don't have to think about adjusting item numbers for
+ * previous deletions.
+ *
+ * TODO: tune the magic number here
+ */
+ if (nitems <= 2)
+ {
+ while (--nitems >= 0)
+ PageIndexTupleDelete(page, itemnos[nitems]);
+ return;
+ }
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (pd_lower < SizeOfPageHeaderData ||
+ pd_lower > pd_upper ||
+ pd_upper > pd_special ||
+ pd_special > BLCKSZ ||
+ pd_special != MAXALIGN(pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ pd_lower, pd_upper, pd_special)));
+
+ /*
+ * Scan the line pointer array and build a list of just the ones we are
+ * going to keep. Notice we do not modify the page yet, since we are
+ * still validity-checking.
+ */
+ nline = PageGetMaxOffsetNumber(page);
+ itemidptr = itemidbase;
+ totallen = 0;
+ nused = 0;
+ nextitm = 0;
+ last_offset = pd_special;
+ for (offnum = FirstOffsetNumber; offnum <= nline; offnum = OffsetNumberNext(offnum))
+ {
+ lp = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(lp));
+ size = ItemIdGetLength(lp);
+ offset = ItemIdGetOffset(lp);
+ if (offset < pd_upper ||
+ (offset + size) > pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ if (nextitm < nitems && offnum == itemnos[nextitm])
+ {
+ /* skip item to be deleted */
+ nextitm++;
+ }
+ else
+ {
+ itemidptr->offsetindex = nused; /* where it will go */
+ itemidptr->itemoff = offset;
+
+ if (last_offset > itemidptr->itemoff)
+ last_offset = itemidptr->itemoff;
+ else
+ presorted = false;
+
+ itemidptr->alignedlen = MAXALIGN(size);
+ totallen += itemidptr->alignedlen;
+ newitemids[nused] = *lp;
+ itemidptr++;
+ nused++;
+ }
+ }
+
+ /* this will catch invalid or out-of-order itemnos[] */
+ if (nextitm != nitems)
+ elog(ERROR, "incorrect index offsets supplied");
+
+ if (totallen > (Size) (pd_special - pd_lower))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted item lengths: total %u, available space %u",
+ (unsigned int) totallen, pd_special - pd_lower)));
+
+ /*
+ * Looks good. Overwrite the line pointers with the copy, from which we've
+ * removed all the unused items.
+ */
+ memcpy(phdr->pd_linp, newitemids, nused * sizeof(ItemIdData));
+ phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);
+
+ /* and compactify the tuple data */
+ if (nused > 0)
+ compactify_tuples(itemidbase, nused, page, presorted);
+ else
+ phdr->pd_upper = pd_special;
+}
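+
+/*
+ * Minimal usage sketch for PageIndexMultiDelete(): the offsets handed to it
+ * must already be in ascending item number order, as the header comment
+ * requires.  delete_two_index_tuples is a hypothetical caller shown only to
+ * illustrate that contract.
+ */
+static inline void
+delete_two_index_tuples(Page page, OffsetNumber a, OffsetNumber b)
+{
+	OffsetNumber deletable[2];
+
+	Assert(a != b);
+
+	/* present the offsets in ascending order, as required */
+	deletable[0] = Min(a, b);
+	deletable[1] = Max(a, b);
+
+	PageIndexMultiDelete(page, deletable, 2);
+}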
+
+
+/*
+ * PageIndexTupleDeleteNoCompact
+ *
+ * Remove the specified tuple from an index page, but set its line pointer
+ * to "unused" instead of compacting it out, except that it can be removed
+ * if it's the last line pointer on the page.
+ *
+ * This is used for index AMs that require that existing TIDs of live tuples
+ * remain unchanged, and are willing to allow unused line pointers instead.
+ */
+void
+PageIndexTupleDeleteNoCompact(Page page, OffsetNumber offnum)
+{
+ PageHeader phdr = (PageHeader) page;
+ char *addr;
+ ItemId tup;
+ Size size;
+ unsigned offset;
+ int nline;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ nline = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > nline)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ tup = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tup));
+ size = ItemIdGetLength(tup);
+ offset = ItemIdGetOffset(tup);
+
+ if (offset < phdr->pd_upper || (offset + size) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) size)));
+
+ /* Amount of space to actually be deleted */
+ size = MAXALIGN(size);
+
+ /*
+ * Either set the line pointer to "unused", or zap it if it's the last
+ * one. (Note: it's possible that the next-to-last one(s) are already
+ * unused, but we do not trouble to try to compact them out if so.)
+ */
+ if ((int) offnum < nline)
+ ItemIdSetUnused(tup);
+ else
+ {
+ phdr->pd_lower -= sizeof(ItemIdData);
+ nline--; /* there's one less than when we started */
+ }
+
+ /*
+ * Now move everything between the old upper bound (beginning of tuple
+ * space) and the beginning of the deleted tuple forward, so that space in
+ * the middle of the page is left free. If we've just deleted the tuple
+ * at the beginning of tuple space, then there's no need to do the copy.
+ */
+
+ /* beginning of tuple space */
+ addr = (char *) page + phdr->pd_upper;
+
+ if (offset > phdr->pd_upper)
+ memmove(addr + size, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointer */
+ phdr->pd_upper += size;
+
+ /*
+ * Finally, we need to adjust the linp entries that remain.
+ *
+ * Anything that used to be before the deleted tuple's data was moved
+ * forward by the size of the deleted tuple.
+ */
+ if (!PageIsEmpty(page))
+ {
+ int i;
+
+ for (i = 1; i <= nline; i++)
+ {
+ ItemId ii = PageGetItemId(page, i);
+
+ if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size;
+ }
+ }
+}
+
+
+/*
+ * PageIndexTupleOverwrite
+ *
+ * Replace a specified tuple on an index page.
+ *
+ * The new tuple is placed exactly where the old one had been, shifting
+ * other tuples' data up or down as needed to keep the page compacted.
+ * This is better than deleting and reinserting the tuple, because it
+ * avoids any data shifting when the tuple size doesn't change; and
+ * even when it does, we avoid moving the line pointers around.
+ * This could be used by an index AM that doesn't want to unset the
+ * LP_DEAD bit when it happens to be set. It could conceivably also be
+ * used by an index AM that cares about the physical order of tuples as
+ * well as their logical/ItemId order.
+ *
+ * If there's insufficient space for the new tuple, return false. Other
+ * errors represent data-corruption problems, so we just elog.
+ */
+bool
+PageIndexTupleOverwrite(Page page, OffsetNumber offnum,
+ Item newtup, Size newsize)
+{
+ PageHeader phdr = (PageHeader) page;
+ ItemId tupid;
+ int oldsize;
+ unsigned offset;
+ Size alignednewsize;
+ int size_diff;
+ int itemcount;
+
+ /*
+ * As with PageRepairFragmentation, paranoia seems justified.
+ */
+ if (phdr->pd_lower < SizeOfPageHeaderData ||
+ phdr->pd_lower > phdr->pd_upper ||
+ phdr->pd_upper > phdr->pd_special ||
+ phdr->pd_special > BLCKSZ ||
+ phdr->pd_special != MAXALIGN(phdr->pd_special))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted page pointers: lower = %u, upper = %u, special = %u",
+ phdr->pd_lower, phdr->pd_upper, phdr->pd_special)));
+
+ itemcount = PageGetMaxOffsetNumber(page);
+ if ((int) offnum <= 0 || (int) offnum > itemcount)
+ elog(ERROR, "invalid index offnum: %u", offnum);
+
+ tupid = PageGetItemId(page, offnum);
+ Assert(ItemIdHasStorage(tupid));
+ oldsize = ItemIdGetLength(tupid);
+ offset = ItemIdGetOffset(tupid);
+
+ if (offset < phdr->pd_upper || (offset + oldsize) > phdr->pd_special ||
+ offset != MAXALIGN(offset))
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("corrupted line pointer: offset = %u, size = %u",
+ offset, (unsigned int) oldsize)));
+
+ /*
+ * Determine actual change in space requirement, check for page overflow.
+ */
+ oldsize = MAXALIGN(oldsize);
+ alignednewsize = MAXALIGN(newsize);
+ if (alignednewsize > oldsize + (phdr->pd_upper - phdr->pd_lower))
+ return false;
+
+ /*
+ * Relocate existing data and update line pointers, unless the new tuple
+ * is the same size as the old (after alignment), in which case there's
+ * nothing to do. Notice that what we have to relocate is data before the
+ * target tuple, not data after, so it's convenient to express size_diff
+ * as the amount by which the tuple's size is decreasing, making it the
+ * delta to add to pd_upper and affected line pointers.
+ */
+ size_diff = oldsize - (int) alignednewsize;
+ if (size_diff != 0)
+ {
+ char *addr = (char *) page + phdr->pd_upper;
+ int i;
+
+ /* relocate all tuple data before the target tuple */
+ memmove(addr + size_diff, addr, offset - phdr->pd_upper);
+
+ /* adjust free space boundary pointer */
+ phdr->pd_upper += size_diff;
+
+ /* adjust affected line pointers too */
+ for (i = FirstOffsetNumber; i <= itemcount; i++)
+ {
+ ItemId ii = PageGetItemId(page, i);
+
+ /* Allow items without storage; currently only BRIN needs that */
+ if (ItemIdHasStorage(ii) && ItemIdGetOffset(ii) <= offset)
+ ii->lp_off += size_diff;
+ }
+ }
+
+ /* Update the item's tuple length without changing its lp_flags field */
+ tupid->lp_off = offset + size_diff;
+ tupid->lp_len = newsize;
+
+ /* Copy new tuple data onto page */
+ memcpy(PageGetItem(page, tupid), newtup, newsize);
+
+ return true;
+}
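+
+/*
+ * Worked example of the size_diff bookkeeping above, as a sketch only (the
+ * helper and its numbers are invented for illustration): if the old tuple
+ * occupied MAXALIGN(32) = 32 bytes at offset 8000 and the replacement needs
+ * MAXALIGN(24) = 24 bytes, then size_diff = 8, all tuple data stored before
+ * offset 8000 moves up by 8 bytes, pd_upper grows by 8, and every line
+ * pointer whose lp_off was <= 8000 (including the target's) is incremented
+ * by 8, so the new tuple ends up at offset 8008.
+ */
+static inline int
+tuple_overwrite_size_diff_example(void)
+{
+	/* old tuple needed MAXALIGN(32) bytes, the replacement MAXALIGN(24) */
+	return (int) (MAXALIGN(32) - MAXALIGN(24)); /* = 8 */
+}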
+
+
+/*
+ * Set checksum for a page in shared buffers.
+ *
+ * If checksums are disabled, or if the page is not initialized, just return
+ * the input. Otherwise, we must make a copy of the page before calculating
+ * the checksum, to prevent concurrent modifications (e.g. setting hint bits)
+ * from making the final checksum invalid. It doesn't matter if we include or
+ * exclude hints during the copy, as long as we write a valid page and
+ * associated checksum.
+ *
+ * Returns a pointer to the block-sized data that needs to be written. Uses
+ * statically-allocated memory, so the caller must immediately write the
+ * returned page and not refer to it again.
+ */
+char *
+PageSetChecksumCopy(Page page, BlockNumber blkno)
+{
+ static char *pageCopy = NULL;
+
+ /* If we don't need a checksum, just return the passed-in data */
+ if (PageIsNew(page) || !DataChecksumsEnabled())
+ return (char *) page;
+
+ /*
+ * We allocate the copy space once and use it over on each subsequent
+ * call. The point of palloc'ing here, rather than having a static char
+ * array, is first to ensure adequate alignment for the checksumming code
+ * and second to avoid wasting space in processes that never call this.
+ */
+ if (pageCopy == NULL)
+ pageCopy = MemoryContextAllocAligned(TopMemoryContext,
+ BLCKSZ,
+ PG_IO_ALIGN_SIZE,
+ 0);
+
+ memcpy(pageCopy, (char *) page, BLCKSZ);
+ ((PageHeader) pageCopy)->pd_checksum = pg_checksum_page(pageCopy, blkno);
+ return pageCopy;
+}
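+
+/*
+ * Minimal sketch of the intended call pattern for PageSetChecksumCopy():
+ * checksum the static copy and hand *that* to the storage layer, never the
+ * shared buffer itself.  write_block_checksummed is hypothetical (and
+ * assumes storage/smgr.h is available here); compare FlushBuffer() in
+ * bufmgr.c for the real caller of this pattern.
+ */
+static inline void
+write_block_checksummed(SMgrRelation reln, ForkNumber forknum,
+						BlockNumber blocknum, Page page)
+{
+	char	   *bufToWrite = PageSetChecksumCopy(page, blocknum);
+
+	smgrwrite(reln, forknum, blocknum, bufToWrite, false);
+}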
+
+/*
+ * Set checksum for a page in private memory.
+ *
+ * This must only be used when we know that no other process can be modifying
+ * the page buffer.
+ */
+void
+PageSetChecksumInplace(Page page, BlockNumber blkno)
+{
+ /* If we don't need a checksum, just return */
+ if (PageIsNew(page) || !DataChecksumsEnabled())
+ return;
+
+ ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, blkno);
+}
diff --git a/src/backend/storage/page/checksum.c b/src/backend/storage/page/checksum.c
new file mode 100644
index 0000000..81fd519
--- /dev/null
+++ b/src/backend/storage/page/checksum.c
@@ -0,0 +1,22 @@
+/*-------------------------------------------------------------------------
+ *
+ * checksum.c
+ * Checksum implementation for data pages.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/checksum.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/checksum.h"
+/*
+ * The actual code is in storage/checksum_impl.h. This is done so that
+ * external programs can incorporate the checksum code by #include'ing
+ * that file from the exported Postgres headers. (Compare our CRC code.)
+ */
+#include "storage/checksum_impl.h"
diff --git a/src/backend/storage/page/itemptr.c b/src/backend/storage/page/itemptr.c
new file mode 100644
index 0000000..2c25ad5
--- /dev/null
+++ b/src/backend/storage/page/itemptr.c
@@ -0,0 +1,131 @@
+/*-------------------------------------------------------------------------
+ *
+ * itemptr.c
+ * POSTGRES disk item pointer code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/page/itemptr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/itemptr.h"
+
+
+/*
+ * We really want ItemPointerData to be exactly 6 bytes.
+ */
+StaticAssertDecl(sizeof(ItemPointerData) == 3 * sizeof(uint16),
+ "ItemPointerData struct is improperly padded");
+
+/*
+ * ItemPointerEquals
+ * Returns true if both item pointers point to the same item,
+ * otherwise returns false.
+ *
+ * Note:
+ * Asserts that the disk item pointers are both valid!
+ */
+bool
+ItemPointerEquals(ItemPointer pointer1, ItemPointer pointer2)
+{
+ if (ItemPointerGetBlockNumber(pointer1) ==
+ ItemPointerGetBlockNumber(pointer2) &&
+ ItemPointerGetOffsetNumber(pointer1) ==
+ ItemPointerGetOffsetNumber(pointer2))
+ return true;
+ else
+ return false;
+}
+
+/*
+ * ItemPointerCompare
+ * Generic btree-style comparison for item pointers.
+ */
+int32
+ItemPointerCompare(ItemPointer arg1, ItemPointer arg2)
+{
+ /*
+ * Use ItemPointerGet{Offset,Block}NumberNoCheck to avoid asserting
+ * ip_posid != 0, which may not be true for a user-supplied TID.
+ */
+ BlockNumber b1 = ItemPointerGetBlockNumberNoCheck(arg1);
+ BlockNumber b2 = ItemPointerGetBlockNumberNoCheck(arg2);
+
+ if (b1 < b2)
+ return -1;
+ else if (b1 > b2)
+ return 1;
+ else if (ItemPointerGetOffsetNumberNoCheck(arg1) <
+ ItemPointerGetOffsetNumberNoCheck(arg2))
+ return -1;
+ else if (ItemPointerGetOffsetNumberNoCheck(arg1) >
+ ItemPointerGetOffsetNumberNoCheck(arg2))
+ return 1;
+ else
+ return 0;
+}
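+
+/*
+ * Minimal usage sketch: since ItemPointerCompare() follows the usual
+ * <0 / 0 / >0 convention, it is easy to wrap for qsort() when sorting an
+ * array of TIDs.  tid_cmp is a hypothetical wrapper shown only as an
+ * example; usage would be qsort(tids, ntids, sizeof(ItemPointerData), tid_cmp).
+ */
+static inline int
+tid_cmp(const void *a, const void *b)
+{
+	return ItemPointerCompare((ItemPointer) a, (ItemPointer) b);
+}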
+
+/*
+ * ItemPointerInc
+ * Increment 'pointer' by 1 only paying attention to the ItemPointer's
+ * type's range limits and not MaxOffsetNumber and FirstOffsetNumber.
+ * This may result in 'pointer' becoming !OffsetNumberIsValid.
+ *
+ * If the pointer is already at the maximum possible values permitted by the
+ * range of the ItemPointer's types, then do nothing.
+ */
+void
+ItemPointerInc(ItemPointer pointer)
+{
+ BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer);
+ OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer);
+
+ if (off == PG_UINT16_MAX)
+ {
+ if (blk != InvalidBlockNumber)
+ {
+ off = 0;
+ blk++;
+ }
+ }
+ else
+ off++;
+
+ ItemPointerSet(pointer, blk, off);
+}
+
+/*
+ * ItemPointerDec
+ * Decrement 'pointer' by 1 only paying attention to the ItemPointer's
+ * type's range limits and not MaxOffsetNumber and FirstOffsetNumber.
+ * This may result in 'pointer' becoming !OffsetNumberIsValid.
+ *
+ * If the pointer is already at the minimum possible values permitted by the
+ * range of the ItemPointer's types, then do nothing. This does rely on
+ * FirstOffsetNumber being 1 rather than 0.
+ */
+void
+ItemPointerDec(ItemPointer pointer)
+{
+ BlockNumber blk = ItemPointerGetBlockNumberNoCheck(pointer);
+ OffsetNumber off = ItemPointerGetOffsetNumberNoCheck(pointer);
+
+ if (off == 0)
+ {
+ if (blk != 0)
+ {
+ off = PG_UINT16_MAX;
+ blk--;
+ }
+ }
+ else
+ off--;
+
+ ItemPointerSet(pointer, blk, off);
+}
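+
+/*
+ * Worked example of the wraparound behaviour above (a sketch only; the
+ * helper exists purely for illustration): incrementing
+ * (block 0, offset PG_UINT16_MAX) rolls over to (block 1, offset 0), and
+ * decrementing that result returns to where we started, even though
+ * offset 0 is not a valid OffsetNumber.
+ */
+static inline void
+itemptr_inc_dec_example(void)
+{
+	ItemPointerData tid;
+
+	ItemPointerSet(&tid, 0, PG_UINT16_MAX);
+
+	ItemPointerInc(&tid);
+	Assert(ItemPointerGetBlockNumberNoCheck(&tid) == 1);
+	Assert(ItemPointerGetOffsetNumberNoCheck(&tid) == 0);
+
+	ItemPointerDec(&tid);
+	Assert(ItemPointerGetBlockNumberNoCheck(&tid) == 0);
+	Assert(ItemPointerGetOffsetNumberNoCheck(&tid) == PG_UINT16_MAX);
+}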
diff --git a/src/backend/storage/page/meson.build b/src/backend/storage/page/meson.build
new file mode 100644
index 0000000..2160a37
--- /dev/null
+++ b/src/backend/storage/page/meson.build
@@ -0,0 +1,7 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'bufpage.c',
+ 'checksum.c',
+ 'itemptr.c',
+)
diff --git a/src/backend/storage/smgr/Makefile b/src/backend/storage/smgr/Makefile
new file mode 100644
index 0000000..596b564
--- /dev/null
+++ b/src/backend/storage/smgr/Makefile
@@ -0,0 +1,19 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/smgr
+#
+# IDENTIFICATION
+# src/backend/storage/smgr/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/smgr
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ md.o \
+ smgr.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/smgr/README b/src/backend/storage/smgr/README
new file mode 100644
index 0000000..cf3aa56
--- /dev/null
+++ b/src/backend/storage/smgr/README
@@ -0,0 +1,52 @@
+src/backend/storage/smgr/README
+
+Storage Managers
+================
+
+In the original Berkeley Postgres system, there were several storage managers,
+of which only the "magnetic disk" manager remains. (At Berkeley there were
+also managers for the Sony WORM optical disk jukebox and persistent main
+memory, but these were never supported in any externally released Postgres,
+nor in any version of PostgreSQL.) The "magnetic disk" manager is itself
+seriously misnamed, because actually it supports any kind of device for
+which the operating system provides standard filesystem operations; which
+these days is pretty much everything of interest. However, we retain the
+notion of a storage manager switch in case anyone ever wants to reintroduce
+other kinds of storage managers. Removing the switch layer would save
+nothing noticeable anyway, since storage-access operations are surely far
+more expensive than one extra layer of C function calls.
+
+In Berkeley Postgres each relation was tagged with the ID of the storage
+manager to use for it. This is gone. It would probably be more reasonable
+to associate storage managers with tablespaces, should we ever re-introduce
+multiple storage managers into the system catalogs.
+
+The files in this directory, and their contents, are
+
+ smgr.c The storage manager switch dispatch code. The routines in
+ this file call the appropriate storage manager to do storage
+ accesses requested by higher-level code. smgr.c also manages
+ the file handle cache (SMgrRelation table).
+
+ md.c The "magnetic disk" storage manager, which is really just
+ an interface to the kernel's filesystem operations.
+
+Note that md.c in turn relies on src/backend/storage/file/fd.c.
+
+
+Relation Forks
+==============
+
+Since 8.4, a single smgr relation can be comprised of multiple physical
+files, called relation forks. This allows storing additional metadata like
+Free Space information in additional forks, which can be grown and truncated
+independently of the main data file, while still treating it all as a single
+physical relation in system catalogs.
+
+It is assumed that the main fork, fork number 0 or MAIN_FORKNUM, always
+exists. Fork numbers are assigned in src/include/common/relpath.h.
+Functions in smgr.c and md.c take an extra fork number argument, in addition
+to relfilelocator and block number, to identify which relation fork you want to
+access. Since most code wants to access the main fork, a shortcut version of
+ReadBuffer that accesses MAIN_FORKNUM is provided in the buffer manager for
+convenience.
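+
+As a quick reference, the valid fork numbers and the file-name suffixes they
+map to look like this (an illustrative summary only; the authoritative
+definitions are the ForkNumber enum and forkNames[] in
+src/include/common/relpath.h and src/common/relpath.c):
+
+	MAIN_FORKNUM = 0,			/* "<relfilenumber>": main data */
+	FSM_FORKNUM = 1,			/* "<relfilenumber>_fsm": free space map */
+	VISIBILITYMAP_FORKNUM = 2,	/* "<relfilenumber>_vm": visibility map */
+	INIT_FORKNUM = 3			/* "<relfilenumber>_init": unlogged-rel init fork */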
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
new file mode 100644
index 0000000..fdecbad
--- /dev/null
+++ b/src/backend/storage/smgr/md.c
@@ -0,0 +1,1623 @@
+/*-------------------------------------------------------------------------
+ *
+ * md.c
+ * This code manages relations that reside on magnetic disk.
+ *
+ * Or at least, that was what the Berkeley folk had in mind when they named
+ * this file. In reality, what this code provides is an interface from
+ * the smgr API to Unix-like filesystem APIs, so it will work with any type
+ * of device for which the operating system provides filesystem support.
+ * It doesn't matter whether the bits are on spinning rust or some other
+ * storage technology.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/smgr/md.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/md.h"
+#include "storage/relfilelocator.h"
+#include "storage/smgr.h"
+#include "storage/sync.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+
+/*
+ * The magnetic disk storage manager keeps track of open file
+ * descriptors in its own descriptor pool. This is done to make it
+ * easier to support relations that are larger than the operating
+ * system's file size limit (often 2GBytes). In order to do that,
+ * we break relations up into "segment" files that are each shorter than
+ * the OS file size limit. The segment size is set by the RELSEG_SIZE
+ * configuration constant in pg_config.h.
+ *
+ * On disk, a relation must consist of consecutively numbered segment
+ * files in the pattern
+ * -- Zero or more full segments of exactly RELSEG_SIZE blocks each
+ * -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
+ * -- Optionally, any number of inactive segments of size 0 blocks.
+ * The full and partial segments are collectively the "active" segments.
+ * Inactive segments are those that once contained data but are currently
+ * not needed because of an mdtruncate() operation. The reason for leaving
+ * them present at size zero, rather than unlinking them, is that other
+ * backends and/or the checkpointer might be holding open file references to
+ * such segments. If the relation expands again after mdtruncate(), such
+ * that a deactivated segment becomes active again, it is important that
+ * such file references still be valid --- else data might get written
+ * out to an unlinked old copy of a segment file that will eventually
+ * disappear.
+ *
+ * File descriptors are stored in the per-fork md_seg_fds arrays inside
+ * SMgrRelation. The length of these arrays is stored in md_num_open_segs.
+ * Note that a fork's md_num_open_segs having a specific value does not
+ * necessarily mean the relation doesn't have additional segments; we may
+ * just not have opened the next segment yet. (We could not have "all
+ * segments are in the array" as an invariant anyway, since another backend
+ * could extend the relation while we aren't looking.) We do not have
+ * entries for inactive segments, however; as soon as we find a partial
+ * segment, we assume that any subsequent segments are inactive.
+ *
+ * The entire MdfdVec array is palloc'd in the MdCxt memory context.
+ */
+
+typedef struct _MdfdVec
+{
+ File mdfd_vfd; /* fd number in fd.c's pool */
+ BlockNumber mdfd_segno; /* segment number, from 0 */
+} MdfdVec;
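+
+/*
+ * Illustrative sketch of the segment arithmetic described above (the helper
+ * name md_block_location is made up for explanation; _mdfd_getseg() and the
+ * seek-position computations in mdread()/mdwrite() below are the real
+ * thing): a block number maps to a segment number and a byte offset within
+ * that segment's file.  With the default RELSEG_SIZE of 131072 blocks
+ * (1 GB of 8 kB blocks), block 200000 lives in segment 1 at byte offset
+ * (200000 - 131072) * BLCKSZ.
+ */
+static inline void
+md_block_location(BlockNumber blocknum, BlockNumber *segno, off_t *seekpos)
+{
+	*segno = blocknum / ((BlockNumber) RELSEG_SIZE);
+	*seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+}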
+
+static MemoryContext MdCxt; /* context for all MdfdVec objects */
+
+
+/* Populate a file tag describing an md.c segment file. */
+#define INIT_MD_FILETAG(a,xx_rlocator,xx_forknum,xx_segno) \
+( \
+ memset(&(a), 0, sizeof(FileTag)), \
+ (a).handler = SYNC_HANDLER_MD, \
+ (a).rlocator = (xx_rlocator), \
+ (a).forknum = (xx_forknum), \
+ (a).segno = (xx_segno) \
+)
+
+
+/*** behavior for mdopen & _mdfd_getseg ***/
+/* ereport if segment not present */
+#define EXTENSION_FAIL (1 << 0)
+/* return NULL if segment not present */
+#define EXTENSION_RETURN_NULL (1 << 1)
+/* create new segments as needed */
+#define EXTENSION_CREATE (1 << 2)
+/* create new segments if needed during recovery */
+#define EXTENSION_CREATE_RECOVERY (1 << 3)
+/*
+ * Allow opening segments which are preceded by segments smaller than
+ * RELSEG_SIZE, e.g. inactive segments (see above). Note that this breaks
+ * mdnblocks() and related functionality henceforth - which currently is ok,
+ * because this is only required in the checkpointer which never uses
+ * mdnblocks().
+ */
+#define EXTENSION_DONT_CHECK_SIZE (1 << 4)
+/* don't try to open a segment, if not already open */
+#define EXTENSION_DONT_OPEN (1 << 5)
+
+
+/* local routines */
+static void mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ bool isRedo);
+static MdfdVec *mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior);
+static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
+ MdfdVec *seg);
+static void register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno);
+static void register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno);
+static void _fdvec_resize(SMgrRelation reln,
+ ForkNumber forknum,
+ int nseg);
+static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber segno);
+static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber segno, int oflags);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blkno, bool skipFsync, int behavior);
+static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
+ MdfdVec *seg);
+
+static inline int
+_mdfd_open_flags(void)
+{
+ int flags = O_RDWR | PG_BINARY;
+
+ if (io_direct_flags & IO_DIRECT_DATA)
+ flags |= PG_O_DIRECT;
+
+ return flags;
+}
+
+/*
+ * mdinit() -- Initialize private state for magnetic disk storage manager.
+ */
+void
+mdinit(void)
+{
+ MdCxt = AllocSetContextCreate(TopMemoryContext,
+ "MdSmgr",
+ ALLOCSET_DEFAULT_SIZES);
+}
+
+/*
+ * mdexists() -- Does the physical file exist?
+ *
+ * Note: this will return true for lingering files, with pending deletions
+ */
+bool
+mdexists(SMgrRelation reln, ForkNumber forknum)
+{
+ /*
+ * Close it first, to ensure that we notice if the fork has been unlinked
+ * since we opened it. As an optimization, we can skip that in recovery,
+ * which already closes relations when dropping them.
+ */
+ if (!InRecovery)
+ mdclose(reln, forknum);
+
+ return (mdopenfork(reln, forknum, EXTENSION_RETURN_NULL) != NULL);
+}
+
+/*
+ * mdcreate() -- Create a new relation on magnetic disk.
+ *
+ * If isRedo is true, it's okay for the relation to exist already.
+ */
+void
+mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+ MdfdVec *mdfd;
+ char *path;
+ File fd;
+
+ if (isRedo && reln->md_num_open_segs[forknum] > 0)
+ return; /* created and opened already... */
+
+ Assert(reln->md_num_open_segs[forknum] == 0);
+
+ /*
+ * We may be using the target table space for the first time in this
+ * database, so create a per-database subdirectory if needed.
+ *
+ * XXX this is a fairly ugly violation of module layering, but this seems
+ * to be the best place to put the check. Maybe TablespaceCreateDbspace
+ * should be here and not in commands/tablespace.c? But that would imply
+ * importing a lot of stuff that smgr.c oughtn't know, either.
+ */
+ TablespaceCreateDbspace(reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ isRedo);
+
+ path = relpath(reln->smgr_rlocator, forknum);
+
+ fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
+
+ if (fd < 0)
+ {
+ int save_errno = errno;
+
+ if (isRedo)
+ fd = PathNameOpenFile(path, _mdfd_open_flags());
+ if (fd < 0)
+ {
+ /* be sure to report the error reported by create, not open */
+ errno = save_errno;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create file \"%s\": %m", path)));
+ }
+ }
+
+ pfree(path);
+
+ _fdvec_resize(reln, forknum, 1);
+ mdfd = &reln->md_seg_fds[forknum][0];
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
+
+ if (!SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, mdfd);
+}
+
+/*
+ * mdunlink() -- Unlink a relation.
+ *
+ * Note that we're passed a RelFileLocatorBackend --- by the time this is called,
+ * there won't be an SMgrRelation hashtable entry anymore.
+ *
+ * forknum can be a fork number to delete a specific fork, or InvalidForkNumber
+ * to delete all forks.
+ *
+ * For regular relations, we don't unlink the first segment file of the rel,
+ * but just truncate it to zero length, and record a request to unlink it after
+ * the next checkpoint. Additional segments can be unlinked immediately,
+ * however. Leaving the empty file in place prevents that relfilenumber
+ * from being reused. The scenario this protects us from is:
+ * 1. We delete a relation (and commit, and actually remove its file).
+ * 2. We create a new relation, which by chance gets the same relfilenumber as
+ * the just-deleted one (OIDs must've wrapped around for that to happen).
+ * 3. We crash before another checkpoint occurs.
+ * During replay, we would delete the file and then recreate it, which is fine
+ * if the contents of the file were repopulated by subsequent WAL entries.
+ * But if we didn't WAL-log insertions, but instead relied on fsyncing the
+ * file after populating it (as we do at wal_level=minimal), the contents of
+ * the file would be lost forever. By leaving the empty file until after the
+ * next checkpoint, we prevent reassignment of the relfilenumber until it's
+ * safe, because relfilenumber assignment skips over any existing file.
+ *
+ * Additional segments, if any, are truncated and then unlinked. The reason
+ * for truncating is that other backends may still hold open FDs for these at
+ * the smgr level, so that the kernel can't remove the file yet. We want to
+ * reclaim the disk space right away despite that.
+ *
+ * We do not need to go through this dance for temp relations, though, because
+ * we never make WAL entries for temp rels, and so a temp rel poses no threat
+ * to the health of a regular rel that has taken over its relfilenumber.
+ * The fact that temp rels and regular rels have different file naming
+ * patterns provides additional safety. Other backends shouldn't have open
+ * FDs for them, either.
+ *
+ * We also don't do it while performing a binary upgrade. There is no reuse
+ * hazard in that case, since after a crash or even a simple ERROR, the
+ * upgrade fails and the whole cluster must be recreated from scratch.
+ * Furthermore, it is important to remove the files from disk immediately,
+ * because we may be about to reuse the same relfilenumber.
+ *
+ * All the above applies only to the relation's main fork; other forks can
+ * just be removed immediately, since they are not needed to prevent the
+ * relfilenumber from being recycled. Also, we do not carefully
+ * track whether other forks have been created or not, but just attempt to
+ * unlink them unconditionally; so we should never complain about ENOENT.
+ *
+ * If isRedo is true, it's unsurprising for the relation to be already gone.
+ * Also, we should remove the file immediately instead of queuing a request
+ * for later, since during redo there's no possibility of creating a
+ * conflicting relation.
+ *
+ * Note: we currently just never warn about ENOENT at all. We could warn in
+ * the main-fork, non-isRedo case, but it doesn't seem worth the trouble.
+ *
+ * Note: any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
+ */
+void
+mdunlink(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+{
+ /* Now do the per-fork work */
+ if (forknum == InvalidForkNumber)
+ {
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ mdunlinkfork(rlocator, forknum, isRedo);
+ }
+ else
+ mdunlinkfork(rlocator, forknum, isRedo);
+}
+
+/*
+ * Truncate a file to release disk space.
+ */
+static int
+do_truncate(const char *path)
+{
+ int save_errno;
+ int ret;
+
+ ret = pg_truncate(path, 0);
+
+ /* Log a warning here to avoid repetition in callers. */
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m", path)));
+ errno = save_errno;
+ }
+
+ return ret;
+}
+
+static void
+mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo)
+{
+ char *path;
+ int ret;
+ int save_errno;
+
+ path = relpath(rlocator, forknum);
+
+ /*
+ * Truncate and then unlink the first segment, or just register a request
+ * to unlink it later, as described in the comments for mdunlink().
+ */
+ if (isRedo || IsBinaryUpgrade || forknum != MAIN_FORKNUM ||
+ RelFileLocatorBackendIsTemp(rlocator))
+ {
+ if (!RelFileLocatorBackendIsTemp(rlocator))
+ {
+ /* Prevent other backends' fds from holding on to the disk space */
+ ret = do_truncate(path);
+
+ /* Forget any pending sync requests for the first segment */
+ save_errno = errno;
+ register_forget_request(rlocator, forknum, 0 /* first seg */ );
+ errno = save_errno;
+ }
+ else
+ ret = 0;
+
+ /* Next unlink the file, unless it was already found to be missing */
+ if (ret >= 0 || errno != ENOENT)
+ {
+ ret = unlink(path);
+ if (ret < 0 && errno != ENOENT)
+ {
+ save_errno = errno;
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ errno = save_errno;
+ }
+ }
+ }
+ else
+ {
+ /* Prevent other backends' fds from holding on to the disk space */
+ ret = do_truncate(path);
+
+ /* Register request to unlink first segment later */
+ save_errno = errno;
+ register_unlink_segment(rlocator, forknum, 0 /* first seg */ );
+ errno = save_errno;
+ }
+
+ /*
+ * Delete any additional segments.
+ *
+ * Note that because we loop until getting ENOENT, we will correctly
+ * remove all inactive segments as well as active ones. Ideally we'd
+ * continue the loop until getting exactly that errno, but that risks an
+ * infinite loop if the problem is directory-wide (for instance, if we
+ * suddenly can't read the data directory itself). We compromise by
+ * continuing after a non-ENOENT truncate error, but stopping after any
+ * unlink error. If there is indeed a directory-wide problem, additional
+ * unlink attempts wouldn't work anyway.
+ */
+ if (ret >= 0 || errno != ENOENT)
+ {
+ char *segpath = (char *) palloc(strlen(path) + 12);
+ BlockNumber segno;
+
+ for (segno = 1;; segno++)
+ {
+ sprintf(segpath, "%s.%u", path, segno);
+
+ if (!RelFileLocatorBackendIsTemp(rlocator))
+ {
+ /*
+ * Prevent other backends' fds from holding on to the disk
+ * space. We're done if we see ENOENT, though.
+ */
+ if (do_truncate(segpath) < 0 && errno == ENOENT)
+ break;
+
+ /*
+ * Forget any pending sync requests for this segment before we
+ * try to unlink.
+ */
+ register_forget_request(rlocator, forknum, segno);
+ }
+
+ if (unlink(segpath) < 0)
+ {
+ /* ENOENT is expected after the last segment... */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", segpath)));
+ break;
+ }
+ }
+ pfree(segpath);
+ }
+
+ pfree(path);
+}
+
+/*
+ * mdextend() -- Add a block to the specified relation.
+ *
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber. (Note that this failure should be unreachable
+ * because of upstream checks in bufmgr.c.)
+ */
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u blocks",
+ relpath(reln->smgr_rlocator, forknum),
+ InvalidBlockNumber)));
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+}
+
+/*
+ * mdzeroextend() -- Add new zeroed out blocks to the specified relation.
+ *
+ * Similar to mdextend(), except the relation can be extended by multiple
+ * blocks at once and the added blocks will be filled with zeroes.
+ */
+void
+mdzeroextend(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync)
+{
+ MdfdVec *v;
+ BlockNumber curblocknum = blocknum;
+ int remblocks = nblocks;
+
+ Assert(nblocks > 0);
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln, forknum));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
+ * more --- we mustn't create a block whose number actually is
+ * InvalidBlockNumber or larger.
+ */
+ if ((uint64) blocknum + nblocks >= (uint64) InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend file \"%s\" beyond %u blocks",
+ relpath(reln->smgr_rlocator, forknum),
+ InvalidBlockNumber)));
+
+ while (remblocks > 0)
+ {
+ BlockNumber segstartblock = curblocknum % ((BlockNumber) RELSEG_SIZE);
+ off_t seekpos = (off_t) BLCKSZ * segstartblock;
+ int numblocks;
+
+ if (segstartblock + remblocks > RELSEG_SIZE)
+ numblocks = RELSEG_SIZE - segstartblock;
+ else
+ numblocks = remblocks;
+
+ v = _mdfd_getseg(reln, forknum, curblocknum, skipFsync, EXTENSION_CREATE);
+
+ Assert(segstartblock < RELSEG_SIZE);
+ Assert(segstartblock + numblocks <= RELSEG_SIZE);
+
+ /*
+ * If available and useful, use posix_fallocate() (via
+ * FileFallocate()) to extend the relation. That's often more
+ * efficient than using write(), as it commonly won't cause the kernel
+ * to allocate page cache space for the extended pages.
+ *
+ * However, we don't use FileFallocate() for small extensions, as it
+ * defeats delayed allocation on some filesystems. It's not clear where
+ * that decision should be made, though. For now just use a cutoff of
+ * 8; anything between 4 and 8 worked OK in some local testing.
+ */
+ if (numblocks > 8)
+ {
+ int ret;
+
+ ret = FileFallocate(v->mdfd_vfd,
+ seekpos, (off_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret != 0)
+ {
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file \"%s\" with FileFallocate(): %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space."));
+ }
+ }
+ else
+ {
+ int ret;
+
+ /*
+ * Even if we don't want to use fallocate, we can still extend a
+ * bit more efficiently than writing each 8kB block individually.
+ * pg_pwrite_zeros() (via FileZero()) uses pg_pwritev_with_retry()
+ * to avoid multiple writes or needing a zeroed buffer for the
+ * whole length of the extension.
+ */
+ ret = FileZero(v->mdfd_vfd,
+ seekpos, (off_t) BLCKSZ * numblocks,
+ WAIT_EVENT_DATA_FILE_EXTEND);
+ if (ret < 0)
+ ereport(ERROR,
+ errcode_for_file_access(),
+ errmsg("could not extend file \"%s\": %m",
+ FilePathName(v->mdfd_vfd)),
+ errhint("Check free disk space."));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+ remblocks -= numblocks;
+ curblocknum += numblocks;
+ }
+}
+
+/*
+ * mdopenfork() -- Open one fork of the specified relation.
+ *
+ * Note we only open the first segment, when there are multiple segments.
+ *
+ * If first segment is not present, either ereport or return NULL according
+ * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
+ * EXTENSION_CREATE means it's OK to extend an existing relation, not to
+ * invent one out of whole cloth.
+ */
+static MdfdVec *
+mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
+{
+ MdfdVec *mdfd;
+ char *path;
+ File fd;
+
+ /* No work if already open */
+ if (reln->md_num_open_segs[forknum] > 0)
+ return &reln->md_seg_fds[forknum][0];
+
+ path = relpath(reln->smgr_rlocator, forknum);
+
+ fd = PathNameOpenFile(path, _mdfd_open_flags());
+
+ if (fd < 0)
+ {
+ if ((behavior & EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ {
+ pfree(path);
+ return NULL;
+ }
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\": %m", path)));
+ }
+
+ pfree(path);
+
+ _fdvec_resize(reln, forknum, 1);
+ mdfd = &reln->md_seg_fds[forknum][0];
+ mdfd->mdfd_vfd = fd;
+ mdfd->mdfd_segno = 0;
+
+ Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
+
+ return mdfd;
+}
+
+/*
+ * mdopen() -- Initialize newly-opened relation.
+ */
+void
+mdopen(SMgrRelation reln)
+{
+ /* mark it not open */
+ for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ reln->md_num_open_segs[forknum] = 0;
+}
+
+/*
+ * mdclose() -- Close the specified relation, if it isn't closed already.
+ */
+void
+mdclose(SMgrRelation reln, ForkNumber forknum)
+{
+ int nopensegs = reln->md_num_open_segs[forknum];
+
+ /* No work if already closed */
+ if (nopensegs == 0)
+ return;
+
+ /* close segments starting from the end */
+ while (nopensegs > 0)
+ {
+ MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1];
+
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, nopensegs - 1);
+ nopensegs--;
+ }
+}
+
+/*
+ * mdprefetch() -- Initiate asynchronous read of the specified block of a relation
+ */
+bool
+mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+#ifdef USE_PREFETCH
+ off_t seekpos;
+ MdfdVec *v;
+
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
+ if (v == NULL)
+ return false;
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ, WAIT_EVENT_DATA_FILE_PREFETCH);
+#endif /* USE_PREFETCH */
+
+ return true;
+}
+
+/*
+ * mdread() -- Read the specified block from a relation.
+ */
+void
+mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ void *buffer)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
+ TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ reln->smgr_rlocator.backend);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, false,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_READ);
+
+ TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ reln->smgr_rlocator.backend,
+ nbytes,
+ BLCKSZ);
+
+ if (nbytes != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+
+ /*
+ * Short read: we are at or past EOF, or we read a partial block at
+ * EOF. Normally this is an error; upper levels should never try to
+ * read a nonexistent block. However, if zero_damaged_pages is ON or
+ * we are InRecovery, we should instead return zeroes without
+ * complaining. This allows, for example, the case of trying to
+ * update a block that was later truncated away.
+ */
+ if (zero_damaged_pages || InRecovery)
+ MemSet(buffer, 0, BLCKSZ);
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
+ blocknum, FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ)));
+ }
+}
+
+/*
+ * mdwrite() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use mdextend().
+ */
+void
+mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ off_t seekpos;
+ int nbytes;
+ MdfdVec *v;
+
+ /* If this build supports direct I/O, the buffer must be I/O aligned. */
+ if (PG_O_DIRECT != 0 && PG_IO_ALIGN_SIZE <= BLCKSZ)
+ Assert((uintptr_t) buffer == TYPEALIGN(PG_IO_ALIGN_SIZE, buffer));
+
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum < mdnblocks(reln, forknum));
+#endif
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ reln->smgr_rlocator.backend);
+
+ v = _mdfd_getseg(reln, forknum, blocknum, skipFsync,
+ EXTENSION_FAIL | EXTENSION_CREATE_RECOVERY);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
+
+ nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_WRITE);
+
+ TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
+ reln->smgr_rlocator.locator.spcOid,
+ reln->smgr_rlocator.locator.dbOid,
+ reln->smgr_rlocator.locator.relNumber,
+ reln->smgr_rlocator.backend,
+ nbytes,
+ BLCKSZ);
+
+ if (nbytes != BLCKSZ)
+ {
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u in file \"%s\": %m",
+ blocknum, FilePathName(v->mdfd_vfd))));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
+ blocknum,
+ FilePathName(v->mdfd_vfd),
+ nbytes, BLCKSZ),
+ errhint("Check free disk space.")));
+ }
+
+ if (!skipFsync && !SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+}
+
+/*
+ * mdwriteback() -- Tell the kernel to write pages back to storage.
+ *
+ * This accepts a range of blocks because flushing several pages at once is
+ * considerably more efficient than doing so individually.
+ */
+void
+mdwriteback(SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks)
+{
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
+ /*
+ * Issue flush requests in as few requests as possible; have to split at
+ * segment boundaries though, since those are actually separate files.
+ */
+ while (nblocks > 0)
+ {
+ BlockNumber nflush = nblocks;
+ off_t seekpos;
+ MdfdVec *v;
+ int segnum_start,
+ segnum_end;
+
+ v = _mdfd_getseg(reln, forknum, blocknum, true /* not used */ ,
+ EXTENSION_DONT_OPEN);
+
+ /*
+		 * We might be flushing buffers of already-removed relations; that's
+		 * OK, just ignore that case. If the segment file wasn't open already
+ * (ie from a recent mdwrite()), then we don't want to re-open it, to
+ * avoid a race with PROCSIGNAL_BARRIER_SMGRRELEASE that might leave
+ * us with a descriptor to a file that is about to be unlinked.
+ */
+ if (!v)
+ return;
+
+		/* compute the segment number holding the first block of the range */
+		segnum_start = blocknum / RELSEG_SIZE;
+
+		/*
+		 * Compute the segment number holding the last block of the range; if
+		 * the range crosses a segment boundary, flush only up to the end of
+		 * the current segment this time through the loop.
+		 */
+		segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE;
+		if (segnum_start != segnum_end)
+			nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ Assert(nflush >= 1);
+ Assert(nflush <= nblocks);
+
+ seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
+
+ FileWriteback(v->mdfd_vfd, seekpos, (off_t) BLCKSZ * nflush, WAIT_EVENT_DATA_FILE_FLUSH);
+
+ nblocks -= nflush;
+ blocknum += nflush;
+ }
+}
+
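+/*
+ * Editor's note (illustrative only, not part of this patch): with the default
+ * RELSEG_SIZE of 131072 blocks, a writeback request for blocknum = 131000 and
+ * nblocks = 200 is split by the loop above into two FileWriteback() calls:
+ * 72 blocks (131000..131071) against segment 0, then 128 blocks
+ * (131072..131199) against segment 1, since each segment is a separate file.
+ */
+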
+/*
+ * mdnblocks() -- Get the number of blocks stored in a relation.
+ *
+ * Important side effect: all active segments of the relation are opened
+ * and added to the md_seg_fds array. If this routine has not been
+ * called, then only segments up to the last one actually touched
+ * are present in the array.
+ */
+BlockNumber
+mdnblocks(SMgrRelation reln, ForkNumber forknum)
+{
+ MdfdVec *v;
+ BlockNumber nblocks;
+ BlockNumber segno;
+
+ mdopenfork(reln, forknum, EXTENSION_FAIL);
+
+ /* mdopen has opened the first segment */
+ Assert(reln->md_num_open_segs[forknum] > 0);
+
+ /*
+	 * Start from the last open segment, to avoid redundant seeks. We have
+ * previously verified that these segments are exactly RELSEG_SIZE long,
+ * and it's useless to recheck that each time.
+ *
+ * NOTE: this assumption could only be wrong if another backend has
+ * truncated the relation. We rely on higher code levels to handle that
+ * scenario by closing and re-opening the md fd, which is handled via
+ * relcache flush. (Since the checkpointer doesn't participate in
+ * relcache flush, it could have segment entries for inactive segments;
+ * that's OK because the checkpointer never needs to compute relation
+ * size.)
+ */
+ segno = reln->md_num_open_segs[forknum] - 1;
+ v = &reln->md_seg_fds[forknum][segno];
+
+ for (;;)
+ {
+ nblocks = _mdnblocks(reln, forknum, v);
+ if (nblocks > ((BlockNumber) RELSEG_SIZE))
+ elog(FATAL, "segment too big");
+ if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
+
+ /*
+ * If segment is exactly RELSEG_SIZE, advance to next one.
+ */
+ segno++;
+
+ /*
+ * We used to pass O_CREAT here, but that has the disadvantage that it
+ * might create a segment which has vanished through some operating
+ * system misadventure. In such a case, creating the segment here
+ * undermines _mdfd_getseg's attempts to notice and report an error
+ * upon access to a missing segment.
+ */
+ v = _mdfd_openseg(reln, forknum, segno, 0);
+ if (v == NULL)
+ return segno * ((BlockNumber) RELSEG_SIZE);
+ }
+}
+
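+/*
+ * Editor's note (illustrative only, not part of this patch): if segments 0
+ * and 1 are both full (RELSEG_SIZE = 131072 blocks each) and segment 2
+ * currently holds 500 blocks, the loop above stops at segment 2 and reports
+ * 2 * 131072 + 500 = 262644 blocks for the fork.
+ */
+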
+/*
+ * mdtruncate() -- Truncate relation to specified number of blocks.
+ */
+void
+mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
+{
+ BlockNumber curnblk;
+ BlockNumber priorblocks;
+ int curopensegs;
+
+ /*
+ * NOTE: mdnblocks makes sure we have opened all active segments, so that
+	 * the truncation loop will get them all!
+ */
+ curnblk = mdnblocks(reln, forknum);
+ if (nblocks > curnblk)
+ {
+ /* Bogus request ... but no complaint if InRecovery */
+ if (InRecovery)
+ return;
+ ereport(ERROR,
+ (errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
+ relpath(reln->smgr_rlocator, forknum),
+ nblocks, curnblk)));
+ }
+ if (nblocks == curnblk)
+ return; /* no work */
+
+ /*
+ * Truncate segments, starting at the last one. Starting at the end makes
+ * managing the memory for the fd array easier, should there be errors.
+ */
+ curopensegs = reln->md_num_open_segs[forknum];
+ while (curopensegs > 0)
+ {
+ MdfdVec *v;
+
+ priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+ v = &reln->md_seg_fds[forknum][curopensegs - 1];
+
+ if (priorblocks > nblocks)
+ {
+ /*
+ * This segment is no longer active. We truncate the file, but do
+ * not delete it, for reasons explained in the header comments.
+ */
+ if (FileTruncate(v->mdfd_vfd, 0, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\": %m",
+ FilePathName(v->mdfd_vfd))));
+
+ if (!SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+
+ /* we never drop the 1st segment */
+ Assert(v != &reln->md_seg_fds[forknum][0]);
+
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, curopensegs - 1);
+ }
+ else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
+ {
+ /*
+ * This is the last segment we want to keep. Truncate the file to
+ * the right length. NOTE: if nblocks is exactly a multiple K of
+ * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+ * keep it. This adheres to the invariant given in the header
+ * comments.
+ */
+ BlockNumber lastsegblocks = nblocks - priorblocks;
+
+ if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ, WAIT_EVENT_DATA_FILE_TRUNCATE) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate file \"%s\" to %u blocks: %m",
+ FilePathName(v->mdfd_vfd),
+ nblocks)));
+ if (!SmgrIsTemp(reln))
+ register_dirty_segment(reln, forknum, v);
+ }
+ else
+ {
+ /*
+ * We still need this segment, so nothing to do for this and any
+ * earlier segment.
+ */
+ break;
+ }
+ curopensegs--;
+ }
+}
+
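+/*
+ * Editor's note (illustrative only, not part of this patch): truncating a
+ * fork of 262644 blocks (segments 0 and 1 full, segment 2 holding 500
+ * blocks) down to nblocks = 150000 exercises each branch above in turn:
+ * segment 2 starts at block 262144 > 150000, so it is truncated to zero
+ * length and closed; segment 1 starts at block 131072 and is cut back to
+ * 150000 - 131072 = 18928 blocks; segment 0 is still entirely needed, so the
+ * loop breaks.
+ */
+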
+/*
+ * mdimmedsync() -- Immediately sync a relation to stable storage.
+ *
+ * Note that only writes already issued are synced; this routine knows
+ * nothing of dirty buffers that may exist inside the buffer manager. We
+ * sync active and inactive segments; smgrDoPendingSyncs() relies on this.
+ * Consider a relation skipping WAL. Suppose a checkpoint syncs blocks of
+ * some segment, then mdtruncate() renders that segment inactive. If we
+ * crash before the next checkpoint syncs the newly-inactive segment, that
+ * segment may survive recovery, reintroducing unwanted data into the table.
+ */
+void
+mdimmedsync(SMgrRelation reln, ForkNumber forknum)
+{
+ int segno;
+ int min_inactive_seg;
+
+ /*
+ * NOTE: mdnblocks makes sure we have opened all active segments, so that
+	 * the fsync loop will get them all!
+ */
+ mdnblocks(reln, forknum);
+
+ min_inactive_seg = segno = reln->md_num_open_segs[forknum];
+
+ /*
+	 * Temporarily open inactive segments, then close them after sync. There
+	 * may be some inactive segments left open after an fsync() error, but
+	 * that is harmless: cleaning them up here would only risk further
+	 * trouble, and the next mdclose() will close them soon enough.
+ */
+ while (_mdfd_openseg(reln, forknum, segno, 0) != NULL)
+ segno++;
+
+ while (segno > 0)
+ {
+ MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1];
+
+ /*
+ * fsyncs done through mdimmedsync() should be tracked in a separate
+ * IOContext than those done through mdsyncfiletag() to differentiate
+ * between unavoidable client backend fsyncs (e.g. those done during
+ * index build) and those which ideally would have been done by the
+		 * checkpointer. Since other IO operations that bypass the buffer
+		 * manager could also be tracked in such an IOContext, hold off on
+		 * tracking immediate fsyncs until those are tracked as well.
+ */
+ if (FileSync(v->mdfd_vfd, WAIT_EVENT_DATA_FILE_IMMEDIATE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(v->mdfd_vfd))));
+
+ /* Close inactive segments immediately */
+ if (segno > min_inactive_seg)
+ {
+ FileClose(v->mdfd_vfd);
+ _fdvec_resize(reln, forknum, segno - 1);
+ }
+
+ segno--;
+ }
+}
+
+/*
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * ProcessSyncRequests to process later. Otherwise, try to pass off the
+ * fsync request to the checkpointer process. If that fails, just do the
+ * fsync locally before returning (we hope this will not happen often
+ * enough to be a performance problem).
+ */
+static void
+register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, reln->smgr_rlocator.locator, forknum, seg->mdfd_segno);
+
+ /* Temp relations should never be fsync'd */
+ Assert(!SmgrIsTemp(reln));
+
+ if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false /* retryOnError */ ))
+ {
+ instr_time io_start;
+
+ ereport(DEBUG1,
+ (errmsg_internal("could not forward fsync request because request queue is full")));
+
+ io_start = pgstat_prepare_io_time();
+
+ if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ FilePathName(seg->mdfd_vfd))));
+
+ /*
+ * We have no way of knowing if the current IOContext is
+ * IOCONTEXT_NORMAL or IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] at this
+ * point, so count the fsync as being in the IOCONTEXT_NORMAL
+ * IOContext. This is probably okay, because the number of backend
+ * fsyncs doesn't say anything about the efficacy of the
+ * BufferAccessStrategy. And counting both fsyncs done in
+ * IOCONTEXT_NORMAL and IOCONTEXT_[BULKREAD, BULKWRITE, VACUUM] under
+ * IOCONTEXT_NORMAL is likely clearer when investigating the number of
+ * backend fsyncs.
+ */
+ pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+ IOOP_FSYNC, io_start, 1);
+ }
+}
+
+/*
+ * register_unlink_segment() -- Schedule a file to be deleted after next checkpoint
+ */
+static void
+register_unlink_segment(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
+
+ /* Should never be used with temp relations */
+ Assert(!RelFileLocatorBackendIsTemp(rlocator));
+
+ RegisterSyncRequest(&tag, SYNC_UNLINK_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * register_forget_request() -- forget any fsyncs for a relation fork's segment
+ */
+static void
+register_forget_request(RelFileLocatorBackend rlocator, ForkNumber forknum,
+ BlockNumber segno)
+{
+ FileTag tag;
+
+ INIT_MD_FILETAG(tag, rlocator.locator, forknum, segno);
+
+ RegisterSyncRequest(&tag, SYNC_FORGET_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * ForgetDatabaseSyncRequests -- forget any fsyncs and unlinks for a DB
+ */
+void
+ForgetDatabaseSyncRequests(Oid dbid)
+{
+ FileTag tag;
+ RelFileLocator rlocator;
+
+ rlocator.dbOid = dbid;
+ rlocator.spcOid = 0;
+ rlocator.relNumber = 0;
+
+ INIT_MD_FILETAG(tag, rlocator, InvalidForkNumber, InvalidBlockNumber);
+
+ RegisterSyncRequest(&tag, SYNC_FILTER_REQUEST, true /* retryOnError */ );
+}
+
+/*
+ * DropRelationFiles -- drop files of all given relations
+ */
+void
+DropRelationFiles(RelFileLocator *delrels, int ndelrels, bool isRedo)
+{
+ SMgrRelation *srels;
+ int i;
+
+ srels = palloc(sizeof(SMgrRelation) * ndelrels);
+ for (i = 0; i < ndelrels; i++)
+ {
+ SMgrRelation srel = smgropen(delrels[i], InvalidBackendId);
+
+ if (isRedo)
+ {
+ ForkNumber fork;
+
+ for (fork = 0; fork <= MAX_FORKNUM; fork++)
+ XLogDropRelation(delrels[i], fork);
+ }
+ srels[i] = srel;
+ }
+
+ smgrdounlinkall(srels, ndelrels, isRedo);
+
+ for (i = 0; i < ndelrels; i++)
+ smgrclose(srels[i]);
+ pfree(srels);
+}
+
+
+/*
+ * _fdvec_resize() -- Resize the fork's open segments array
+ */
+static void
+_fdvec_resize(SMgrRelation reln,
+ ForkNumber forknum,
+ int nseg)
+{
+ if (nseg == 0)
+ {
+ if (reln->md_num_open_segs[forknum] > 0)
+ {
+ pfree(reln->md_seg_fds[forknum]);
+ reln->md_seg_fds[forknum] = NULL;
+ }
+ }
+ else if (reln->md_num_open_segs[forknum] == 0)
+ {
+ reln->md_seg_fds[forknum] =
+ MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+ }
+ else
+ {
+ /*
+ * It doesn't seem worthwhile complicating the code to amortize
+ * repalloc() calls. Those are far faster than PathNameOpenFile() or
+ * FileClose(), and the memory context internally will sometimes avoid
+ * doing an actual reallocation.
+ */
+ reln->md_seg_fds[forknum] =
+ repalloc(reln->md_seg_fds[forknum],
+ sizeof(MdfdVec) * nseg);
+ }
+
+ reln->md_num_open_segs[forknum] = nseg;
+}
+
+/*
+ * Return the filename for the specified segment of the relation. The
+ * returned string is palloc'd.
+ */
+static char *
+_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
+{
+ char *path,
+ *fullpath;
+
+ path = relpath(reln->smgr_rlocator, forknum);
+
+ if (segno > 0)
+ {
+ fullpath = psprintf("%s.%u", path, segno);
+ pfree(path);
+ }
+ else
+ fullpath = path;
+
+ return fullpath;
+}
+
+/*
+ * Open the specified segment of the relation,
+ * and make a MdfdVec object for it. Returns NULL on failure.
+ */
+static MdfdVec *
+_mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
+ int oflags)
+{
+ MdfdVec *v;
+ File fd;
+ char *fullpath;
+
+ fullpath = _mdfd_segpath(reln, forknum, segno);
+
+ /* open the file */
+ fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
+
+ pfree(fullpath);
+
+ if (fd < 0)
+ return NULL;
+
+ /*
+ * Segments are always opened in order from lowest to highest, so we must
+ * be adding a new one at the end.
+ */
+ Assert(segno == reln->md_num_open_segs[forknum]);
+
+ _fdvec_resize(reln, forknum, segno + 1);
+
+ /* fill the entry */
+ v = &reln->md_seg_fds[forknum][segno];
+ v->mdfd_vfd = fd;
+ v->mdfd_segno = segno;
+
+ Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
+
+ /* all done */
+ return v;
+}
+
+/*
+ * _mdfd_getseg() -- Find the segment of the relation holding the
+ * specified block.
+ *
+ * If the segment doesn't exist, we ereport, return NULL, or create the
+ * segment, according to "behavior". Note: skipFsync is only used in the
+ * EXTENSION_CREATE case.
+ */
+static MdfdVec *
+_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
+ bool skipFsync, int behavior)
+{
+ MdfdVec *v;
+ BlockNumber targetseg;
+ BlockNumber nextsegno;
+
+ /* some way to handle non-existent segments needs to be specified */
+ Assert(behavior &
+ (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL |
+ EXTENSION_DONT_OPEN));
+
+ targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
+
+ /* if an existing and opened segment, we're done */
+ if (targetseg < reln->md_num_open_segs[forknum])
+ {
+ v = &reln->md_seg_fds[forknum][targetseg];
+ return v;
+ }
+
+ /* The caller only wants the segment if we already had it open. */
+ if (behavior & EXTENSION_DONT_OPEN)
+ return NULL;
+
+ /*
+ * The target segment is not yet open. Iterate over all the segments
+ * between the last opened and the target segment. This way missing
+ * segments either raise an error, or get created (according to
+ * 'behavior'). Start with either the last opened, or the first segment if
+ * none was opened before.
+ */
+ if (reln->md_num_open_segs[forknum] > 0)
+ v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
+ else
+ {
+ v = mdopenfork(reln, forknum, behavior);
+ if (!v)
+ return NULL; /* if behavior & EXTENSION_RETURN_NULL */
+ }
+
+ for (nextsegno = reln->md_num_open_segs[forknum];
+ nextsegno <= targetseg; nextsegno++)
+ {
+ BlockNumber nblocks = _mdnblocks(reln, forknum, v);
+ int flags = 0;
+
+ Assert(nextsegno == v->mdfd_segno + 1);
+
+ if (nblocks > ((BlockNumber) RELSEG_SIZE))
+ elog(FATAL, "segment too big");
+
+ if ((behavior & EXTENSION_CREATE) ||
+ (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
+ {
+ /*
+ * Normally we will create new segments only if authorized by the
+ * caller (i.e., we are doing mdextend()). But when doing WAL
+ * recovery, create segments anyway; this allows cases such as
+ * replaying WAL data that has a write into a high-numbered
+ * segment of a relation that was later deleted. We want to go
+ * ahead and create the segments so we can finish out the replay.
+ *
+ * We have to maintain the invariant that segments before the last
+ * active segment are of size RELSEG_SIZE; therefore, if
+ * extending, pad them out with zeroes if needed. (This only
+ * matters if in recovery, or if the caller is extending the
+ * relation discontiguously, but that can happen in hash indexes.)
+ */
+ if (nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ char *zerobuf = palloc_aligned(BLCKSZ, PG_IO_ALIGN_SIZE,
+ MCXT_ALLOC_ZERO);
+
+ mdextend(reln, forknum,
+ nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+ zerobuf, skipFsync);
+ pfree(zerobuf);
+ }
+ flags = O_CREAT;
+ }
+ else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
+ nblocks < ((BlockNumber) RELSEG_SIZE))
+ {
+ /*
+ * When not extending (or explicitly including truncated
+ * segments), only open the next segment if the current one is
+ * exactly RELSEG_SIZE. If not (this branch), either return NULL
+ * or fail.
+ */
+ if (behavior & EXTENSION_RETURN_NULL)
+ {
+ /*
+ * Some callers discern between reasons for _mdfd_getseg()
+ * returning NULL based on errno. As there's no failing
+ * syscall involved in this case, explicitly set errno to
+ * ENOENT, as that seems the closest interpretation.
+ */
+ errno = ENOENT;
+ return NULL;
+ }
+
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+ _mdfd_segpath(reln, forknum, nextsegno),
+ blkno, nblocks)));
+ }
+
+ v = _mdfd_openseg(reln, forknum, nextsegno, flags);
+
+ if (v == NULL)
+ {
+ if ((behavior & EXTENSION_RETURN_NULL) &&
+ FILE_POSSIBLY_DELETED(errno))
+ return NULL;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open file \"%s\" (target block %u): %m",
+ _mdfd_segpath(reln, forknum, nextsegno),
+ blkno)));
+ }
+ }
+
+ return v;
+}
+
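+/*
+ * Editor's sketch (illustrative only, not part of this patch): a hypothetical
+ * caller that wants a missing segment reported as "absent" rather than as an
+ * error could use EXTENSION_RETURN_NULL and inspect errno, along these lines.
+ */
+#ifdef NOT_USED
+static bool
+md_segment_present_example(SMgrRelation reln, ForkNumber forknum,
+						   BlockNumber blkno)
+{
+	MdfdVec    *v;
+
+	v = _mdfd_getseg(reln, forknum, blkno, false, EXTENSION_RETURN_NULL);
+
+	/* NULL plus a "file is missing" errno means the segment is not there */
+	if (v == NULL && FILE_POSSIBLY_DELETED(errno))
+		return false;
+
+	return v != NULL;
+}
+#endif
+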
+/*
+ * Get number of blocks present in a single disk file
+ */
+static BlockNumber
+_mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
+{
+ off_t len;
+
+ len = FileSize(seg->mdfd_vfd);
+ if (len < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to end of file \"%s\": %m",
+ FilePathName(seg->mdfd_vfd))));
+ /* note that this calculation will ignore any partial block at EOF */
+ return (BlockNumber) (len / BLCKSZ);
+}
+
+/*
+ * Sync a file to disk, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdsyncfiletag(const FileTag *ftag, char *path)
+{
+ SMgrRelation reln = smgropen(ftag->rlocator, InvalidBackendId);
+ File file;
+ instr_time io_start;
+ bool need_to_close;
+ int result,
+ save_errno;
+
+ /* See if we already have the file open, or need to open it. */
+ if (ftag->segno < reln->md_num_open_segs[ftag->forknum])
+ {
+ file = reln->md_seg_fds[ftag->forknum][ftag->segno].mdfd_vfd;
+ strlcpy(path, FilePathName(file), MAXPGPATH);
+ need_to_close = false;
+ }
+ else
+ {
+ char *p;
+
+ p = _mdfd_segpath(reln, ftag->forknum, ftag->segno);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ file = PathNameOpenFile(path, _mdfd_open_flags());
+ if (file < 0)
+ return -1;
+ need_to_close = true;
+ }
+
+ io_start = pgstat_prepare_io_time();
+
+ /* Sync the file. */
+ result = FileSync(file, WAIT_EVENT_DATA_FILE_SYNC);
+ save_errno = errno;
+
+ if (need_to_close)
+ FileClose(file);
+
+ pgstat_count_io_op_time(IOOBJECT_RELATION, IOCONTEXT_NORMAL,
+ IOOP_FSYNC, io_start, 1);
+
+ errno = save_errno;
+ return result;
+}
+
+/*
+ * Unlink a file, given a file tag. Write the path into an output
+ * buffer so the caller can use it in error messages.
+ *
+ * Return 0 on success, -1 on failure, with errno set.
+ */
+int
+mdunlinkfiletag(const FileTag *ftag, char *path)
+{
+ char *p;
+
+ /* Compute the path. */
+ p = relpathperm(ftag->rlocator, MAIN_FORKNUM);
+ strlcpy(path, p, MAXPGPATH);
+ pfree(p);
+
+ /* Try to unlink the file. */
+ return unlink(path);
+}
+
+/*
+ * Check if a given candidate request matches a given tag, when processing
+ * a SYNC_FILTER_REQUEST request. This will be called for all pending
+ * requests to find out whether to forget them.
+ */
+bool
+mdfiletagmatches(const FileTag *ftag, const FileTag *candidate)
+{
+ /*
+ * For now we only use filter requests as a way to drop all scheduled
+ * callbacks relating to a given database, when dropping the database.
+ * We'll return true for all candidates that have the same database OID as
+ * the ftag from the SYNC_FILTER_REQUEST request, so they're forgotten.
+ */
+ return ftag->rlocator.dbOid == candidate->rlocator.dbOid;
+}
diff --git a/src/backend/storage/smgr/meson.build b/src/backend/storage/smgr/meson.build
new file mode 100644
index 0000000..e1ba6ed
--- /dev/null
+++ b/src/backend/storage/smgr/meson.build
@@ -0,0 +1,6 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'md.c',
+ 'smgr.c',
+)
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
new file mode 100644
index 0000000..5d0f3d5
--- /dev/null
+++ b/src/backend/storage/smgr/smgr.c
@@ -0,0 +1,767 @@
+/*-------------------------------------------------------------------------
+ *
+ * smgr.c
+ * public interface routines to storage manager switch.
+ *
+ * All file system operations in POSTGRES dispatch through these
+ * routines.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/smgr/smgr.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xlogutils.h"
+#include "lib/ilist.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/md.h"
+#include "storage/smgr.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+
+
+/*
+ * This struct of function pointers defines the API between smgr.c and
+ * any individual storage manager module. Note that smgr subfunctions are
+ * generally expected to report problems via elog(ERROR). An exception is
+ * that smgr_unlink should use elog(WARNING), rather than erroring out,
+ * because we normally unlink relations during post-commit/abort cleanup,
+ * and so it's too late to raise an error. Also, various conditions that
+ * would normally be errors should be allowed during bootstrap and/or WAL
+ * recovery --- see comments in md.c for details.
+ */
+typedef struct f_smgr
+{
+ void (*smgr_init) (void); /* may be NULL */
+ void (*smgr_shutdown) (void); /* may be NULL */
+ void (*smgr_open) (SMgrRelation reln);
+ void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
+ bool isRedo);
+ bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_unlink) (RelFileLocatorBackend rlocator, ForkNumber forknum,
+ bool isRedo);
+ void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_zeroextend) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, int nblocks, bool skipFsync);
+ bool (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum);
+ void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, void *buffer);
+ void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, const void *buffer, bool skipFsync);
+ void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber blocknum, BlockNumber nblocks);
+ BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
+ void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
+ BlockNumber nblocks);
+ void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
+} f_smgr;
+
+static const f_smgr smgrsw[] = {
+ /* magnetic disk */
+ {
+ .smgr_init = mdinit,
+ .smgr_shutdown = NULL,
+ .smgr_open = mdopen,
+ .smgr_close = mdclose,
+ .smgr_create = mdcreate,
+ .smgr_exists = mdexists,
+ .smgr_unlink = mdunlink,
+ .smgr_extend = mdextend,
+ .smgr_zeroextend = mdzeroextend,
+ .smgr_prefetch = mdprefetch,
+ .smgr_read = mdread,
+ .smgr_write = mdwrite,
+ .smgr_writeback = mdwriteback,
+ .smgr_nblocks = mdnblocks,
+ .smgr_truncate = mdtruncate,
+ .smgr_immedsync = mdimmedsync,
+ }
+};
+
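+/*
+ * Editor's note (illustrative only, not part of this patch): a second storage
+ * manager would be wired in by appending another designated-initializer entry
+ * to smgrsw and selecting it via reln->smgr_which in smgropen(). The callback
+ * names below are hypothetical and do not exist in this tree:
+ *
+ *	{
+ *		.smgr_open = inmem_open,
+ *		.smgr_read = inmem_read,
+ *		.smgr_write = inmem_write,
+ *		...
+ *	}
+ */
+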
+static const int NSmgr = lengthof(smgrsw);
+
+/*
+ * Each backend has a hashtable that stores all extant SMgrRelation objects.
+ * In addition, "unowned" SMgrRelation objects are chained together in a list.
+ */
+static HTAB *SMgrRelationHash = NULL;
+
+static dlist_head unowned_relns;
+
+/* local function prototypes */
+static void smgrshutdown(int code, Datum arg);
+
+
+/*
+ * smgrinit(), smgrshutdown() -- Initialize or shut down storage
+ * managers.
+ *
+ * Note: smgrinit is called during backend startup (normal or standalone
+ * case), *not* during postmaster start. Therefore, any resources created
+ * here or destroyed in smgrshutdown are backend-local.
+ */
+void
+smgrinit(void)
+{
+ int i;
+
+ for (i = 0; i < NSmgr; i++)
+ {
+ if (smgrsw[i].smgr_init)
+ smgrsw[i].smgr_init();
+ }
+
+ /* register the shutdown proc */
+ on_proc_exit(smgrshutdown, 0);
+}
+
+/*
+ * on_proc_exit hook for smgr cleanup during backend shutdown
+ */
+static void
+smgrshutdown(int code, Datum arg)
+{
+ int i;
+
+ for (i = 0; i < NSmgr; i++)
+ {
+ if (smgrsw[i].smgr_shutdown)
+ smgrsw[i].smgr_shutdown();
+ }
+}
+
+/*
+ * smgropen() -- Return an SMgrRelation object, creating it if need be.
+ *
+ * This does not attempt to actually open the underlying file.
+ */
+SMgrRelation
+smgropen(RelFileLocator rlocator, BackendId backend)
+{
+ RelFileLocatorBackend brlocator;
+ SMgrRelation reln;
+ bool found;
+
+ if (SMgrRelationHash == NULL)
+ {
+ /* First time through: initialize the hash table */
+ HASHCTL ctl;
+
+ ctl.keysize = sizeof(RelFileLocatorBackend);
+ ctl.entrysize = sizeof(SMgrRelationData);
+ SMgrRelationHash = hash_create("smgr relation table", 400,
+ &ctl, HASH_ELEM | HASH_BLOBS);
+ dlist_init(&unowned_relns);
+ }
+
+ /* Look up or create an entry */
+ brlocator.locator = rlocator;
+ brlocator.backend = backend;
+ reln = (SMgrRelation) hash_search(SMgrRelationHash,
+ &brlocator,
+ HASH_ENTER, &found);
+
+ /* Initialize it if not present before */
+ if (!found)
+ {
+ /* hash_search already filled in the lookup key */
+ reln->smgr_owner = NULL;
+ reln->smgr_targblock = InvalidBlockNumber;
+ for (int i = 0; i <= MAX_FORKNUM; ++i)
+ reln->smgr_cached_nblocks[i] = InvalidBlockNumber;
+ reln->smgr_which = 0; /* we only have md.c at present */
+
+ /* implementation-specific initialization */
+ smgrsw[reln->smgr_which].smgr_open(reln);
+
+ /* it has no owner yet */
+ dlist_push_tail(&unowned_relns, &reln->node);
+ }
+
+ return reln;
+}
+
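+/*
+ * Editor's sketch (illustrative only, not part of this patch): typical use of
+ * the smgr layer from a caller that already has a relation's file locator.
+ * The function name is hypothetical; opening is lazy, so the underlying file
+ * is not touched until the first real operation such as smgrnblocks().
+ */
+#ifdef NOT_USED
+static BlockNumber
+smgr_size_example(RelFileLocator rlocator)
+{
+	SMgrRelation reln = smgropen(rlocator, InvalidBackendId);
+
+	return smgrnblocks(reln, MAIN_FORKNUM);
+}
+#endif
+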
+/*
+ * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
+ *
+ * There can be only one owner at a time; this is sufficient since currently
+ * the only such owners exist in the relcache.
+ */
+void
+smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
+{
+ /* We don't support "disowning" an SMgrRelation here, use smgrclearowner */
+ Assert(owner != NULL);
+
+ /*
+ * First, unhook any old owner. (Normally there shouldn't be any, but it
+ * seems possible that this can happen during swap_relation_files()
+ * depending on the order of processing. It's ok to close the old
+ * relcache entry early in that case.)
+ *
+ * If there isn't an old owner, then the reln should be in the unowned
+ * list, and we need to remove it.
+ */
+ if (reln->smgr_owner)
+ *(reln->smgr_owner) = NULL;
+ else
+ dlist_delete(&reln->node);
+
+ /* Now establish the ownership relationship. */
+ reln->smgr_owner = owner;
+ *owner = reln;
+}
+
+/*
+ * smgrclearowner() -- Remove long-lived reference to an SMgrRelation object
+ * if one exists
+ */
+void
+smgrclearowner(SMgrRelation *owner, SMgrRelation reln)
+{
+ /* Do nothing if the SMgrRelation object is not owned by the owner */
+ if (reln->smgr_owner != owner)
+ return;
+
+ /* unset the owner's reference */
+ *owner = NULL;
+
+ /* unset our reference to the owner */
+ reln->smgr_owner = NULL;
+
+ /* add to list of unowned relations */
+ dlist_push_tail(&unowned_relns, &reln->node);
+}
+
+/*
+ * smgrexists() -- Does the underlying file for a fork exist?
+ */
+bool
+smgrexists(SMgrRelation reln, ForkNumber forknum)
+{
+ return smgrsw[reln->smgr_which].smgr_exists(reln, forknum);
+}
+
+/*
+ * smgrclose() -- Close and delete an SMgrRelation object.
+ */
+void
+smgrclose(SMgrRelation reln)
+{
+ SMgrRelation *owner;
+ ForkNumber forknum;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[reln->smgr_which].smgr_close(reln, forknum);
+
+ owner = reln->smgr_owner;
+
+ if (!owner)
+ dlist_delete(&reln->node);
+
+ if (hash_search(SMgrRelationHash,
+ &(reln->smgr_rlocator),
+ HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "SMgrRelation hashtable corrupted");
+
+ /*
+ * Unhook the owner pointer, if any. We do this last since in the remote
+ * possibility of failure above, the SMgrRelation object will still exist.
+ */
+ if (owner)
+ *owner = NULL;
+}
+
+/*
+ * smgrrelease() -- Release all resources used by this object.
+ *
+ * The object remains valid.
+ */
+void
+smgrrelease(SMgrRelation reln)
+{
+ for (ForkNumber forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ {
+ smgrsw[reln->smgr_which].smgr_close(reln, forknum);
+ reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+ }
+ reln->smgr_targblock = InvalidBlockNumber;
+}
+
+/*
+ * smgrreleaseall() -- Release resources used by all objects.
+ *
+ * This is called for PROCSIGNAL_BARRIER_SMGRRELEASE.
+ */
+void
+smgrreleaseall(void)
+{
+ HASH_SEQ_STATUS status;
+ SMgrRelation reln;
+
+ /* Nothing to do if hashtable not set up */
+ if (SMgrRelationHash == NULL)
+ return;
+
+ hash_seq_init(&status, SMgrRelationHash);
+
+ while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
+ smgrrelease(reln);
+}
+
+/*
+ * smgrcloseall() -- Close all existing SMgrRelation objects.
+ */
+void
+smgrcloseall(void)
+{
+ HASH_SEQ_STATUS status;
+ SMgrRelation reln;
+
+ /* Nothing to do if hashtable not set up */
+ if (SMgrRelationHash == NULL)
+ return;
+
+ hash_seq_init(&status, SMgrRelationHash);
+
+ while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
+ smgrclose(reln);
+}
+
+/*
+ * smgrcloserellocator() -- Close SMgrRelation object for given RelFileLocator,
+ * if one exists.
+ *
+ * This has the same effects as smgrclose(smgropen(rlocator)), but it avoids
+ * uselessly creating a hashtable entry only to drop it again when no
+ * such entry exists already.
+ */
+void
+smgrcloserellocator(RelFileLocatorBackend rlocator)
+{
+ SMgrRelation reln;
+
+ /* Nothing to do if hashtable not set up */
+ if (SMgrRelationHash == NULL)
+ return;
+
+ reln = (SMgrRelation) hash_search(SMgrRelationHash,
+ &rlocator,
+ HASH_FIND, NULL);
+ if (reln != NULL)
+ smgrclose(reln);
+}
+
+/*
+ * smgrcreate() -- Create a new relation.
+ *
+ * Given an already-created (but presumably unused) SMgrRelation,
+ * cause the underlying disk file or other storage for the fork
+ * to be created.
+ */
+void
+smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
+{
+ smgrsw[reln->smgr_which].smgr_create(reln, forknum, isRedo);
+}
+
+/*
+ * smgrdosyncall() -- Immediately sync all forks of all given relations
+ *
+ * All forks of all given relations are synced out to the store.
+ *
+ * This is equivalent to FlushRelationBuffers() for each smgr relation,
+ * then calling smgrimmedsync() for all forks of each relation, but it's
+ * significantly quicker so should be preferred when possible.
+ */
+void
+smgrdosyncall(SMgrRelation *rels, int nrels)
+{
+ int i = 0;
+ ForkNumber forknum;
+
+ if (nrels == 0)
+ return;
+
+ FlushRelationsAllBuffers(rels, nrels);
+
+ /*
+ * Sync the physical file(s).
+ */
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ {
+ if (smgrsw[which].smgr_exists(rels[i], forknum))
+ smgrsw[which].smgr_immedsync(rels[i], forknum);
+ }
+ }
+}
+
+/*
+ * smgrdounlinkall() -- Immediately unlink all forks of all given relations
+ *
+ * All forks of all given relations are removed from the store. This
+ * should not be used during transactional operations, since it can't be
+ * undone.
+ *
+ * If isRedo is true, it is okay for the underlying file(s) to be gone
+ * already.
+ */
+void
+smgrdounlinkall(SMgrRelation *rels, int nrels, bool isRedo)
+{
+ int i = 0;
+ RelFileLocatorBackend *rlocators;
+ ForkNumber forknum;
+
+ if (nrels == 0)
+ return;
+
+ /*
+ * Get rid of any remaining buffers for the relations. bufmgr will just
+ * drop them without bothering to write the contents.
+ */
+ DropRelationsAllBuffers(rels, nrels);
+
+ /*
+ * create an array which contains all relations to be dropped, and close
+ * each relation's forks at the smgr level while at it
+ */
+ rlocators = palloc(sizeof(RelFileLocatorBackend) * nrels);
+ for (i = 0; i < nrels; i++)
+ {
+ RelFileLocatorBackend rlocator = rels[i]->smgr_rlocator;
+ int which = rels[i]->smgr_which;
+
+ rlocators[i] = rlocator;
+
+ /* Close the forks at smgr level */
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[which].smgr_close(rels[i], forknum);
+ }
+
+ /*
+ * Send a shared-inval message to force other backends to close any
+ * dangling smgr references they may have for these rels. We should do
+ * this before starting the actual unlinking, in case we fail partway
+ * through that step. Note that the sinval messages will eventually come
+	 * back to this backend, too, and thereby provide a backstop ensuring
+	 * that we close our own smgr rel.
+ */
+ for (i = 0; i < nrels; i++)
+ CacheInvalidateSmgr(rlocators[i]);
+
+ /*
+ * Delete the physical file(s).
+ *
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
+ */
+
+ for (i = 0; i < nrels; i++)
+ {
+ int which = rels[i]->smgr_which;
+
+ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
+ smgrsw[which].smgr_unlink(rlocators[i], forknum, isRedo);
+ }
+
+ pfree(rlocators);
+}
+
+
+/*
+ * smgrextend() -- Add a new block to a file.
+ *
+ * The semantics are nearly the same as smgrwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
+ */
+void
+smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_extend(reln, forknum, blocknum,
+ buffer, skipFsync);
+
+ /*
+ * Normally we expect this to increase nblocks by one, but if the cached
+ * value isn't as expected, just invalidate it so the next call asks the
+ * kernel.
+ */
+ if (reln->smgr_cached_nblocks[forknum] == blocknum)
+ reln->smgr_cached_nblocks[forknum] = blocknum + 1;
+ else
+ reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
+
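+/*
+ * Editor's note (illustrative only, not part of this patch): if
+ * smgr_cached_nblocks[forknum] is 100 and the caller extends block 100, the
+ * cached value advances to 101; extending block 105 instead (a discontiguous
+ * extension, as hash indexes can produce) invalidates the cache, so the next
+ * smgrnblocks() call asks the kernel again.
+ */
+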
+/*
+ * smgrzeroextend() -- Add new zeroed out blocks to a file.
+ *
+ * Similar to smgrextend(), except the relation can be extended by
+ * multiple blocks at once and the added blocks will be filled with
+ * zeroes.
+ */
+void
+smgrzeroextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ int nblocks, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_zeroextend(reln, forknum, blocknum,
+ nblocks, skipFsync);
+
+ /*
+ * Normally we expect this to increase the fork size by nblocks, but if
+ * the cached value isn't as expected, just invalidate it so the next call
+ * asks the kernel.
+ */
+ if (reln->smgr_cached_nblocks[forknum] == blocknum)
+ reln->smgr_cached_nblocks[forknum] = blocknum + nblocks;
+ else
+ reln->smgr_cached_nblocks[forknum] = InvalidBlockNumber;
+}
+
+/*
+ * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
+ *
+ * In recovery only, this can return false to indicate that a file
+ * doesn't exist (presumably it has been dropped by a later WAL
+ * record).
+ */
+bool
+smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
+{
+ return smgrsw[reln->smgr_which].smgr_prefetch(reln, forknum, blocknum);
+}
+
+/*
+ * smgrread() -- read a particular block from a relation into the supplied
+ * buffer.
+ *
+ * This routine is called from the buffer manager in order to
+ * instantiate pages in the shared buffer cache. All storage managers
+ * return pages in the format that POSTGRES expects.
+ */
+void
+smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ void *buffer)
+{
+ smgrsw[reln->smgr_which].smgr_read(reln, forknum, blocknum, buffer);
+}
+
+/*
+ * smgrwrite() -- Write the supplied buffer out.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use smgrextend().
+ *
+ * This is not a synchronous write -- the block is not necessarily
+ * on disk at return, only dumped out to the kernel. However,
+ * provisions will be made to fsync the write before the next checkpoint.
+ *
+ * skipFsync indicates that the caller will make other provisions to
+ * fsync the relation, so we needn't bother. Temporary relations also
+ * do not require fsync.
+ */
+void
+smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ const void *buffer, bool skipFsync)
+{
+ smgrsw[reln->smgr_which].smgr_write(reln, forknum, blocknum,
+ buffer, skipFsync);
+}
+
+
+/*
+ * smgrwriteback() -- Trigger kernel writeback for the supplied range of
+ * blocks.
+ */
+void
+smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
+ BlockNumber nblocks)
+{
+ smgrsw[reln->smgr_which].smgr_writeback(reln, forknum, blocknum,
+ nblocks);
+}
+
+/*
+ * smgrnblocks() -- Calculate the number of blocks in the
+ * supplied relation.
+ */
+BlockNumber
+smgrnblocks(SMgrRelation reln, ForkNumber forknum)
+{
+ BlockNumber result;
+
+	/* Return the cached number of blocks, if we have it. */
+ result = smgrnblocks_cached(reln, forknum);
+ if (result != InvalidBlockNumber)
+ return result;
+
+ result = smgrsw[reln->smgr_which].smgr_nblocks(reln, forknum);
+
+ reln->smgr_cached_nblocks[forknum] = result;
+
+ return result;
+}
+
+/*
+ * smgrnblocks_cached() -- Get the cached number of blocks in the supplied
+ * relation.
+ *
+ * Returns InvalidBlockNumber when not in recovery, or when the relation
+ * fork size is not cached.
+ */
+BlockNumber
+smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
+{
+ /*
+ * For now, we only use cached values in recovery due to lack of a shared
+ * invalidation mechanism for changes in file size.
+ */
+ if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
+ return reln->smgr_cached_nblocks[forknum];
+
+ return InvalidBlockNumber;
+}
+
+/*
+ * smgrtruncate() -- Truncate the given forks of the supplied relation,
+ *					 each to its specified number of blocks
+ *
+ * The truncation is done immediately, so this can't be rolled back.
+ *
+ * The caller must hold AccessExclusiveLock on the relation, to ensure that
+ * other backends receive the smgr invalidation event that this function sends
+ * before they access any forks of the relation again.
+ */
+void
+smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nblocks)
+{
+ int i;
+
+ /*
+ * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
+ * just drop them without bothering to write the contents.
+ */
+ DropRelationBuffers(reln, forknum, nforks, nblocks);
+
+ /*
+ * Send a shared-inval message to force other backends to close any smgr
+ * references they may have for this rel. This is useful because they
+ * might have open file pointers to segments that got removed, and/or
+ * smgr_targblock variables pointing past the new rel end. (The inval
+ * message will come back to our backend, too, causing a
+ * probably-unnecessary local smgr flush. But we don't expect that this
+ * is a performance-critical path.) As in the unlink code, we want to be
+ * sure the message is sent before we start changing things on-disk.
+ */
+ CacheInvalidateSmgr(reln->smgr_rlocator);
+
+ /* Do the truncation */
+ for (i = 0; i < nforks; i++)
+ {
+		/* Invalidate the cached size, in case we encounter an error below. */
+ reln->smgr_cached_nblocks[forknum[i]] = InvalidBlockNumber;
+
+ smgrsw[reln->smgr_which].smgr_truncate(reln, forknum[i], nblocks[i]);
+
+ /*
+		 * We might as well update the local smgr_cached_nblocks values. The
+		 * smgr cache inval message that this function sent will cause other
+		 * backends to invalidate their cached sizes (and ours too, at the
+		 * next command boundary), but updating the values here keeps our
+		 * local copies from being outright wrong until then.
+ */
+ reln->smgr_cached_nblocks[forknum[i]] = nblocks[i];
+ }
+}
+
+/*
+ * smgrimmedsync() -- Force the specified relation to stable storage.
+ *
+ * Synchronously force all previous writes to the specified relation
+ * down to disk.
+ *
+ * This is useful for building completely new relations (eg, new
+ * indexes). Instead of incrementally WAL-logging the index build
+ * steps, we can just write completed index pages to disk with smgrwrite
+ * or smgrextend, and then fsync the completed index file before
+ * committing the transaction. (This is sufficient for purposes of
+ * crash recovery, since it effectively duplicates forcing a checkpoint
+ * for the completed index. But it is *not* sufficient if one wishes
+ * to use the WAL log for PITR or replication purposes: in that case
+ * we have to make WAL entries as well.)
+ *
+ * The preceding writes should specify skipFsync = true to avoid
+ * duplicative fsyncs.
+ *
+ * Note that you need to do FlushRelationBuffers() first if there is
+ * any possibility that there are dirty buffers for the relation;
+ * otherwise the sync is not very meaningful.
+ */
+void
+smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
+{
+ smgrsw[reln->smgr_which].smgr_immedsync(reln, forknum);
+}
+
+/*
+ * AtEOXact_SMgr
+ *
+ * This routine is called during transaction commit or abort (it doesn't
+ * particularly care which). All transient SMgrRelation objects are closed.
+ *
+ * We do this as a compromise between wanting transient SMgrRelations to
+ * live awhile (to amortize the costs of blind writes of multiple blocks)
+ * and needing them to not live forever (since we're probably holding open
+ * a kernel file descriptor for the underlying file, and we need to ensure
+ * that gets closed reasonably soon if the file gets deleted).
+ */
+void
+AtEOXact_SMgr(void)
+{
+ dlist_mutable_iter iter;
+
+ /*
+ * Zap all unowned SMgrRelations. We rely on smgrclose() to remove each
+ * one from the list.
+ */
+ dlist_foreach_modify(iter, &unowned_relns)
+ {
+ SMgrRelation rel = dlist_container(SMgrRelationData, node,
+ iter.cur);
+
+ Assert(rel->smgr_owner == NULL);
+
+ smgrclose(rel);
+ }
+}
+
+/*
+ * This routine is called when we are ordered to release all open files by a
+ * ProcSignalBarrier.
+ */
+bool
+ProcessBarrierSmgrRelease(void)
+{
+ smgrreleaseall();
+ return true;
+}
diff --git a/src/backend/storage/sync/Makefile b/src/backend/storage/sync/Makefile
new file mode 100644
index 0000000..be88b44
--- /dev/null
+++ b/src/backend/storage/sync/Makefile
@@ -0,0 +1,18 @@
+#-------------------------------------------------------------------------
+#
+# Makefile--
+# Makefile for storage/sync
+#
+# IDENTIFICATION
+# src/backend/storage/sync/Makefile
+#
+#-------------------------------------------------------------------------
+
+subdir = src/backend/storage/sync
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+OBJS = \
+ sync.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/sync/meson.build b/src/backend/storage/sync/meson.build
new file mode 100644
index 0000000..1b49f16
--- /dev/null
+++ b/src/backend/storage/sync/meson.build
@@ -0,0 +1,6 @@
+# Copyright (c) 2022-2023, PostgreSQL Global Development Group
+
+backend_sources += files(
+ 'sync.c',
+
+)
diff --git a/src/backend/storage/sync/sync.c b/src/backend/storage/sync/sync.c
new file mode 100644
index 0000000..04fcb06
--- /dev/null
+++ b/src/backend/storage/sync/sync.c
@@ -0,0 +1,624 @@
+/*-------------------------------------------------------------------------
+ *
+ * sync.c
+ * File synchronization management code.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/sync/sync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <unistd.h>
+#include <fcntl.h>
+#include <sys/file.h>
+
+#include "access/commit_ts.h"
+#include "access/clog.h"
+#include "access/multixact.h"
+#include "access/xlog.h"
+#include "access/xlogutils.h"
+#include "commands/tablespace.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/instr_time.h"
+#include "postmaster/bgwriter.h"
+#include "storage/bufmgr.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/md.h"
+#include "utils/hsearch.h"
+#include "utils/inval.h"
+#include "utils/memutils.h"
+
+/*
+ * In some contexts (currently, standalone backends and the checkpointer)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint. This hash
+ * table remembers the pending operations. We use a hash table mostly as
+ * a convenient way of merging duplicate requests.
+ *
+ * We use a similar mechanism to remember no-longer-needed files that can
+ * be deleted after the next checkpoint, but we use a linked list instead of
+ * a hash table, because we don't expect there to be any duplicate requests.
+ *
+ * These mechanisms are only used for non-temp relations; we never fsync
+ * temp rels, nor do we need to postpone their deletion (see comments in
+ * mdunlink).
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the checkpointer.)
+ */
+typedef uint16 CycleCtr; /* can be any convenient integer size */
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* sync_cycle_ctr of oldest request */
+ bool canceled; /* canceled is true if we canceled "recently" */
+} PendingFsyncEntry;
+
+typedef struct
+{
+ FileTag tag; /* identifies handler and file */
+ CycleCtr cycle_ctr; /* checkpoint_cycle_ctr when request was made */
+ bool canceled; /* true if request has been canceled */
+} PendingUnlinkEntry;
+
+static HTAB *pendingOps = NULL;
+static List *pendingUnlinks = NIL;
+static MemoryContext pendingOpsCxt; /* context for the above */
+
+static CycleCtr sync_cycle_ctr = 0;
+static CycleCtr checkpoint_cycle_ctr = 0;
+
+/* Intervals for calling AbsorbSyncRequests */
+#define FSYNCS_PER_ABSORB 10
+#define UNLINKS_PER_ABSORB 10
+
+/*
+ * Function pointers for handling sync and unlink requests.
+ */
+typedef struct SyncOps
+{
+ int (*sync_syncfiletag) (const FileTag *ftag, char *path);
+ int (*sync_unlinkfiletag) (const FileTag *ftag, char *path);
+ bool (*sync_filetagmatches) (const FileTag *ftag,
+ const FileTag *candidate);
+} SyncOps;
+
+/*
+ * These indexes must correspond to the values of the SyncRequestHandler enum.
+ */
+static const SyncOps syncsw[] = {
+ /* magnetic disk */
+ [SYNC_HANDLER_MD] = {
+ .sync_syncfiletag = mdsyncfiletag,
+ .sync_unlinkfiletag = mdunlinkfiletag,
+ .sync_filetagmatches = mdfiletagmatches
+ },
+ /* pg_xact */
+ [SYNC_HANDLER_CLOG] = {
+ .sync_syncfiletag = clogsyncfiletag
+ },
+ /* pg_commit_ts */
+ [SYNC_HANDLER_COMMIT_TS] = {
+ .sync_syncfiletag = committssyncfiletag
+ },
+ /* pg_multixact/offsets */
+ [SYNC_HANDLER_MULTIXACT_OFFSET] = {
+ .sync_syncfiletag = multixactoffsetssyncfiletag
+ },
+ /* pg_multixact/members */
+ [SYNC_HANDLER_MULTIXACT_MEMBER] = {
+ .sync_syncfiletag = multixactmemberssyncfiletag
+ }
+};
+
+/*
+ * Initialize data structures for the file sync tracking.
+ */
+void
+InitSync(void)
+{
+ /*
+ * Create pending-operations hashtable if we need it. Currently, we need
+ * it if we are standalone (not under a postmaster) or if we are a
+ * checkpointer auxiliary process.
+ */
+ if (!IsUnderPostmaster || AmCheckpointerProcess())
+ {
+ HASHCTL hash_ctl;
+
+ /*
+ * XXX: The checkpointer needs to add entries to the pending ops table
+ * when absorbing fsync requests. That is done within a critical
+ * section, which isn't usually allowed, but we make an exception. It
+ * means that there's a theoretical possibility that you run out of
+ * memory while absorbing fsync requests, which leads to a PANIC.
+ * Fortunately the hash table is small so that's unlikely to happen in
+ * practice.
+ */
+ pendingOpsCxt = AllocSetContextCreate(TopMemoryContext,
+ "Pending ops context",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(pendingOpsCxt, true);
+
+ hash_ctl.keysize = sizeof(FileTag);
+ hash_ctl.entrysize = sizeof(PendingFsyncEntry);
+ hash_ctl.hcxt = pendingOpsCxt;
+ pendingOps = hash_create("Pending Ops Table",
+ 100L,
+ &hash_ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ pendingUnlinks = NIL;
+ }
+}
+
+/*
+ * SyncPreCheckpoint() -- Do pre-checkpoint work
+ *
+ * To distinguish unlink requests that arrived before this checkpoint
+ * started from those that arrived during the checkpoint, we use a cycle
+ * counter similar to the one we use for fsync requests. That cycle
+ * counter is incremented here.
+ *
+ * This must be called *before* the checkpoint REDO point is determined.
+ * That ensures that we won't delete files too soon. Since this calls
+ * AbsorbSyncRequests(), which performs memory allocations, it cannot be
+ * called within a critical section.
+ *
+ * Note that we can't do anything here that depends on the assumption
+ * that the checkpoint will be completed.
+ */
+void
+SyncPreCheckpoint(void)
+{
+ /*
+ * Operations such as DROP TABLESPACE assume that the next checkpoint will
+ * process all recently forwarded unlink requests, but if they aren't
+ * absorbed prior to advancing the cycle counter, they won't be processed
+ * until a future checkpoint. The following absorb ensures that any
+ * unlink requests forwarded before the checkpoint began will be processed
+ * in the current checkpoint.
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * Any unlink requests arriving after this point will be assigned the next
+ * cycle counter, and won't be unlinked until next checkpoint.
+ */
+ checkpoint_cycle_ctr++;
+}
+
+/*
+ * SyncPostCheckpoint() -- Do post-checkpoint work
+ *
+ * Remove any lingering files that can now be safely removed.
+ */
+void
+SyncPostCheckpoint(void)
+{
+ int absorb_counter;
+ ListCell *lc;
+
+ absorb_counter = UNLINKS_PER_ABSORB;
+ foreach(lc, pendingUnlinks)
+ {
+ PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(lc);
+ char path[MAXPGPATH];
+
+ /* Skip over any canceled entries */
+ if (entry->canceled)
+ continue;
+
+ /*
+ * New entries are appended to the end, so if the entry is new we've
+ * reached the end of old entries.
+ *
+ * Note: if just the right number of consecutive checkpoints fail, we
+ * could be fooled here by cycle_ctr wraparound. However, the only
+ * consequence is that we'd delay unlinking for one more checkpoint,
+ * which is perfectly tolerable.
+ */
+ if (entry->cycle_ctr == checkpoint_cycle_ctr)
+ break;
+
+ /* Unlink the file */
+ if (syncsw[entry->tag.handler].sync_unlinkfiletag(&entry->tag,
+ path) < 0)
+ {
+ /*
+ * There's a race condition, when the database is dropped at the
+ * same time that we process the pending unlink requests. If the
+ * DROP DATABASE deletes the file before we do, we will get ENOENT
+ * here. rmtree() also has to ignore ENOENT errors, to deal with
+ * the possibility that we delete the file first.
+ */
+ if (errno != ENOENT)
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove file \"%s\": %m", path)));
+ }
+
+ /* Mark the list entry as canceled, just in case */
+ entry->canceled = true;
+
+ /*
+ * As in ProcessSyncRequests, we don't want to stop absorbing fsync
+ * requests for a long time when there are many deletions to be done.
+ * We can safely call AbsorbSyncRequests() at this point in the loop.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = UNLINKS_PER_ABSORB;
+ }
+ }
+
+ /*
+ * If we reached the end of the list, we can just remove the whole list
+ * (remembering to pfree all the PendingUnlinkEntry objects). Otherwise,
+ * we must keep the entries at or after "lc".
+ */
+ if (lc == NULL)
+ {
+ list_free_deep(pendingUnlinks);
+ pendingUnlinks = NIL;
+ }
+ else
+ {
+ int ntodelete = list_cell_number(pendingUnlinks, lc);
+
+ for (int i = 0; i < ntodelete; i++)
+ pfree(list_nth(pendingUnlinks, i));
+
+ pendingUnlinks = list_delete_first_n(pendingUnlinks, ntodelete);
+ }
+}
+
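+/*
+ * Editor's note (illustrative only, not part of this patch): if
+ * pendingUnlinks held ten entries and the loop above stopped at the fourth
+ * one (the first entry stamped with the current checkpoint_cycle_ctr),
+ * list_cell_number() returns 3, so the three already-processed entries are
+ * pfree'd and dropped by list_delete_first_n(), leaving the newer seven for
+ * the next checkpoint.
+ */
+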
+/*
+ * ProcessSyncRequests() -- Process queued fsync requests.
+ */
+void
+ProcessSyncRequests(void)
+{
+ static bool sync_in_progress = false;
+
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *entry;
+ int absorb_counter;
+
+ /* Statistics on sync times */
+ int processed = 0;
+ instr_time sync_start,
+ sync_end,
+ sync_diff;
+ uint64 elapsed;
+ uint64 longest = 0;
+ uint64 total_elapsed = 0;
+
+ /*
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOps.
+ */
+ if (!pendingOps)
+	 * occur in processes that have created a pendingOps table.
+
+ /*
+ * If we are in the checkpointer, the sync had better include all fsync
+ * requests that were queued by backends up to this point. The tightest
+ * race condition that could occur is that a buffer that must be written
+ * and fsync'd for the checkpoint could have been dumped by a backend just
+ * before it was visited by BufferSync(). We know the backend will have
+ * queued an fsync request before clearing the buffer's dirtybit, so we
+ * are safe as long as we do an Absorb after completing BufferSync().
+ */
+ AbsorbSyncRequests();
+
+ /*
+ * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
+ * checkpoint), we want to ignore fsync requests that are entered into the
+ * hashtable after this point --- they should be processed next time,
+ * instead. We use sync_cycle_ctr to tell old entries apart from new
+ * ones: new ones will have cycle_ctr equal to the incremented value of
+ * sync_cycle_ctr.
+ *
+ * In normal circumstances, all entries present in the table at this point
+ * will have cycle_ctr exactly equal to the current (about to be old)
+ * value of sync_cycle_ctr. However, if we fail partway through the
+ * fsync'ing loop, then older values of cycle_ctr might remain when we
+ * come back here to try again. Repeated checkpoint failures would
+ * eventually wrap the counter around to the point where an old entry
+ * might appear new, causing us to skip it, possibly allowing a checkpoint
+ * to succeed that should not have. To forestall wraparound, any time the
+ * previous ProcessSyncRequests() failed to complete, run through the
+ * table and forcibly set cycle_ctr = sync_cycle_ctr.
+ *
+ * Think not to merge this loop with the main loop, as the problem is
+ * exactly that that loop may fail before having visited all the entries.
+ * From a performance point of view it doesn't matter anyway, as this path
+ * will never be taken in a system that's functioning normally.
+ */
+ if (sync_in_progress)
+ {
+ /* prior try failed, so update any stale cycle_ctr values */
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ }
+ }
+
+ /* Advance counter so that new hashtable entries are distinguishable */
+ sync_cycle_ctr++;
+
+ /* Set flag to detect failure if we don't reach the end of the loop */
+ sync_in_progress = true;
+
+ /* Now scan the hashtable for fsync requests to process */
+ absorb_counter = FSYNCS_PER_ABSORB;
+ hash_seq_init(&hstat, pendingOps);
+ while ((entry = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ int failures;
+
+ /*
+		 * If the entry is new then don't process it this time; leave it for
+		 * the next call. Note "continue" bypasses the hash-remove call at
+		 * the bottom of the loop.
+ */
+ if (entry->cycle_ctr == sync_cycle_ctr)
+ continue;
+
+ /* Else assert we haven't missed it */
+ Assert((CycleCtr) (entry->cycle_ctr + 1) == sync_cycle_ctr);
+
+ /*
+ * If fsync is off then we don't have to bother opening the file at
+ * all. (We delay checking until this point so that changing fsync on
+ * the fly behaves sensibly.)
+ */
+ if (enableFsync)
+ {
+ /*
+ * If in checkpointer, we want to absorb pending requests every so
+ * often to prevent overflow of the fsync request queue. It is
+ * unspecified whether newly-added entries will be visited by
+ * hash_seq_search, but we don't care since we don't need to
+ * process them anyway.
+ */
+ if (--absorb_counter <= 0)
+ {
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB;
+ }
+
+ /*
+ * The fsync table could contain requests to fsync segments that
+ * have been deleted (unlinked) by the time we get to them. Rather
+ * than just hoping an ENOENT (or EACCES on Windows) error can be
+ * ignored, what we do on error is absorb pending requests and
+ * then retry. Since mdunlink() queues a "cancel" message before
+ * actually unlinking, the fsync request is guaranteed to be
+ * marked canceled after the absorb if it really was this case.
+ * DROP DATABASE likewise has to tell us to forget fsync requests
+ * before it starts deletions.
+ */
+ for (failures = 0; !entry->canceled; failures++)
+ {
+ char path[MAXPGPATH];
+
+ INSTR_TIME_SET_CURRENT(sync_start);
+ if (syncsw[entry->tag.handler].sync_syncfiletag(&entry->tag,
+ path) == 0)
+ {
+ /* Success; update statistics about sync timing */
+ INSTR_TIME_SET_CURRENT(sync_end);
+ sync_diff = sync_end;
+ INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+ elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+ if (elapsed > longest)
+ longest = elapsed;
+ total_elapsed += elapsed;
+ processed++;
+
+ if (log_checkpoints)
+ elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f ms",
+ processed,
+ path,
+ (double) elapsed / 1000);
+
+ break; /* out of retry loop */
+ }
+
+ /*
+ * It is possible that the relation has been dropped or
+ * truncated since the fsync request was entered. Therefore,
+ * allow ENOENT, but only if we didn't fail already on this
+ * file.
+ */
+ if (!FILE_POSSIBLY_DELETED(errno) || failures > 0)
+ ereport(data_sync_elevel(ERROR),
+ (errcode_for_file_access(),
+ errmsg("could not fsync file \"%s\": %m",
+ path)));
+ else
+ ereport(DEBUG1,
+ (errcode_for_file_access(),
+ errmsg_internal("could not fsync file \"%s\" but retrying: %m",
+ path)));
+
+ /*
+ * Absorb incoming requests and check to see if a cancel
+ * arrived for this relation fork.
+ */
+ AbsorbSyncRequests();
+ absorb_counter = FSYNCS_PER_ABSORB; /* might as well... */
+ } /* end retry loop */
+ }
+
+ /* We are done with this entry, remove it */
+ if (hash_search(pendingOps, &entry->tag, HASH_REMOVE, NULL) == NULL)
+ elog(ERROR, "pendingOps corrupted");
+ } /* end loop over hashtable entries */
+
+ /* Return sync performance metrics for report at checkpoint end */
+ CheckpointStats.ckpt_sync_rels = processed;
+ CheckpointStats.ckpt_longest_sync = longest;
+ CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
+ /* Flag successful completion of ProcessSyncRequests */
+ sync_in_progress = false;
+}
+
+/*
+ * RememberSyncRequest() -- callback from checkpointer side of sync request
+ *
+ * We stuff fsync requests into the local hash table for execution
+ * during the checkpointer's next checkpoint. UNLINK requests go into a
+ * separate linked list, however, because they get processed separately.
+ *
+ * See sync.h for more information on the types of sync requests supported.
+ */
+void
+RememberSyncRequest(const FileTag *ftag, SyncRequestType type)
+{
+ Assert(pendingOps);
+
+ if (type == SYNC_FORGET_REQUEST)
+ {
+ PendingFsyncEntry *entry;
+
+ /* Cancel previously entered request */
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ ftag,
+ HASH_FIND,
+ NULL);
+ if (entry != NULL)
+ entry->canceled = true;
+ }
+ else if (type == SYNC_FILTER_REQUEST)
+ {
+ HASH_SEQ_STATUS hstat;
+ PendingFsyncEntry *pfe;
+ ListCell *cell;
+
+ /* Cancel matching fsync requests */
+ hash_seq_init(&hstat, pendingOps);
+ while ((pfe = (PendingFsyncEntry *) hash_seq_search(&hstat)) != NULL)
+ {
+ if (pfe->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &pfe->tag))
+ pfe->canceled = true;
+ }
+
+ /* Cancel matching unlink requests */
+ foreach(cell, pendingUnlinks)
+ {
+ PendingUnlinkEntry *pue = (PendingUnlinkEntry *) lfirst(cell);
+
+ if (pue->tag.handler == ftag->handler &&
+ syncsw[ftag->handler].sync_filetagmatches(ftag, &pue->tag))
+ pue->canceled = true;
+ }
+ }
+ else if (type == SYNC_UNLINK_REQUEST)
+ {
+ /* Unlink request: put it in the linked list */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingUnlinkEntry *entry;
+
+ entry = palloc(sizeof(PendingUnlinkEntry));
+ entry->tag = *ftag;
+ entry->cycle_ctr = checkpoint_cycle_ctr;
+ entry->canceled = false;
+
+ pendingUnlinks = lappend(pendingUnlinks, entry);
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+ else
+ {
+ /* Normal case: enter a request to fsync this segment */
+ MemoryContext oldcxt = MemoryContextSwitchTo(pendingOpsCxt);
+ PendingFsyncEntry *entry;
+ bool found;
+
+ Assert(type == SYNC_REQUEST);
+
+ entry = (PendingFsyncEntry *) hash_search(pendingOps,
+ ftag,
+ HASH_ENTER,
+ &found);
+ /* if new entry, or was previously canceled, initialize it */
+ if (!found || entry->canceled)
+ {
+ entry->cycle_ctr = sync_cycle_ctr;
+ entry->canceled = false;
+ }
+
+ /*
+ * NB: it's intentional that we don't change cycle_ctr if the entry
+ * already exists. The cycle_ctr must represent the oldest fsync
+ * request that could be in the entry.
+ */
+
+ MemoryContextSwitchTo(oldcxt);
+ }
+}
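+
+/*
+ * Editorial note (assumptions drawn from the md.c and dbcommands.c callers,
+ * not guaranteed by this file): the smgr layer is expected to send a
+ * SYNC_FORGET_REQUEST for a segment just before unlinking it, a
+ * SYNC_UNLINK_REQUEST to defer removal of a dropped relation's main file to
+ * the end of the next checkpoint, and DROP DATABASE a SYNC_FILTER_REQUEST to
+ * cancel every pending request for the doomed database in one shot.
+ */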
+
+/*
+ * Register the sync request locally, or forward it to the checkpointer.
+ *
+ * If retryOnError is true, we'll keep trying if there is no space in the
+ * queue. Return true if we succeeded, or false if there wasn't space.
+ */
+bool
+RegisterSyncRequest(const FileTag *ftag, SyncRequestType type,
+ bool retryOnError)
+{
+ bool ret;
+
+ if (pendingOps != NULL)
+ {
+ /* standalone backend or startup process: fsync state is local */
+ RememberSyncRequest(ftag, type);
+ return true;
+ }
+
+ for (;;)
+ {
+ /*
+ * Notify the checkpointer about it. If we fail to queue a message in
+ * retryOnError mode, we have to sleep and try again ... ugly, but
+ * hopefully won't happen often.
+ *
+ * XXX should we CHECK_FOR_INTERRUPTS in this loop? Escaping with an
+ * error in the case of SYNC_UNLINK_REQUEST would leave the
+ * no-longer-used file still present on disk, which would be bad, so
+ * I'm inclined to assume that the checkpointer will always empty the
+ * queue soon.
+ */
+ ret = ForwardSyncRequest(ftag, type);
+
+ /*
+ * If we are successful in queueing the request, or we failed and were
+ * instructed not to retry on error, break.
+ */
+ if (ret || !retryOnError)
+ break;
+
+ WaitLatch(NULL, WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, 10,
+ WAIT_EVENT_REGISTER_SYNC_REQUEST);
+ }
+
+ return ret;
+}
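+
+/*
+ * Illustrative caller (editorial sketch; INIT_MD_FILETAG, seg->mdfd_vfd and
+ * the FileSync() fallback are assumptions about the md.c side, not part of
+ * this module's API). A storage manager that has just written a dirty
+ * segment queues an fsync request and, if the request queue is full, falls
+ * back to fsync'ing the file itself:
+ *
+ * FileTag tag;
+ *
+ * INIT_MD_FILETAG(tag, reln->smgr_rnode.node, forknum, seg->mdfd_segno);
+ * if (!RegisterSyncRequest(&tag, SYNC_REQUEST, false))
+ * {
+ * if (FileSync(seg->mdfd_vfd, WAIT_EVENT_DATA_FILE_SYNC) < 0)
+ * ereport(data_sync_elevel(ERROR),
+ * (errcode_for_file_access(),
+ * errmsg("could not fsync file: %m")));
+ * }
+ */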