summaryrefslogtreecommitdiffstats
path: root/src/backend/storage/ipc
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 12:15:05 +0000
commit 46651ce6fe013220ed397add242004d764fc0153 (patch)
tree 6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/storage/ipc
parent Initial commit. (diff)
download postgresql-14-46651ce6fe013220ed397add242004d764fc0153.tar.xz
postgresql-14-46651ce6fe013220ed397add242004d764fc0153.zip
Adding upstream version 14.5. (refs: upstream/14.5, upstream)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage/ipc')
-rw-r--r-- src/backend/storage/ipc/Makefile 30
-rw-r--r-- src/backend/storage/ipc/barrier.c 333
-rw-r--r-- src/backend/storage/ipc/dsm.c 1248
-rw-r--r-- src/backend/storage/ipc/dsm_impl.c 1058
-rw-r--r-- src/backend/storage/ipc/ipc.c 435
-rw-r--r-- src/backend/storage/ipc/ipci.c 291
-rw-r--r-- src/backend/storage/ipc/latch.c 2158
-rw-r--r-- src/backend/storage/ipc/pmsignal.c 430
-rw-r--r-- src/backend/storage/ipc/procarray.c 5220
-rw-r--r-- src/backend/storage/ipc/procsignal.c 685
-rw-r--r-- src/backend/storage/ipc/shm_mq.c 1288
-rw-r--r-- src/backend/storage/ipc/shm_toc.c 272
-rw-r--r-- src/backend/storage/ipc/shmem.c 611
-rw-r--r-- src/backend/storage/ipc/shmqueue.c 190
-rw-r--r-- src/backend/storage/ipc/signalfuncs.c 300
-rw-r--r-- src/backend/storage/ipc/sinval.c 205
-rw-r--r-- src/backend/storage/ipc/sinvaladt.c 777
-rw-r--r-- src/backend/storage/ipc/standby.c 1450
18 files changed, 16981 insertions, 0 deletions
diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile
new file mode 100644
index 0000000..df90c6b
--- /dev/null
+++ b/src/backend/storage/ipc/Makefile
@@ -0,0 +1,30 @@
+#
+# Makefile for storage/ipc
+#
+# src/backend/storage/ipc/Makefile
+#
+
+subdir = src/backend/storage/ipc
+top_builddir = ../../../..
+include $(top_builddir)/src/Makefile.global
+
+# Object files for this subdirectory, kept in alphabetical order.
+# NOTE(review): presumably consumed by the common.mk rules included below -- confirm.
+OBJS = \
+ barrier.o \
+ dsm.o \
+ dsm_impl.o \
+ ipc.o \
+ ipci.o \
+ latch.o \
+ pmsignal.o \
+ procarray.o \
+ procsignal.o \
+ shm_mq.o \
+ shm_toc.o \
+ shmem.o \
+ shmqueue.o \
+ signalfuncs.o \
+ sinval.o \
+ sinvaladt.o \
+ standby.o
+
+include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c
new file mode 100644
index 0000000..5c05297
--- /dev/null
+++ b/src/backend/storage/ipc/barrier.c
@@ -0,0 +1,333 @@
+/*-------------------------------------------------------------------------
+ *
+ * barrier.c
+ * Barriers for synchronizing cooperating processes.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * From Wikipedia[1]: "In parallel computing, a barrier is a type of
+ * synchronization method. A barrier for a group of threads or processes in
+ * the source code means any thread/process must stop at this point and cannot
+ * proceed until all other threads/processes reach this barrier."
+ *
+ * This implementation of barriers allows for static sets of participants
+ * known up front, or dynamic sets of participants which processes can join or
+ * leave at any time. In the dynamic case, a phase number can be used to
+ * track progress through a parallel algorithm, and may be necessary to
+ * synchronize with the current phase of a multi-phase algorithm when a new
+ * participant joins. In the static case, the phase number is used
+ * internally, but it isn't strictly necessary for client code to access it
+ * because the phase can only advance when the declared number of participants
+ * reaches the barrier, so client code should be in no doubt about the current
+ * phase of computation at all times.
+ *
+ * Consider a parallel algorithm that involves separate phases of computation
+ * A, B and C where the output of each phase is needed before the next phase
+ * can begin.
+ *
+ * In the case of a static barrier initialized with 4 participants, each
+ * participant works on phase A, then calls BarrierArriveAndWait to wait until
+ * all 4 participants have reached that point. When BarrierArriveAndWait
+ * returns control, each participant can work on B, and so on. Because the
+ * barrier knows how many participants to expect, the phases of computation
+ * don't need labels or numbers, since each process's program counter implies
+ * the current phase. Even if some of the processes are slow to start up and
+ * begin running phase A, the other participants are expecting them and will
+ * patiently wait at the barrier. The code could be written as follows:
+ *
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ *
+ * If the number of participants is not known up front, then a dynamic barrier
+ * is needed and the number should be set to zero at initialization. New
+ * complications arise because the number necessarily changes over time as
+ * participants attach and detach, and therefore phases B, C or even the end
+ * of processing may be reached before any given participant has started
+ * running and attached. Therefore the client code must perform an initial
+ * test of the phase number after attaching, because it needs to find out
+ * which phase of the algorithm has been reached by any participants that are
+ * already attached in order to synchronize with that work. Once the program
+ * counter or some other representation of current progress is synchronized
+ * with the barrier's phase, normal control flow can be used just as in the
+ * static case. Our example could be written using a switch statement with
+ * cases that fall-through, as follows:
+ *
+ * phase = BarrierAttach(&barrier);
+ * switch (phase)
+ * {
+ * case PHASE_A:
+ * perform_a();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_B:
+ * perform_b();
+ * BarrierArriveAndWait(&barrier, ...);
+ * case PHASE_C:
+ * perform_c();
+ * BarrierArriveAndWait(&barrier, ...);
+ * }
+ * BarrierDetach(&barrier);
+ *
+ * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic
+ * barriers behave similarly to Java's java.util.concurrent.Phaser.
+ *
+ * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science)
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/barrier.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "storage/barrier.h"
+
+static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive);
+
+/*
+ * Initialize this barrier. To use a static party size, provide the number of
+ * participants to wait for at each phase indicating that that number of
+ * backends is implicitly attached. To use a dynamic party size, specify zero
+ * here and then use BarrierAttach() and
+ * BarrierDetach()/BarrierArriveAndDetach() to register and deregister
+ * participants explicitly.
+ */
+void
+BarrierInit(Barrier *barrier, int participants)
+{
+	/* A positive party size means the participant set is fixed up front. */
+	barrier->static_party = participants > 0;
+	barrier->participants = participants;
+
+	/* Start in phase 0 with nobody arrived and nobody elected. */
+	barrier->phase = 0;
+	barrier->arrived = 0;
+	barrier->elected = 0;
+
+	/* Set up the spinlock and the condition variable that waiters sleep on. */
+	SpinLockInit(&barrier->mutex);
+	ConditionVariableInit(&barrier->condition_variable);
+}
+
+/*
+ * Arrive at this barrier, wait for all other attached participants to arrive
+ * too and then return. Increments the current phase. The caller must be
+ * attached.
+ *
+ * While waiting, pg_stat_activity shows a wait_event_type and wait_event
+ * controlled by the wait_event_info passed in, which should be a value from
+ * one of the WaitEventXXX enums defined in pgstat.h.
+ *
+ * Return true in one arbitrarily chosen participant. Return false in all
+ * others. The return code can be used to elect one participant to execute a
+ * phase of work that must be done serially while other participants wait.
+ */
+bool
+BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info)
+{
+ bool release = false; /* becomes true once the phase has advanced */
+ bool elected;
+ int start_phase;
+ int next_phase;
+
+ SpinLockAcquire(&barrier->mutex);
+ start_phase = barrier->phase;
+ next_phase = start_phase + 1;
+ ++barrier->arrived; /* count our own arrival */
+ if (barrier->arrived == barrier->participants)
+ {
+ release = true;
+ barrier->arrived = 0; /* reset the count for the next phase */
+ barrier->phase = next_phase;
+ barrier->elected = next_phase; /* record that next_phase already has its elected backend (us) */
+ }
+ SpinLockRelease(&barrier->mutex);
+
+ /*
+ * If we were the last expected participant to arrive, we can release our
+ * peers and return true to indicate that this backend has been elected to
+ * perform any serial work.
+ */
+ if (release)
+ {
+ ConditionVariableBroadcast(&barrier->condition_variable);
+
+ return true;
+ }
+
+ /*
+ * Otherwise we have to wait for the last participant to arrive and
+ * advance the phase. We start out unelected; the loop below may elect
+ * us if the phase advanced without anybody claiming the election.
+ */
+ elected = false;
+ ConditionVariablePrepareToSleep(&barrier->condition_variable);
+ for (;;)
+ {
+ /*
+ * We know that phase must either be start_phase, indicating that we
+ * need to keep waiting, or next_phase, indicating that the last
+ * participant that we were waiting for has either arrived or detached
+ * so that the next phase has begun. The phase cannot advance any
+ * further than that without this backend's participation, because
+ * this backend is attached.
+ */
+ SpinLockAcquire(&barrier->mutex);
+ Assert(barrier->phase == start_phase || barrier->phase == next_phase);
+ release = barrier->phase == next_phase;
+ if (release && barrier->elected != next_phase)
+ {
+ /*
+ * Usually the backend that arrives last and releases the other
+ * backends is elected to return true (see above), so that it can
+ * begin processing serial work while it has a CPU timeslice.
+ * However, if the barrier advanced because someone detached, then
+ * one of the backends that is awoken will need to be elected.
+ * Writing barrier->elected under the mutex ensures that only the
+ * first awoken backend to get here claims the election.
+ */
+ barrier->elected = barrier->phase;
+ elected = true;
+ }
+ SpinLockRelease(&barrier->mutex);
+ if (release)
+ break;
+ ConditionVariableSleep(&barrier->condition_variable, wait_event_info);
+ }
+ ConditionVariableCancelSleep();
+
+ return elected;
+}
+
+/*
+ * Arrive at this barrier, but detach rather than waiting. Returns true if
+ * the caller was the last to detach.
+ */
+bool
+BarrierArriveAndDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, true); /* arrive = true: may advance the phase even if nobody else remains */
+}
+
+/*
+ * Arrive at a barrier, and detach all but the last to arrive. Returns true if
+ * the caller was the last to arrive, and is therefore still attached.
+ */
+bool
+BarrierArriveAndDetachExceptLast(Barrier *barrier)
+{
+	bool		stay_attached;
+
+	SpinLockAcquire(&barrier->mutex);
+	stay_attached = !(barrier->participants > 1);
+	if (stay_attached)
+	{
+		/* We are the only one left: keep our attachment, advance the phase. */
+		Assert(barrier->participants == 1);
+		barrier->phase++;
+	}
+	else
+	{
+		/* Others are still attached, so simply drop out of the party. */
+		barrier->participants--;
+	}
+	SpinLockRelease(&barrier->mutex);
+
+	return stay_attached;
+}
+
+/*
+ * Attach to a barrier. All waiting participants will now wait for this
+ * participant to call BarrierArriveAndWait(), BarrierDetach() or
+ * BarrierArriveAndDetach(). Return the current phase.
+ */
+int
+BarrierAttach(Barrier *barrier)
+{
+	int			current_phase;
+
+	/* Explicit attachment is only meaningful for dynamic barriers. */
+	Assert(!barrier->static_party);
+
+	/* Join the party and sample the phase under the same lock hold. */
+	SpinLockAcquire(&barrier->mutex);
+	current_phase = barrier->phase;
+	barrier->participants++;
+	SpinLockRelease(&barrier->mutex);
+
+	return current_phase;
+}
+
+/*
+ * Detach from a barrier. This may release other waiters from
+ * BarrierArriveAndWait() and advance the phase if they were only waiting for
+ * this backend. Return true if this participant was the last to detach.
+ */
+bool
+BarrierDetach(Barrier *barrier)
+{
+ return BarrierDetachImpl(barrier, false); /* arrive = false: never advances the phase when we are the last participant */
+}
+
+/*
+ * Return the current phase of a barrier. The caller must be attached.
+ */
+int
+BarrierPhase(Barrier *barrier)
+{
+ /*
+ * It is OK to read barrier->phase without locking, because it can't
+ * change without us (we are attached to it), and we executed a memory
+ * barrier when we either attached or participated in changing it last
+ * time. This reasoning does not hold for a caller that is not
+ * attached, which is why attachment is required.
+ */
+ return barrier->phase;
+}
+
+/*
+ * Return an instantaneous snapshot of the number of participants currently
+ * attached to this barrier. For debugging purposes only.
+ */
+int
+BarrierParticipants(Barrier *barrier)
+{
+	int			snapshot;
+
+	/* Copy the counter out while holding the spinlock. */
+	SpinLockAcquire(&barrier->mutex);
+	snapshot = barrier->participants;
+	SpinLockRelease(&barrier->mutex);
+
+	return snapshot;
+}
+
+/*
+ * Detach from a barrier. If 'arrive' is true then also increment the phase
+ * if there are no other participants. If there are other participants
+ * waiting, then the phase will be advanced and they'll be released if they
+ * were only waiting for the caller. Return true if this participant was the
+ * last to detach.
+ */
+static inline bool
+BarrierDetachImpl(Barrier *barrier, bool arrive)
+{
+	bool		was_last;
+	bool		wake_waiters;
+
+	Assert(!barrier->static_party);
+
+	SpinLockAcquire(&barrier->mutex);
+	Assert(barrier->participants > 0);
+	barrier->participants--;
+
+	/*
+	 * The phase advances when everyone still attached has already arrived,
+	 * provided either that somebody is left to be released or that the
+	 * caller asked to "arrive" as part of detaching.
+	 */
+	wake_waiters = (arrive || barrier->participants > 0) &&
+		barrier->arrived == barrier->participants;
+	if (wake_waiters)
+	{
+		barrier->arrived = 0;
+		barrier->phase++;
+	}
+
+	was_last = (barrier->participants == 0);
+	SpinLockRelease(&barrier->mutex);
+
+	/* Wake any sleepers only after the spinlock has been dropped. */
+	if (wake_waiters)
+		ConditionVariableBroadcast(&barrier->condition_variable);
+
+	return was_last;
+}
diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c
new file mode 100644
index 0000000..b461a5f
--- /dev/null
+++ b/src/backend/storage/ipc/dsm.c
@@ -0,0 +1,1248 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides a set of services to make programming with dynamic
+ * shared memory segments more convenient. Unlike the low-level
+ * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments
+ * created using this module will be cleaned up automatically. Mappings
+ * will be removed when the resource owner under which they were created
+ * is cleaned up, unless dsm_pin_mapping() is used, in which case they
+ * have session lifespan. Segments will be removed when there are no
+ * remaining mappings, or at postmaster shutdown in any case. After a
+ * hard postmaster crash, remaining segments will be removed, if they
+ * still exist, at the next postmaster startup.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+
+#include "lib/ilist.h"
+#include "miscadmin.h"
+#include "port/pg_bitutils.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "utils/freepage.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+#include "utils/resowner_private.h"
+
+#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32 /* stored in dsm_control_header.magic at startup */
+
+#define PG_DYNSHMEM_FIXED_SLOTS 64 /* base number of control slots */
+#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5 /* additional control slots per MaxBackends entry */
+
+#define INVALID_CONTROL_SLOT ((uint32) -1) /* NOTE(review): presumably marks "no control slot" -- confirm at use sites */
+
+/* Backend-local tracking for on-detach callbacks. */
+typedef struct dsm_segment_detach_callback
+{
+ on_dsm_detach_callback function; /* NOTE(review): presumably the callback run at detach -- confirm */
+ Datum arg; /* NOTE(review): presumably passed to function -- confirm */
+ slist_node node; /* NOTE(review): presumably a link in dsm_segment.on_detach -- confirm */
+} dsm_segment_detach_callback;
+
+/* Backend-local state for a dynamic shared memory segment. */
+struct dsm_segment
+{
+ dlist_node node; /* List link in dsm_segment_list. */
+ ResourceOwner resowner; /* Resource owner. */
+ dsm_handle handle; /* Segment name. */
+ uint32 control_slot; /* Slot in control segment. */
+ void *impl_private; /* Implementation-specific private data. */
+ void *mapped_address; /* Mapping address, or NULL if unmapped. */
+ Size mapped_size; /* Size of our mapping. */
+ slist_head on_detach; /* On-detach callbacks. */
+};
+
+/* Shared-memory state for a dynamic shared memory segment. */
+typedef struct dsm_control_item
+{
+ dsm_handle handle; /* handle of the segment occupying this slot */
+ uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */
+ size_t first_page; /* set by dsm_create() for main-region segments */
+ size_t npages; /* set by dsm_create() for main-region segments */
+ void *impl_private_pm_handle; /* only needed on Windows */
+ bool pinned; /* initialized to false by dsm_create() */
+} dsm_control_item;
+
+/* Layout of the dynamic shared memory control segment. */
+typedef struct dsm_control_header
+{
+ uint32 magic; /* set to PG_DYNSHMEM_CONTROL_MAGIC at startup */
+ uint32 nitems; /* item[] slots handed out so far; some may have refcnt 0 */
+ uint32 maxitems; /* upper bound on nitems enforced by dsm_create() */
+ dsm_control_item item[FLEXIBLE_ARRAY_MEMBER];
+} dsm_control_header;
+
+static void dsm_cleanup_for_mmap(void);
+static void dsm_postmaster_shutdown(int code, Datum arg);
+static dsm_segment *dsm_create_descriptor(void);
+static bool dsm_control_segment_sane(dsm_control_header *control,
+ Size mapped_size);
+static uint64 dsm_control_bytes_needed(uint32 nitems);
+static inline dsm_handle make_main_region_dsm_handle(int slot);
+static inline bool is_main_region_dsm_handle(dsm_handle handle);
+
+/* Has this backend initialized the dynamic shared memory system yet? */
+static bool dsm_init_done = false;
+
+/* Preallocated DSM space in the main shared memory region. */
+static void *dsm_main_space_begin = NULL;
+
+/*
+ * List of dynamic shared memory segments used by this backend.
+ *
+ * At process exit time, we must decrement the reference count of each
+ * segment we have attached; this list makes it possible to find all such
+ * segments.
+ *
+ * This list should always be empty in the postmaster. We could probably
+ * allow the postmaster to map dynamic shared memory segments before it
+ * begins to start child processes, provided that each process adjusted
+ * the reference counts for those segments in the control segment at
+ * startup time, but there's no obvious need for such a facility, which
+ * would also be complex to handle in the EXEC_BACKEND case. Once the
+ * postmaster has begun spawning children, there's an additional problem:
+ * each new mapping would require an update to the control segment,
+ * which requires locking, in which the postmaster must not be involved.
+ */
+static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list);
+
+/*
+ * Control segment information.
+ *
+ * Unlike ordinary shared memory segments, the control segment is not
+ * reference counted; instead, it lasts for the postmaster's entire
+ * life cycle. For simplicity, it doesn't have a dsm_segment object either.
+ */
+static dsm_handle dsm_control_handle;
+static dsm_control_header *dsm_control;
+static Size dsm_control_mapped_size = 0;
+static void *dsm_control_impl_private = NULL;
+
+/*
+ * Start up the dynamic shared memory system.
+ *
+ * This is called just once during each cluster lifetime, at postmaster
+ * startup time.
+ */
+void
+dsm_postmaster_startup(PGShmemHeader *shim)
+{
+ void *dsm_control_address = NULL;
+ uint32 maxitems;
+ Size segsize;
+
+ Assert(!IsUnderPostmaster);
+
+ /*
+ * If we're using the mmap implementations, clean up any leftovers.
+ * Cleanup isn't needed on Windows, and happens earlier in startup for
+ * POSIX and System V shared memory, via a direct call to
+ * dsm_cleanup_using_control_segment.
+ */
+ if (dynamic_shared_memory_type == DSM_IMPL_MMAP)
+ dsm_cleanup_for_mmap();
+
+ /*
+ * Determine size for new control segment: a fixed number of slots plus
+ * a per-backend allowance scaled by MaxBackends.
+ */
+ maxitems = PG_DYNSHMEM_FIXED_SLOTS
+ + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends;
+ elog(DEBUG2, "dynamic shared memory system will support %u segments",
+ maxitems);
+ segsize = dsm_control_bytes_needed(maxitems);
+
+ /*
+ * Loop until we find an unused identifier for the new control segment. We
+ * sometimes use 0 as a sentinel value indicating that no control segment
+ * is known to exist, so avoid using that value for a real control
+ * segment. A false return from the create operation makes us retry
+ * with a different identifier.
+ */
+ for (;;)
+ {
+ Assert(dsm_control_address == NULL);
+ Assert(dsm_control_mapped_size == 0);
+ dsm_control_handle = random() << 1; /* Even numbers only */
+ if (dsm_control_handle == DSM_HANDLE_INVALID)
+ continue;
+ if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, ERROR))
+ break;
+ }
+ dsm_control = dsm_control_address;
+ on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim)); /* shutdown hook receives shim as its argument */
+ elog(DEBUG2,
+ "created dynamic shared memory control segment %u (%zu bytes)",
+ dsm_control_handle, segsize);
+ shim->dsm_control = dsm_control_handle; /* record the handle in the main shmem header */
+
+ /* Initialize control segment: valid magic, no slots in use yet. */
+ dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC;
+ dsm_control->nitems = 0;
+ dsm_control->maxitems = maxitems;
+}
+
+/*
+ * Determine whether the control segment from the previous postmaster
+ * invocation still exists. If so, remove the dynamic shared memory
+ * segments to which it refers, and then the control segment itself.
+ */
+void
+dsm_cleanup_using_control_segment(dsm_handle old_control_handle)
+{
+	/* State for our mapping of the old control segment. */
+	void	   *ctl_address = NULL;
+	void	   *ctl_private = NULL;
+	Size		ctl_size = 0;
+
+	/* Throwaway state for destroying the segments it references. */
+	void	   *scratch_address = NULL;
+	void	   *scratch_private = NULL;
+	Size		scratch_size = 0;
+
+	dsm_control_header *old_control;
+	uint32		slot_count;
+	uint32		slot;
+
+	/*
+	 * Attach the old control segment.  Failure most likely means the
+	 * machine was rebooted and the segment is gone, or that some unrelated
+	 * process now owns the same shm ID; either way there is nothing for us
+	 * to do, so leave quietly.
+	 */
+	if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &ctl_private,
+					 &ctl_address, &ctl_size, DEBUG1))
+		return;
+
+	/*
+	 * Attaching worked, but that doesn't prove the contents are usable.  If
+	 * they fail the sanity check, detach again and ignore the segment.
+	 */
+	old_control = (dsm_control_header *) ctl_address;
+	if (!dsm_control_segment_sane(old_control, ctl_size))
+	{
+		dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &ctl_private,
+					&ctl_address, &ctl_size, LOG);
+		return;
+	}
+
+	/*
+	 * The control segment looks valid; walk its slots and destroy every
+	 * segment that is still referenced.
+	 */
+	slot_count = old_control->nitems;
+	for (slot = 0; slot < slot_count; slot++)
+	{
+		uint32		refcnt = old_control->item[slot].refcnt;
+		dsm_handle	handle = old_control->item[slot].handle;
+
+		/*
+		 * A zero reference count marks an unused slot, and segments living
+		 * in the main shmem area need no separate cleanup.
+		 */
+		if (refcnt == 0 || is_main_region_dsm_handle(handle))
+			continue;
+
+		elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)",
+			 handle, refcnt);
+
+		dsm_impl_op(DSM_OP_DESTROY, handle, 0, &scratch_private,
+					&scratch_address, &scratch_size, LOG);
+	}
+
+	/* Finally, get rid of the old control segment itself. */
+	elog(DEBUG2,
+		 "cleaning up dynamic shared memory control segment with ID %u",
+		 old_control_handle);
+	dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &ctl_private,
+				&ctl_address, &ctl_size, LOG);
+}
+
+/*
+ * When we're using the mmap shared memory implementation, "shared memory"
+ * segments might even manage to survive an operating system reboot.
+ * But there's no guarantee as to exactly what will survive: some segments
+ * may survive, and others may not, and the contents of some may be out
+ * of date. In particular, the control segment may be out of date, so we
+ * can't rely on it to figure out what to remove. However, since we know
+ * what directory contains the files we used as shared memory, we can simply
+ * scan the directory and blow everything away that shouldn't be there.
+ */
+static void
+dsm_cleanup_for_mmap(void)
+{
+	DIR		   *dsm_dir;
+	struct dirent *entry;
+
+	/* Walk the DSM directory looking for files carrying our prefix. */
+	dsm_dir = AllocateDir(PG_DYNSHMEM_DIR);
+
+	for (;;)
+	{
+		char		path[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)];
+
+		entry = ReadDir(dsm_dir, PG_DYNSHMEM_DIR);
+		if (entry == NULL)
+			break;
+
+		/* Leave alone anything that is not one of our mmap files. */
+		if (strncmp(entry->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX,
+					strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) != 0)
+			continue;
+
+		snprintf(path, sizeof(path), PG_DYNSHMEM_DIR "/%s", entry->d_name);
+
+		elog(DEBUG2, "removing file \"%s\"", path);
+
+		/* A matching file is a leftover segment; failing to remove it is an error. */
+		if (unlink(path) != 0)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not remove file \"%s\": %m", path)));
+	}
+
+	/* All done with the directory. */
+	FreeDir(dsm_dir);
+}
+
+/*
+ * At shutdown time, we iterate over the control segment and remove all
+ * remaining dynamic shared memory segments. We avoid throwing errors here;
+ * the postmaster is shutting down either way, and this is just non-critical
+ * resource cleanup.
+ */
+static void
+dsm_postmaster_shutdown(int code, Datum arg)
+{
+ uint32 nitems;
+ uint32 i;
+ void *dsm_control_address;
+ void *junk_mapped_address = NULL;
+ void *junk_impl_private = NULL;
+ Size junk_mapped_size = 0;
+ PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg); /* the header passed to on_shmem_exit() at startup */
+
+ /*
+ * If some other backend exited uncleanly, it might have corrupted the
+ * control segment while it was dying. In that case, we warn and ignore
+ * the contents of the control segment. This may end up leaving behind
+ * stray shared memory segments, but there's not much we can do about that
+ * if the metadata is gone. Note that nitems is captured before the
+ * sanity check but only used after it has passed.
+ */
+ nitems = dsm_control->nitems;
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ ereport(LOG,
+ (errmsg("dynamic shared memory control segment is corrupt")));
+ return;
+ }
+
+ /* Remove any remaining segments. */
+ for (i = 0; i < nitems; ++i)
+ {
+ dsm_handle handle;
+
+ /* If the reference count is 0, the slot is actually unused. */
+ if (dsm_control->item[i].refcnt == 0)
+ continue;
+
+ handle = dsm_control->item[i].handle;
+ if (is_main_region_dsm_handle(handle))
+ continue; /* main-region segments are skipped here */
+
+ /* Log debugging information. */
+ elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u",
+ handle);
+
+ /* Destroy the segment. Failures are only logged, never raised. */
+ dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+ &junk_mapped_address, &junk_mapped_size, LOG);
+ }
+
+ /* Remove the control segment itself. */
+ elog(DEBUG2,
+ "cleaning up dynamic shared memory control segment with ID %u",
+ dsm_control_handle);
+ dsm_control_address = dsm_control;
+ dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0,
+ &dsm_control_impl_private, &dsm_control_address,
+ &dsm_control_mapped_size, LOG);
+ dsm_control = dsm_control_address; /* pick up whatever address the destroy operation left behind */
+ shim->dsm_control = 0; /* 0 = no control segment known to exist */
+}
+
+/*
+ * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND,
+ * we must reread the state file and map the control segment; in other cases,
+ * we'll have inherited the postmaster's mapping and global variables.
+ */
+static void
+dsm_backend_startup(void)
+{
+#ifdef EXEC_BACKEND
+ {
+ void *control_address = NULL;
+
+ /* Attach control segment; failure to attach raises ERROR. */
+ Assert(dsm_control_handle != 0);
+ dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, ERROR);
+ dsm_control = control_address;
+ /*
+ * If control segment doesn't look sane, something is badly wrong.
+ * Detach it again before giving up with FATAL.
+ */
+ if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size))
+ {
+ dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+ &dsm_control_impl_private, &control_address,
+ &dsm_control_mapped_size, WARNING);
+ ereport(FATAL,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("dynamic shared memory control segment is not valid")));
+ }
+ }
+#endif
+
+ dsm_init_done = true; /* dsm_create() checks this flag so startup runs only once per backend */
+}
+
+#ifdef EXEC_BACKEND
+/*
+ * When running under EXEC_BACKEND, we get a callback here when the main
+ * shared memory segment is re-attached, so that we can record the control
+ * handle retrieved from it.
+ */
+void
+dsm_set_control_handle(dsm_handle h)
+{
+ Assert(dsm_control_handle == 0 && h != 0); /* may only be set once, and never to 0 */
+ dsm_control_handle = h;
+}
+#endif
+
+/*
+ * Reserve some space in the main shared memory segment for DSM segments.
+ */
+size_t
+dsm_estimate_size(void)
+{
+	/* Scale the min_dynamic_shared_memory setting by 1024 * 1024. */
+	size_t		setting = (size_t) min_dynamic_shared_memory;
+
+	return setting * 1024 * 1024;
+}
+
+/*
+ * Initialize space in the main shared memory segment for DSM segments.
+ */
+void
+dsm_shmem_init(void)
+{
+	size_t		total_bytes = dsm_estimate_size();
+	bool		already_initialized;
+	FreePageManager *fpm;
+	size_t		reserved_pages;
+
+	/* Nothing to set up when no preallocated space was requested. */
+	if (total_bytes == 0)
+		return;
+
+	dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", total_bytes,
+										   &already_initialized);
+	if (already_initialized)
+		return;
+
+	/*
+	 * The FreePageManager lives at the start of the region; work out how
+	 * many whole pages it occupies so they are never handed out.
+	 */
+	fpm = (FreePageManager *) dsm_main_space_begin;
+	for (reserved_pages = 0;
+		 reserved_pages * FPM_PAGE_SIZE < sizeof(FreePageManager);
+		 reserved_pages++)
+		;
+
+	/* Initialize the manager and donate every remaining page to it. */
+	FreePageManagerInitialize(fpm, dsm_main_space_begin);
+	FreePageManagerPut(fpm, reserved_pages,
+					   (total_bytes / FPM_PAGE_SIZE) - reserved_pages);
+}
+
+/*
+ * Create a new dynamic shared memory segment.
+ *
+ * If there is a non-NULL CurrentResourceOwner, the new segment is associated
+ * with it and must be detached before the resource owner releases, or a
+ * warning will be logged.  If CurrentResourceOwner is NULL, the segment
+ * remains attached until explicitly detached or the session ends.
+ * Creating with a NULL CurrentResourceOwner is equivalent to creating
+ * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping.
+ *
+ * 'size' is the requested size in bytes.  When preallocated space exists in
+ * the main shared memory region and has room, the segment is carved out of
+ * it in whole FPM pages; otherwise a separate segment is created through
+ * dsm_impl_op().
+ *
+ * 'flags' may include DSM_CREATE_NULL_IF_MAXSEGMENTS, in which case NULL is
+ * returned when the control segment has no room for another entry; without
+ * that flag the same condition raises an ERROR.  On success the new
+ * segment's descriptor is returned.
+ */
+dsm_segment *
+dsm_create(Size size, int flags)
+{
+	dsm_segment *seg;
+	uint32		i;
+	uint32		nitems;
+	size_t		npages = 0;
+	size_t		first_page = 0;
+	FreePageManager *dsm_main_space_fpm = dsm_main_space_begin;
+	bool		using_main_dsm_region = false;
+
+	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
+	Assert(IsUnderPostmaster);
+
+	if (!dsm_init_done)
+		dsm_backend_startup();
+
+	/* Create a new segment descriptor. */
+	seg = dsm_create_descriptor();
+
+	/*
+	 * Lock the control segment while we try to allocate from the main shared
+	 * memory area, if configured.  If the allocation succeeds we keep holding
+	 * the lock all the way through the slot assignment below.
+	 */
+	if (dsm_main_space_fpm)
+	{
+		/* Round the request up to a whole number of FPM pages. */
+		npages = size / FPM_PAGE_SIZE;
+		if (size % FPM_PAGE_SIZE > 0)
+			++npages;
+
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+		if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page))
+		{
+			/* We can carve out a piece of the main shared memory segment. */
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = npages * FPM_PAGE_SIZE;
+			using_main_dsm_region = true;
+			/* We'll choose a handle below. */
+		}
+	}
+
+	if (!using_main_dsm_region)
+	{
+		/*
+		 * We need to create a new memory segment.  Loop until we find an
+		 * unused segment identifier.  The lock is not held while creating
+		 * the segment; it is (re)acquired afterwards.
+		 */
+		if (dsm_main_space_fpm)
+			LWLockRelease(DynamicSharedMemoryControlLock);
+		for (;;)
+		{
+			Assert(seg->mapped_address == NULL && seg->mapped_size == 0);
+			seg->handle = random() << 1;	/* Even numbers only */
+			if (seg->handle == DSM_HANDLE_INVALID)	/* Reserve sentinel */
+				continue;
+			if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private,
+							&seg->mapped_address, &seg->mapped_size, ERROR))
+				break;
+		}
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	}
+
+	/* Search the control segment for an unused slot. */
+	nitems = dsm_control->nitems;
+	for (i = 0; i < nitems; ++i)
+	{
+		if (dsm_control->item[i].refcnt == 0)
+		{
+			if (using_main_dsm_region)
+			{
+				seg->handle = make_main_region_dsm_handle(i);
+				dsm_control->item[i].first_page = first_page;
+				dsm_control->item[i].npages = npages;
+			}
+			else
+				Assert(!is_main_region_dsm_handle(seg->handle));
+			dsm_control->item[i].handle = seg->handle;
+			/* refcnt of 1 triggers destruction, so start at 2 */
+			dsm_control->item[i].refcnt = 2;
+			dsm_control->item[i].impl_private_pm_handle = NULL;
+			dsm_control->item[i].pinned = false;
+			seg->control_slot = i;
+			LWLockRelease(DynamicSharedMemoryControlLock);
+			return seg;
+		}
+	}
+
+	/* Verify that we can support an additional mapping. */
+	if (nitems >= dsm_control->maxitems)
+	{
+		/* Undo everything done so far: pages or segment, then descriptor. */
+		if (using_main_dsm_region)
+			FreePageManagerPut(dsm_main_space_fpm, first_page, npages);
+		LWLockRelease(DynamicSharedMemoryControlLock);
+		if (!using_main_dsm_region)
+			dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
+		if (seg->resowner != NULL)
+			ResourceOwnerForgetDSM(seg->resowner, seg);
+		dlist_delete(&seg->node);
+		pfree(seg);
+
+		if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0)
+			return NULL;
+		ereport(ERROR,
+				(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+				 errmsg("too many dynamic shared memory segments")));
+	}
+
+	/*
+	 * Enter the handle into a new array slot.  Index every field of the new
+	 * slot with nitems rather than relying on the search loop above having
+	 * left i equal to nitems.
+	 */
+	if (using_main_dsm_region)
+	{
+		seg->handle = make_main_region_dsm_handle(nitems);
+		dsm_control->item[nitems].first_page = first_page;
+		dsm_control->item[nitems].npages = npages;
+	}
+	else
+		Assert(!is_main_region_dsm_handle(seg->handle));
+	dsm_control->item[nitems].handle = seg->handle;
+	/* refcnt of 1 triggers destruction, so start at 2 */
+	dsm_control->item[nitems].refcnt = 2;
+	dsm_control->item[nitems].impl_private_pm_handle = NULL;
+	dsm_control->item[nitems].pinned = false;
+	seg->control_slot = nitems;
+	dsm_control->nitems++;
+	LWLockRelease(DynamicSharedMemoryControlLock);
+
+	return seg;
+}
+
+/*
+ * Attach to an existing dynamic shared memory segment identified by handle h.
+ *
+ * The comments for dsm_segment_handle() describe how a handle is expected to
+ * travel from the creating backend to the one calling this function.
+ *
+ * Returns NULL when no live control slot carries the handle.  That can
+ * legitimately happen: everyone else may have detached the segment (which
+ * destroys it) before we got around to attaching.
+ *
+ * With a non-NULL CurrentResourceOwner the new mapping is associated with
+ * that owner and must be detached before the owner releases, or a warning
+ * will be logged.  Otherwise the mapping stays until it is explicitly
+ * detached or the session ends.  See the note atop dsm_create().
+ */
+dsm_segment *
+dsm_attach(dsm_handle h)
+{
+	dsm_segment *seg;
+	dlist_iter	iter;
+	uint32		slot;
+	uint32		nslots;
+
+	/* Unsafe in postmaster (and pointless in a stand-alone backend). */
+	Assert(IsUnderPostmaster);
+
+	if (!dsm_init_done)
+		dsm_backend_startup();
+
+	/*
+	 * Debugging cross-check: refuse to map a handle this backend already has
+	 * in its private segment list.  The list is normally very short, so the
+	 * scan is done in all builds.  Callers that may already hold a mapping
+	 * should look it up with dsm_find_mapping() before coming here.
+	 */
+	dlist_foreach(iter, &dsm_segment_list)
+	{
+		dsm_segment *existing = dlist_container(dsm_segment, node, iter.cur);
+
+		if (existing->handle == h)
+			elog(ERROR, "can't attach the same segment more than once");
+	}
+
+	/* Build the backend-local descriptor for the segment. */
+	seg = dsm_create_descriptor();
+	seg->handle = h;
+
+	/* Look the handle up in the control segment and take a reference. */
+	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	nslots = dsm_control->nitems;
+	for (slot = 0; slot < nslots; slot++)
+	{
+		dsm_control_item *item = &dsm_control->item[slot];
+
+		/*
+		 * A reference count of 0 marks an unused slot; a count of 1 marks a
+		 * segment that is on its way out.  In the latter case a matching
+		 * handle proves nothing, because another slot may have started using
+		 * the same handle value by coincidence, so keep looking.  Slots with
+		 * a different handle are simply not ours.
+		 */
+		if (item->refcnt <= 1 || item->handle != seg->handle)
+			continue;
+
+		/* Found it: bump the count and remember where it lives. */
+		item->refcnt++;
+		seg->control_slot = slot;
+		if (is_main_region_dsm_handle(seg->handle))
+		{
+			/* Main-region segments are located by page range, not mapped. */
+			seg->mapped_address = (char *) dsm_main_space_begin +
+				item->first_page * FPM_PAGE_SIZE;
+			seg->mapped_size = item->npages * FPM_PAGE_SIZE;
+		}
+		break;
+	}
+	LWLockRelease(DynamicSharedMemoryControlLock);
+
+	/*
+	 * No matching slot most likely means that everybody who had the segment
+	 * mapped, the creator included, went away first.  Drop the descriptor
+	 * and let the caller decide what to do.
+	 */
+	if (seg->control_slot == INVALID_CONTROL_SLOT)
+	{
+		dsm_detach(seg);
+		return NULL;
+	}
+
+	/* Segments outside the main region still need an actual mapping. */
+	if (!is_main_region_dsm_handle(seg->handle))
+		dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private,
+					&seg->mapped_address, &seg->mapped_size, ERROR);
+
+	return seg;
+}
+
+/*
+ * Backend-exit hook: detach every segment still on this backend's list.
+ *
+ * This mirrors dsm_detach_all(), except that the control segment is left
+ * mapped; there is no reason to unmap it when the process is exiting anyway.
+ */
+void
+dsm_backend_shutdown(void)
+{
+	/* dsm_detach() unlinks the segment, so the list head changes each pass. */
+	while (!dlist_is_empty(&dsm_segment_list))
+		dsm_detach(dlist_head_element(dsm_segment, node, &dsm_segment_list));
+}
+
+/*
+ * Detach every shared memory segment, the control segment included.
+ *
+ * Intended to be called together with PGSharedMemoryDetach in processes
+ * that might inherit mappings but are not meant to be connected to dynamic
+ * shared memory.
+ */
+void
+dsm_detach_all(void)
+{
+	/* Captured up front, before any segment is detached. */
+	void	   *control_address = dsm_control;
+
+	/* Each dsm_detach() call removes the current list head. */
+	for (;;)
+	{
+		if (dlist_is_empty(&dsm_segment_list))
+			break;
+		dsm_detach(dlist_head_element(dsm_segment, node, &dsm_segment_list));
+	}
+
+	/* Nothing more to do if the control segment was never mapped. */
+	if (control_address == NULL)
+		return;
+
+	dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0,
+				&dsm_control_impl_private, &control_address,
+				&dsm_control_mapped_size, ERROR);
+}
+
+/*
+ * Detach from a shared memory segment; if ours was the last reference, also
+ * destroy the segment.
+ *
+ * This function is meant never to fail.  It is frequently reached while a
+ * transaction is aborting, where raising yet another error would help
+ * nobody.  Failing to unmap or destroy a segment leaks a resource, but it
+ * does not necessarily prevent further operation, so problems from the
+ * implementation layer are reported at WARNING level only.
+ */
+void
+dsm_detach(dsm_segment *seg)
+{
+	/*
+	 * Run the registered on-detach callbacks.  Each entry is unlinked and
+	 * freed before its function is called, so that a callback which throws
+	 * an error leading back here cannot be invoked again (no infinite error
+	 * recursion).  Interrupts are held off around the callbacks so that, in
+	 * non-error paths, a statement timeout or similar cannot leave cleanup
+	 * work half done.
+	 */
+	HOLD_INTERRUPTS();
+	while (!slist_is_empty(&seg->on_detach))
+	{
+		slist_node *popped = slist_pop_head_node(&seg->on_detach);
+		dsm_segment_detach_callback *entry =
+			slist_container(dsm_segment_detach_callback, node, popped);
+		on_dsm_detach_callback callback = entry->function;
+		Datum		callback_arg = entry->arg;
+
+		pfree(entry);
+		callback(seg, callback_arg);
+	}
+	RESUME_INTERRUPTS();
+
+	/*
+	 * Drop the mapping if there is one; there may be none if a create or
+	 * attach failed partway through.  This happens before the reference
+	 * count is decremented, so that whoever observes a zero count knows no
+	 * mappings remain.  A failed unmap is treated as if it had succeeded,
+	 * since a retry would probably fail the same way.
+	 */
+	if (seg->mapped_address != NULL)
+	{
+		if (!is_main_region_dsm_handle(seg->handle))
+			dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private,
+						&seg->mapped_address, &seg->mapped_size, WARNING);
+		seg->impl_private = NULL;
+		seg->mapped_address = NULL;
+		seg->mapped_size = 0;
+	}
+
+	/* Give back our reference, if we ever took one. */
+	if (seg->control_slot != INVALID_CONTROL_SLOT)
+	{
+		uint32		control_slot = seg->control_slot;
+		uint32		remaining;
+
+		LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+		Assert(dsm_control->item[control_slot].handle == seg->handle);
+		Assert(dsm_control->item[control_slot].refcnt > 1);
+		dsm_control->item[control_slot].refcnt--;
+		remaining = dsm_control->item[control_slot].refcnt;
+		seg->control_slot = INVALID_CONTROL_SLOT;
+		LWLockRelease(DynamicSharedMemoryControlLock);
+
+		/* A count of 1 means no references are left: try to destroy. */
+		if (remaining == 1)
+		{
+			bool		destroyed;
+
+			/* A pinned segment should never reach 1. */
+			Assert(!dsm_control->item[control_slot].pinned);
+
+			/*
+			 * If destruction fails, or we are killed before finishing it,
+			 * the count stays at 1 and nobody else can attach.  Another
+			 * removal attempt is made at postmaster shutdown, or when a new
+			 * postmaster starts after a hard kill.
+			 *
+			 * Being killed by a signal midway is the case that matters most;
+			 * the segment must still get removed then.  If removal fails for
+			 * some other reason, the postmaster may fare no better than we
+			 * did, but there is little to be done about that.
+			 *
+			 * Main-region segments have nothing to destroy at the
+			 * implementation level; only their pages are returned below.
+			 */
+			if (is_main_region_dsm_handle(seg->handle))
+				destroyed = true;
+			else
+				destroyed = dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0,
+										&seg->impl_private,
+										&seg->mapped_address,
+										&seg->mapped_size, WARNING);
+
+			if (destroyed)
+			{
+				/* Mark the control slot unused (refcnt 0). */
+				LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+				if (is_main_region_dsm_handle(seg->handle))
+					FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+									   dsm_control->item[control_slot].first_page,
+									   dsm_control->item[control_slot].npages);
+				Assert(dsm_control->item[control_slot].handle == seg->handle);
+				Assert(dsm_control->item[control_slot].refcnt == 1);
+				dsm_control->item[control_slot].refcnt = 0;
+				LWLockRelease(DynamicSharedMemoryControlLock);
+			}
+		}
+	}
+
+	/* Finally release the backend-private bookkeeping. */
+	if (seg->resowner != NULL)
+		ResourceOwnerForgetDSM(seg->resowner, seg);
+	dlist_delete(&seg->node);
+	pfree(seg);
+}
+
+/*
+ * Keep a dynamic shared memory mapping until end of session.
+ *
+ * Mappings normally belong to the current resource owner and therefore tend
+ * to last only for the current query.  Pinning severs that association.
+ * Calling this on a mapping that has no resource owner is a no-op.
+ */
+void
+dsm_pin_mapping(dsm_segment *seg)
+{
+	/* Already unowned: nothing to forget. */
+	if (seg->resowner == NULL)
+		return;
+
+	ResourceOwnerForgetDSM(seg->resowner, seg);
+	seg->resowner = NULL;
+}
+
+/*
+ * Arrange for a dynamic shared memory mapping to be removed at cleanup time.
+ *
+ * This undoes dsm_pin_mapping(): the mapping, which so far would have lived
+ * for the whole life of the process, becomes owned by the current resource
+ * owner.  That can be useful right before an operation that will make the
+ * segment unusable for this backend afterwards.
+ *
+ * The mapping must currently have no resource owner (asserted).
+ */
+void
+dsm_unpin_mapping(dsm_segment *seg)
+{
+	Assert(seg->resowner == NULL);
+
+	/* Reserve room in the owner first, then record the ownership. */
+	ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+	seg->resowner = CurrentResourceOwner;
+	ResourceOwnerRememberDSM(CurrentResourceOwner, seg);
+}
+
+/*
+ * Keep a dynamic shared memory segment until postmaster shutdown, or until
+ * dsm_unpin_segment is called.
+ *
+ * This function should not be called more than once per segment, unless the
+ * segment is explicitly unpinned with dsm_unpin_segment in between calls;
+ * pinning an already-pinned segment raises an error.
+ *
+ * Note that this function does not arrange for the current process to
+ * keep the segment mapped indefinitely; if that behavior is desired,
+ * dsm_pin_mapping() should be used from each process that needs to
+ * retain the mapping.
+ */
+void
+dsm_pin_segment(dsm_segment *seg)
+{
+	/*
+	 * Start from NULL.  The value is copied into the shared control slot
+	 * below no matter what dsm_impl_pin_segment() does with it, so it must
+	 * never be indeterminate.  (dsm_impl_pin_segment() is not defined in
+	 * this part of the file; an implementation that has nothing to record
+	 * may leave the output untouched.)  NULL matches what dsm_create()
+	 * stores in a freshly set up slot.
+	 */
+	void	   *handle = NULL;
+
+	/*
+	 * Bump reference count for this segment in shared memory.  This will
+	 * ensure that even if there is no session which is attached to this
+	 * segment, it will remain until postmaster shutdown or an explicit call
+	 * to unpin.
+	 */
+	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	if (dsm_control->item[seg->control_slot].pinned)
+		elog(ERROR, "cannot pin a segment that is already pinned");
+	dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle);
+	dsm_control->item[seg->control_slot].pinned = true;
+	dsm_control->item[seg->control_slot].refcnt++;
+	dsm_control->item[seg->control_slot].impl_private_pm_handle = handle;
+	LWLockRelease(DynamicSharedMemoryControlLock);
+}
+
+/*
+ * Undo dsm_pin_segment for the segment with the given handle, destroying
+ * the segment if the pin was its last reference.  Must only be called for a
+ * segment that was previously pinned with dsm_pin_segment.
+ *
+ * A dsm_handle, not a dsm_segment, is taken so that a segment can be
+ * unpinned without being attached.  That is handy when, for instance, one
+ * segment stores a reference to another: the referenced segment can be
+ * unpinned just before the referencing one is destroyed.
+ */
+void
+dsm_unpin_segment(dsm_handle handle)
+{
+	uint32		control_slot = INVALID_CONTROL_SLOT;
+	bool		destroy;
+	bool		removed;
+	uint32		i;
+	void	   *junk_impl_private = NULL;
+	void	   *junk_mapped_address = NULL;
+	Size		junk_mapped_size = 0;
+
+	/*
+	 * Locate the control slot.  Slots with refcnt <= 1 are either unused or
+	 * belong to segments that are concurrently going away, so they never
+	 * match.
+	 */
+	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	for (i = 0; i < dsm_control->nitems; i++)
+	{
+		if (dsm_control->item[i].refcnt > 1 &&
+			dsm_control->item[i].handle == handle)
+		{
+			control_slot = i;
+			break;
+		}
+	}
+
+	/*
+	 * A pinned segment must have a live slot, so failing to find one, or
+	 * finding one that is not pinned, is a caller error.
+	 */
+	if (control_slot == INVALID_CONTROL_SLOT)
+		elog(ERROR, "cannot unpin unknown segment handle");
+	if (!dsm_control->item[control_slot].pinned)
+		elog(ERROR, "cannot unpin a segment that is not pinned");
+	Assert(dsm_control->item[control_slot].refcnt > 1);
+
+	/*
+	 * Give the implementation its turn while the lock is still held, since
+	 * dsm_impl_unpin_segment may modify impl_private_pm_handle.
+	 */
+	dsm_impl_unpin_segment(handle,
+						   &dsm_control->item[control_slot].impl_private_pm_handle);
+
+	/* Drop the pin's reference; 1 means none left (0 means unused slot). */
+	dsm_control->item[control_slot].refcnt--;
+	destroy = (dsm_control->item[control_slot].refcnt == 1);
+	dsm_control->item[control_slot].pinned = false;
+
+	/* Now we can release the lock. */
+	LWLockRelease(DynamicSharedMemoryControlLock);
+
+	/* Other references remain: the segment lives on. */
+	if (!destroy)
+		return;
+
+	/*
+	 * That was the last reference, so clean up.  Error handling follows the
+	 * scheme explained in dsm_detach.  This process cannot have the segment
+	 * mapped at this point: if it did, the count would still exceed 1 after
+	 * the pin's reference was released.  Since no dsm_segment exists for the
+	 * handle, passing NULL/zero placeholders for the private data, mapped
+	 * address and mapped size is fine.  Main-region segments need no
+	 * implementation-level destroy.
+	 */
+	if (is_main_region_dsm_handle(handle))
+		removed = true;
+	else
+		removed = dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private,
+							  &junk_mapped_address, &junk_mapped_size,
+							  WARNING);
+	if (!removed)
+		return;
+
+	/* Return main-region pages, if any, and mark the slot unused. */
+	LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE);
+	if (is_main_region_dsm_handle(handle))
+		FreePageManagerPut((FreePageManager *) dsm_main_space_begin,
+						   dsm_control->item[control_slot].first_page,
+						   dsm_control->item[control_slot].npages);
+	Assert(dsm_control->item[control_slot].handle == handle);
+	Assert(dsm_control->item[control_slot].refcnt == 1);
+	dsm_control->item[control_slot].refcnt = 0;
+	LWLockRelease(DynamicSharedMemoryControlLock);
+}
+
+/*
+ * Look through this backend's segment list for a mapping of handle h.
+ *
+ * Returns the matching descriptor, or NULL if there is none.
+ */
+dsm_segment *
+dsm_find_mapping(dsm_handle h)
+{
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &dsm_segment_list)
+	{
+		dsm_segment *candidate = dlist_container(dsm_segment, node, iter.cur);
+
+		if (candidate->handle == h)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the address at which the given segment is mapped.  The segment
+ * must currently have a mapping (asserted).
+ */
+void *
+dsm_segment_address(dsm_segment *seg)
+{
+	void	   *address;
+
+	Assert(seg->mapped_address != NULL);
+	address = seg->mapped_address;
+
+	return address;
+}
+
+/*
+ * Return the size of the given segment's mapping.  The segment must
+ * currently have a mapping (asserted).
+ */
+Size
+dsm_segment_map_length(dsm_segment *seg)
+{
+	Size		length;
+
+	Assert(seg->mapped_address != NULL);
+	length = seg->mapped_size;
+
+	return length;
+}
+
+/*
+ * Return the handle of a mapping.
+ *
+ * Two backends set up communication through dynamic shared memory like
+ * this: one calls dsm_create() to establish a new shared memory mapping,
+ * obtains its handle with dsm_segment_handle(), and hands that handle to the
+ * cooperating backend by some means (e.g. bgw_main_arg, or via the main
+ * shared memory segment).  The recipient then calls dsm_attach() with it.
+ */
+dsm_handle
+dsm_segment_handle(dsm_segment *seg)
+{
+	dsm_handle	result = seg->handle;
+
+	return result;
+}
+
+/*
+ * Register a callback to run when the given segment is detached.
+ *
+ * The entry is allocated in TopMemoryContext and pushed onto the head of
+ * the segment's on_detach list.
+ */
+void
+on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg)
+{
+	dsm_segment_detach_callback *entry;
+
+	entry = MemoryContextAlloc(TopMemoryContext, sizeof(*entry));
+	entry->arg = arg;
+	entry->function = function;
+	slist_push_head(&seg->on_detach, &entry->node);
+}
+
+/*
+ * Unregister an on-detach callback previously registered for the segment.
+ *
+ * The first list entry whose function and argument both match is removed
+ * and freed; if nothing matches, the list is left unchanged.
+ */
+void
+cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function,
+					 Datum arg)
+{
+	slist_mutable_iter iter;
+
+	slist_foreach_modify(iter, &seg->on_detach)
+	{
+		dsm_segment_detach_callback *entry;
+
+		entry = slist_container(dsm_segment_detach_callback, node, iter.cur);
+		if (entry->function != function || entry->arg != arg)
+			continue;
+
+		/* Only the first match is cancelled. */
+		slist_delete_current(&iter);
+		pfree(entry);
+		return;
+	}
+}
+
+/*
+ * Throw away every registered on-detach callback, on every segment in this
+ * backend's list, without running any of them.
+ */
+void
+reset_on_dsm_detach(void)
+{
+	dlist_iter	iter;
+
+	dlist_foreach(iter, &dsm_segment_list)
+	{
+		dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur);
+
+		/* Pop and free the explicit on-detach entries one at a time. */
+		while (!slist_is_empty(&seg->on_detach))
+		{
+			slist_node *popped = slist_pop_head_node(&seg->on_detach);
+
+			pfree(slist_container(dsm_segment_detach_callback, node, popped));
+		}
+
+		/*
+		 * Dropping the reference count at detach time is an implicit
+		 * on-detach action of sorts.  Forgetting the control slot makes
+		 * sure that does not happen either.
+		 */
+		seg->control_slot = INVALID_CONTROL_SLOT;
+	}
+}
+
+/*
+ * Allocate and initialize a backend-local segment descriptor.
+ *
+ * The descriptor is allocated in TopMemoryContext, linked into
+ * dsm_segment_list and, when CurrentResourceOwner is set, remembered by
+ * that owner.  It starts out with no control slot, no mapping and an empty
+ * on-detach list.  The caller is responsible for setting seg->handle.
+ */
+static dsm_segment *
+dsm_create_descriptor(void)
+{
+	dsm_segment *seg;
+
+	/* Reserve the resource owner's array space before allocating anything. */
+	if (CurrentResourceOwner != NULL)
+		ResourceOwnerEnlargeDSMs(CurrentResourceOwner);
+
+	seg = MemoryContextAlloc(TopMemoryContext, sizeof(*seg));
+
+	/* seg->handle must be initialized by the caller */
+	seg->control_slot = INVALID_CONTROL_SLOT;
+	seg->impl_private = NULL;
+	seg->mapped_address = NULL;
+	seg->mapped_size = 0;
+	slist_init(&seg->on_detach);
+
+	dlist_push_head(&dsm_segment_list, &seg->node);
+
+	seg->resowner = CurrentResourceOwner;
+	if (seg->resowner != NULL)
+		ResourceOwnerRememberDSM(seg->resowner, seg);
+
+	return seg;
+}
+
+/*
+ * Sanity check a control segment.
+ *
+ * This does not try to catch everything that might be wrong with the
+ * control segment; there is not enough information for that.  The aim is
+ * only to guarantee that iterating over the items cannot run off the end of
+ * the mapping and crash.  The magic number is verified as well, since a
+ * wrong one suggests this is not one of our segments at all.
+ *
+ * Returns true if the segment passes every check.
+ */
+static bool
+dsm_control_segment_sane(dsm_control_header *control, Size mapped_size)
+{
+	/*
+	 * The tests run strictly in this order, so the header fields are read
+	 * only once the mapping is known to be large enough to hold them:
+	 * mapping long enough for the header; magic number matches; maxitems
+	 * entries fit in the mapping; not overfull.
+	 */
+	return mapped_size >= offsetof(dsm_control_header, item) &&
+		control->magic == PG_DYNSHMEM_CONTROL_MAGIC &&
+		dsm_control_bytes_needed(control->maxitems) <= mapped_size &&
+		control->nitems <= control->maxitems;
+}
+
+/*
+ * Compute how many control-segment bytes are needed to hold nitems items:
+ * the header up to the item array, plus the array itself.  The item count
+ * is widened to uint64 before multiplying.
+ */
+static uint64
+dsm_control_bytes_needed(uint32 nitems)
+{
+	uint64		header_bytes = offsetof(dsm_control_header, item);
+	uint64		array_bytes = sizeof(dsm_control_item) * (uint64) nitems;
+
+	return header_bytes + array_bytes;
+}
+
+/*
+ * Build the handle for a pseudo-segment carved out of the main shared
+ * memory region, for the given control slot.
+ *
+ * Bit layout of the result: bit 0 is always set, the slot number follows
+ * starting at bit 1, and random bits are placed from bit
+ * pg_leftmost_one_pos32(dsm_control->maxitems) + 1 upward.
+ */
+static inline dsm_handle
+make_main_region_dsm_handle(int slot)
+{
+	dsm_handle	handle;
+
+	/*
+	 * We need to create a handle that doesn't collide with any existing extra
+	 * segment created by dsm_impl_op(), so we'll make it odd.  It also
+	 * mustn't collide with any other main area pseudo-segment, so we'll
+	 * include the slot number in some of the bits.  We also want to make an
+	 * effort to avoid newly created and recently destroyed handles from being
+	 * confused, so we'll make the rest of the bits random.
+	 *
+	 * The shifts are done on uint32 values.  random() returns a signed long,
+	 * and left-shifting a signed value so that the result is no longer
+	 * representable is undefined behavior, which is what happens wherever
+	 * long is only 32 bits wide.  Unsigned shifts simply discard the high
+	 * bits, and the low 32 bits are the same as before wherever the signed
+	 * shift was well defined.
+	 */
+	handle = 1;
+	handle |= (uint32) slot << 1;
+	handle |= (uint32) random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1);
+	return handle;
+}
+
+/*
+ * Report whether a handle has its low bit set.  make_main_region_dsm_handle()
+ * always sets that bit, while handles chosen for separately created
+ * segments are kept even.
+ */
+static inline bool
+is_main_region_dsm_handle(dsm_handle handle)
+{
+	return (handle & 1) != 0;
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
new file mode 100644
index 0000000..c51e3e6
--- /dev/null
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -0,0 +1,1058 @@
+/*-------------------------------------------------------------------------
+ *
+ * dsm_impl.c
+ * manage dynamic shared memory segments
+ *
+ * This file provides low-level APIs for creating and destroying shared
+ * memory segments using several different possible techniques. We refer
+ * to these segments as dynamic because they can be created, altered, and
+ * destroyed at any point during the server life cycle. This is unlike
+ * the main shared memory segment, of which there is always exactly one
+ * and which is always mapped at a fixed address in every PostgreSQL
+ * background process.
+ *
+ * Because not all systems provide the same primitives in this area, nor
+ * do all primitives behave the same way on all systems, we provide
+ * several implementations of this facility. Many systems implement
+ * POSIX shared memory (shm_open etc.), which is well-suited to our needs
+ * in this area, with the exception that shared memory identifiers live
+ * in a flat system-wide namespace, raising the uncomfortable prospect of
+ * name collisions with other processes (including other copies of
+ * PostgreSQL) running on the same system. Some systems only support
+ * the older System V shared memory interface (shmget etc.) which is
+ * also usable; however, the default allocation limits are often quite
+ * small, and the namespace is even more restricted.
+ *
+ * We also provide an mmap-based shared memory implementation. This may
+ * be useful on systems that provide shared memory via a special-purpose
+ * filesystem; by opting for this implementation, the user can even
+ * control precisely where their shared memory segments are placed. It
+ * can also be used as a fallback for systems where shm_open and shmget
+ * are not available or can't be used for some reason. Of course,
+ * mapping a file residing on an actual spinning disk is a fairly poor
+ * approximation for shared memory because writeback may hurt performance
+ * substantially, but there should be few systems where we must make do
+ * with such poor tools.
+ *
+ * As ever, Windows requires its own implementation.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/dsm_impl.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <signal.h>
+#include <unistd.h>
+#ifndef WIN32
+#include <sys/mman.h>
+#endif
+#include <sys/stat.h>
+#ifdef HAVE_SYS_IPC_H
+#include <sys/ipc.h>
+#endif
+#ifdef HAVE_SYS_SHM_H
+#include <sys/shm.h>
+#endif
+
+#include "common/file_perm.h"
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "portability/mem.h"
+#include "postmaster/postmaster.h"
+#include "storage/dsm_impl.h"
+#include "storage/fd.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+#ifdef USE_DSM_POSIX	/* POSIX shared memory (shm_open etc.) implementation */
+static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+static int dsm_impl_posix_resize(int fd, off_t size);	/* sets the segment size; returns non-zero and sets errno on failure */
+#endif
+#ifdef USE_DSM_SYSV	/* System V shared memory (shmget etc.) implementation */
+static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_WINDOWS	/* Windows-specific implementation */
+static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+#ifdef USE_DSM_MMAP	/* mmap-based implementation */
+static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel);
+#endif
+static int errcode_for_dynamic_shared_memory(void);	/* used in place of errcode() in this file's ereport() calls; definition not in view here */
+
+const struct config_enum_entry dynamic_shared_memory_options[] = {	/* name -> DSM_IMPL_* value; only implementations compiled in (USE_DSM_*) are listed */
+#ifdef USE_DSM_POSIX
+ {"posix", DSM_IMPL_POSIX, false},
+#endif
+#ifdef USE_DSM_SYSV
+ {"sysv", DSM_IMPL_SYSV, false},
+#endif
+#ifdef USE_DSM_WINDOWS
+ {"windows", DSM_IMPL_WINDOWS, false},
+#endif
+#ifdef USE_DSM_MMAP
+ {"mmap", DSM_IMPL_MMAP, false},
+#endif
+ {NULL, 0, false}	/* NULL name: presumably the end-of-list marker -- confirm against struct config_enum_entry */
+};
+
+/*
+ * Implementation selector.  dsm_impl_op() switches on this value to choose
+ * among the DSM_IMPL_* implementations that were compiled in.
+ */
+int dynamic_shared_memory_type;
+
+/*
+ * Amount of space reserved for DSM segments in the main area.
+ * NOTE(review): the unit of this value is not visible in this file section;
+ * confirm against its definition as a setting.
+ */
+int min_dynamic_shared_memory;
+
+/* Size, in bytes presumably, of the buffer to be used for zero-filling. */
+#define ZBUFFER_SIZE 8192
+
+#define SEGMENT_NAME_PREFIX "Global/PostgreSQL"	/* NOTE(review): not referenced in the code shown here; presumably a segment-name prefix for one of the implementations -- confirm */
+
+/*------
+ * Carry out a low-level shared memory operation using whichever
+ * platform-specific implementation dynamic_shared_memory_type selects.
+ * Every implementation has to provide these primitives:
+ *
+ * DSM_OP_CREATE. Create a segment whose size is the request_size and
+ * map it.
+ *
+ * DSM_OP_ATTACH. Map an existing segment.
+ *
+ * DSM_OP_DETACH. Unmap the segment.
+ *
+ * DSM_OP_DESTROY. Unmap the segment, if it is mapped. Destroy the
+ * segment.
+ *
+ * Arguments:
+ *	 op: The operation to be performed.
+ *	 handle: The handle of an existing object, or for DSM_OP_CREATE, the
+ *	   new handle the caller wants created.
+ *	 request_size: For DSM_OP_CREATE, the requested size.  Otherwise, 0
+ *	   (asserted).
+ *	 impl_private: Private, implementation-specific data.  Will be a pointer
+ *	   to NULL for the first operation on a shared memory segment within this
+ *	   backend; thereafter, it will point to the value to which it was set
+ *	   on the previous call.
+ *	 mapped_address: Pointer to start of current mapping; pointer to NULL
+ *	   if none.  Updated with new mapping address.  Must point to NULL for
+ *	   DSM_OP_CREATE and DSM_OP_ATTACH (asserted).
+ *	 mapped_size: Pointer to size of current mapping; pointer to 0 if none.
+ *	   Updated with new mapped size.  Must point to 0 for DSM_OP_CREATE and
+ *	   DSM_OP_ATTACH (asserted).
+ *	 elevel: Level at which to log errors.
+ *
+ * Return value: true on success, false on failure.  When false is returned,
+ * a message should first be logged at the specified elevel, except in the
+ * case where DSM_OP_CREATE experiences a name collision, which should
+ * silently return false.
+ *-----
+ */
+bool
+dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size,
+			void **impl_private, void **mapped_address, Size *mapped_size,
+			int elevel)
+{
+	bool		ok = false;
+
+	Assert(op == DSM_OP_CREATE || request_size == 0);
+	Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) ||
+		   (*mapped_address == NULL && *mapped_size == 0));
+
+	/* Hand the request to the selected implementation, if compiled in. */
+	switch (dynamic_shared_memory_type)
+	{
+#ifdef USE_DSM_POSIX
+		case DSM_IMPL_POSIX:
+			ok = dsm_impl_posix(op, handle, request_size, impl_private,
+								mapped_address, mapped_size, elevel);
+			break;
+#endif
+#ifdef USE_DSM_SYSV
+		case DSM_IMPL_SYSV:
+			ok = dsm_impl_sysv(op, handle, request_size, impl_private,
+							   mapped_address, mapped_size, elevel);
+			break;
+#endif
+#ifdef USE_DSM_WINDOWS
+		case DSM_IMPL_WINDOWS:
+			ok = dsm_impl_windows(op, handle, request_size, impl_private,
+								  mapped_address, mapped_size, elevel);
+			break;
+#endif
+#ifdef USE_DSM_MMAP
+		case DSM_IMPL_MMAP:
+			ok = dsm_impl_mmap(op, handle, request_size, impl_private,
+							   mapped_address, mapped_size, elevel);
+			break;
+#endif
+		default:
+			/* Unknown or not-compiled-in implementation. */
+			elog(ERROR, "unexpected dynamic shared memory type: %d",
+				 dynamic_shared_memory_type);
+			break;
+	}
+
+	return ok;
+}
+
+#ifdef USE_DSM_POSIX
+/*
+ * Operating system primitives to support POSIX shared memory.
+ *
+ * Segments are named "/PostgreSQL.<handle>".  They are opened or created
+ * with shm_open() and removed with shm_unlink(); sizing and mapping then
+ * treat the segment as if it were a file.
+ *
+ * On some platforms that is literally how they are implemented.  Although
+ * POSIX shared memory segments appear intended to live in a flat namespace,
+ * some operating systems implement them as files, going as far as treating
+ * a request for /xyz as a request to create a file of that name in the root
+ * directory.  Users of such broken platforms should select a different
+ * shared memory implementation.
+ *
+ * Arguments and result are as described for dsm_impl_op().  impl_private is
+ * not used by this implementation.
+ */
+static bool
+dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size,
+			   void **impl_private, void **mapped_address, Size *mapped_size,
+			   int elevel)
+{
+	char		name[64];
+	int			open_flags;
+	int			fd;
+	char	   *map_start;
+
+	snprintf(name, sizeof(name), "/PostgreSQL.%u", handle);
+
+	/* Teardown: unmap if mapped, and for DESTROY also remove the name. */
+	if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+	{
+		if (*mapped_address != NULL)
+		{
+			if (munmap(*mapped_address, *mapped_size) != 0)
+			{
+				ereport(elevel,
+						(errcode_for_dynamic_shared_memory(),
+						 errmsg("could not unmap shared memory segment \"%s\": %m",
+								name)));
+				return false;
+			}
+		}
+		*mapped_address = NULL;
+		*mapped_size = 0;
+
+		if (op == DSM_OP_DESTROY)
+		{
+			if (shm_unlink(name) != 0)
+			{
+				ereport(elevel,
+						(errcode_for_dynamic_shared_memory(),
+						 errmsg("could not remove shared memory segment \"%s\": %m",
+								name)));
+				return false;
+			}
+		}
+		return true;
+	}
+
+	/*
+	 * Create a new segment, or open an existing one for attach.
+	 *
+	 * The descriptor is closed again before we return, but accounting for
+	 * it with Reserve/ReleaseExternalFD still seems desirable to reduce the
+	 * probability of an EMFILE failure.  Because the FD is held only
+	 * briefly, ReserveExternalFD rather than AcquireExternalFD is used.
+	 */
+	ReserveExternalFD();
+
+	open_flags = O_RDWR;
+	if (op == DSM_OP_CREATE)
+		open_flags |= O_CREAT | O_EXCL;
+
+	fd = shm_open(name, open_flags, PG_FILE_MODE_OWNER);
+	if (fd == -1)
+	{
+		ReleaseExternalFD();
+
+		/* A name collision during create (EEXIST) is reported silently. */
+		if (op == DSM_OP_ATTACH || errno != EEXIST)
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not open shared memory segment \"%s\": %m",
+							name)));
+		return false;
+	}
+
+	/*
+	 * When attaching, find out how big the segment currently is; when
+	 * creating, give it the requested size.
+	 */
+	if (op == DSM_OP_ATTACH)
+	{
+		struct stat st;
+
+		if (fstat(fd, &st) != 0)
+		{
+			int			saved_errno = errno;
+
+			/* Undo the open, keeping errno intact for the %m below. */
+			close(fd);
+			ReleaseExternalFD();
+			errno = saved_errno;
+
+			ereport(elevel,
+					(errcode_for_dynamic_shared_memory(),
+					 errmsg("could not stat shared memory segment \"%s\": %m",
+							name)));
+			return false;
+		}
+		request_size = st.st_size;
+	}
+	else if (dsm_impl_posix_resize(fd, request_size) != 0)
+	{
+		int			saved_errno = errno;
+
+		/* Undo the create (only CREATE reaches here); keep errno for %m. */
+		close(fd);
+		ReleaseExternalFD();
+		shm_unlink(name);
+		errno = saved_errno;
+
+		ereport(elevel,
+				(errcode_for_dynamic_shared_memory(),
+				 errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+						name, request_size)));
+		return false;
+	}
+
+	/* Map the whole segment read/write and shared. */
+	map_start = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+					 MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+	if (map_start == MAP_FAILED)
+	{
+		int			saved_errno = errno;
+
+		/* Undo the open, and the create if we made the segment ourselves. */
+		close(fd);
+		ReleaseExternalFD();
+		if (op == DSM_OP_CREATE)
+			shm_unlink(name);
+		errno = saved_errno;
+
+		ereport(elevel,
+				(errcode_for_dynamic_shared_memory(),
+				 errmsg("could not map shared memory segment \"%s\": %m",
+						name)));
+		return false;
+	}
+
+	/* Success: publish the mapping; the descriptor is not kept open. */
+	*mapped_address = map_start;
+	*mapped_size = request_size;
+	close(fd);
+	ReleaseExternalFD();
+
+	return true;
+}
+
+/*
+ * Set the size of a virtual memory region associated with a file descriptor.
+ * If necessary, also ensure that virtual memory is actually allocated by the
+ * operating system, to avoid nasty surprises later.
+ *
+ * fd is the descriptor to resize and size is the new length in bytes.
+ *
+ * When running under the postmaster, the signals in BlockSig are blocked for
+ * the duration of the call; the caller's previous signal mask is restored
+ * before returning, with errno saved and restored around that step so the
+ * failure reason is not lost.
+ *
+ * Returns non-zero if either truncation or allocation fails, and sets errno.
+ */
+static int
+dsm_impl_posix_resize(int fd, off_t size)
+{
+ int rc;
+ int save_errno;
+ sigset_t save_sigmask;
+
+ /*
+ * Block all blockable signals, except SIGQUIT. posix_fallocate() can run
+ * for quite a long time, and is an all-or-nothing operation. If we
+ * allowed SIGUSR1 to interrupt us repeatedly (for example, due to recovery
+ * conflicts), the retry loop might never succeed.
+ */
+ if (IsUnderPostmaster)
+ sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask);
+
+ /* Truncate (or extend) the file to the requested size. */
+ do
+ {
+ rc = ftruncate(fd, size);
+ } while (rc < 0 && errno == EINTR);
+
+ /*
+ * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with
+ * ftruncate, the file may contain a hole. Accessing memory backed by a
+ * hole causes tmpfs to allocate pages, which fails with SIGBUS if there
+ * is no more tmpfs space available. So we ask tmpfs to allocate pages
+ * here, so we can fail gracefully with ENOSPC now rather than risking
+ * SIGBUS later.
+ */
+#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__)
+ if (rc == 0)
+ {
+ /*
+ * We still use a traditional EINTR retry loop to handle SIGCONT.
+ * posix_fallocate() doesn't restart automatically, and we don't want
+ * this to fail if you attach a debugger.
+ */
+ pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
+ do
+ {
+ rc = posix_fallocate(fd, 0, size);
+ } while (rc == EINTR);
+ pgstat_report_wait_end();
+
+ /*
+ * The caller expects errno to be set, but posix_fallocate() doesn't
+ * set it. Instead it returns error numbers directly. So set errno,
+ * even though we'll also return rc to indicate success or failure.
+ */
+ errno = rc;
+ }
+#endif /* HAVE_POSIX_FALLOCATE && __linux__ */
+
+ if (IsUnderPostmaster)
+ {
+ save_errno = errno;
+ sigprocmask(SIG_SETMASK, &save_sigmask, NULL);
+ errno = save_errno;
+ }
+
+ return rc;
+}
+
+#endif /* USE_DSM_POSIX */
+
+#ifdef USE_DSM_SYSV
+/*
+ * Operating system primitives to support System V shared memory.
+ *
+ * System V shared memory segments are manipulated using shmget(), shmat(),
+ * shmdt(), and shmctl().  As the default allocation limits for System V
+ * shared memory are usually quite low, the POSIX facilities may be
+ * preferable; but those are not supported everywhere.
+ *
+ * op selects create, attach, detach or destroy.  *impl_private caches the
+ * shared memory identifier between calls; *mapped_address and *mapped_size
+ * describe the current mapping and are updated on success.  Returns true on
+ * success.  On failure returns false, reporting the problem at elevel except
+ * when a create operation merely collided with an existing key.
+ */
+static bool
+dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size,
+              void **impl_private, void **mapped_address, Size *mapped_size,
+              int elevel)
+{
+    key_t       key;
+    int         ident;
+    char       *address;
+    char        name[64];
+    int        *ident_cache;
+
+    /*
+     * POSIX shared memory and mmap-based shared memory identify segments with
+     * names.  To avoid needless error message variation, we use the handle as
+     * the name.
+     */
+    snprintf(name, sizeof(name), "%u", handle);
+
+    /*
+     * The System V shared memory namespace is very restricted; names are of
+     * type key_t, which is expected to be some sort of integer data type, but
+     * not necessarily the same one as dsm_handle.  Since we use dsm_handle to
+     * identify shared memory segments across processes, this might seem like
+     * a problem, but it's really not.  If dsm_handle is bigger than key_t,
+     * the cast below might truncate away some bits from the user-provided
+     * handle, but it'll truncate exactly the same bits away in exactly the
+     * same fashion every time we use that handle, which is all that really
+     * matters.  Conversely, if dsm_handle is smaller than key_t, we won't use
+     * the full range of available key space, but that's no big deal either.
+     *
+     * We do make sure that the key isn't negative, because that might not be
+     * portable.
+     */
+    key = (key_t) handle;
+    if (key < 1)                /* avoid compiler warning if type is unsigned */
+        key = -key;
+
+    /*
+     * There's one special key, IPC_PRIVATE, which can't be used.  If we end
+     * up with that value by chance during a create operation, just pretend it
+     * already exists, so that caller will retry.  If we run into it anywhere
+     * else, the caller has passed a handle that doesn't correspond to
+     * anything we ever created, which should not happen.
+     */
+    if (key == IPC_PRIVATE)
+    {
+        if (op != DSM_OP_CREATE)
+            elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE");
+        errno = EEXIST;
+        return false;
+    }
+
+    /*
+     * Before we can do anything with a shared memory segment, we have to map
+     * the shared memory key to a shared memory identifier using shmget().  To
+     * avoid repeated lookups, we store the identifier using impl_private.
+     */
+    if (*impl_private != NULL)
+    {
+        ident_cache = *impl_private;
+        ident = *ident_cache;
+    }
+    else
+    {
+        int         flags = IPCProtection;
+        size_t      segsize;
+
+        /*
+         * Allocate the memory BEFORE acquiring the resource, so that we don't
+         * leak the resource if memory allocation fails.
+         */
+        ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int));
+
+        /*
+         * When using shmget to find an existing segment, we must pass the
+         * size as 0.  Passing a non-zero size which is greater than the
+         * actual size will result in EINVAL.
+         */
+        segsize = 0;
+
+        if (op == DSM_OP_CREATE)
+        {
+            flags |= IPC_CREAT | IPC_EXCL;
+            segsize = request_size;
+        }
+
+        if ((ident = shmget(key, segsize, flags)) == -1)
+        {
+            int         save_errno = errno;
+
+            /*
+             * Release the cache cell on every failure path.  This includes
+             * the silent EEXIST collision during create, where the caller
+             * simply retries with another handle; otherwise each collision
+             * would leak a chunk of TopMemoryContext.
+             */
+            pfree(ident_cache);
+            errno = save_errno;
+
+            if (op == DSM_OP_ATTACH || save_errno != EEXIST)
+                ereport(elevel,
+                        (errcode_for_dynamic_shared_memory(),
+                         errmsg("could not get shared memory segment: %m")));
+            return false;
+        }
+
+        *ident_cache = ident;
+        *impl_private = ident_cache;
+    }
+
+    /* Handle teardown cases. */
+    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+    {
+        pfree(ident_cache);
+        *impl_private = NULL;
+        if (*mapped_address != NULL && shmdt(*mapped_address) != 0)
+        {
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not unmap shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        *mapped_address = NULL;
+        *mapped_size = 0;
+        if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0)
+        {
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not remove shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        return true;
+    }
+
+    /* If we're attaching it, we must use IPC_STAT to determine the size. */
+    if (op == DSM_OP_ATTACH)
+    {
+        struct shmid_ds shm;
+
+        if (shmctl(ident, IPC_STAT, &shm) != 0)
+        {
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not stat shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        request_size = shm.shm_segsz;
+    }
+
+    /* Map it. */
+    address = shmat(ident, NULL, PG_SHMAT_FLAGS);
+    if (address == (void *) -1)
+    {
+        int         save_errno;
+
+        /* Back out what's already been done. */
+        save_errno = errno;
+        if (op == DSM_OP_CREATE)
+            shmctl(ident, IPC_RMID, NULL);
+        errno = save_errno;
+
+        ereport(elevel,
+                (errcode_for_dynamic_shared_memory(),
+                 errmsg("could not map shared memory segment \"%s\": %m",
+                        name)));
+        return false;
+    }
+    *mapped_address = address;
+    *mapped_size = request_size;
+
+    return true;
+}
+#endif
+
+#ifdef USE_DSM_WINDOWS
+/*
+ * Operating system primitives to support Windows shared memory.
+ *
+ * Windows shared memory implementation is done using file mapping
+ * which can be backed by either physical file or system paging file.
+ * Current implementation uses system paging file as other effects
+ * like performance are not clear for physical file and it is used in similar
+ * way for main shared memory in windows.
+ *
+ * A memory mapping object is a kernel object - they always get deleted when
+ * the last reference to them goes away, either explicitly via a CloseHandle or
+ * when the process containing the reference exits.
+ *
+ * For DSM_OP_CREATE and DSM_OP_ATTACH, success stores the mapping handle in
+ * *impl_private, the view address in *mapped_address, and the RegionSize
+ * reported by VirtualQuery in *mapped_size. For DSM_OP_DETACH and
+ * DSM_OP_DESTROY, the view is unmapped, the handle is closed, and all three
+ * outputs are reset.
+ *
+ * Returns true on success. On failure returns false after reporting the
+ * problem at elevel, except that a create attempt which hits
+ * ERROR_ALREADY_EXISTS or ERROR_ACCESS_DENIED returns false silently.
+ */
+static bool
+dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size,
+ void **impl_private, void **mapped_address,
+ Size *mapped_size, int elevel)
+{
+ char *address;
+ HANDLE hmap;
+ char name[64];
+ MEMORY_BASIC_INFORMATION info;
+
+ /*
+ * Storing the shared memory segment in the Global\ namespace, can allow
+ * any process running in any session to access that file mapping object
+ * provided that the caller has the required access rights. But to avoid
+ * issues faced in main shared memory, we are using the naming convention
+ * similar to main shared memory. We can change here once issue mentioned
+ * in GetSharedMemName is resolved.
+ */
+ snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+
+ /*
+ * Handle teardown cases. Since Windows automatically destroys the object
+ * when no references remain, we can treat it the same as detach.
+ */
+ if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+ {
+ if (*mapped_address != NULL
+ && UnmapViewOfFile(*mapped_address) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not unmap shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ if (*impl_private != NULL
+ && CloseHandle(*impl_private) == 0)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not remove shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *impl_private = NULL;
+ *mapped_address = NULL;
+ *mapped_size = 0;
+ return true;
+ }
+
+ /* Create new segment or open an existing one for attach. */
+ if (op == DSM_OP_CREATE)
+ {
+ DWORD size_high;
+ DWORD size_low;
+ DWORD errcode;
+
+ /* Shifts >= the width of the type are undefined. */
+#ifdef _WIN64
+ size_high = request_size >> 32;
+#else
+ size_high = 0;
+#endif
+ size_low = (DWORD) request_size;
+
+ /* CreateFileMapping might not clear the error code on success */
+ SetLastError(0);
+
+ hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */
+ NULL, /* Default security attrs */
+ PAGE_READWRITE, /* Memory is read/write */
+ size_high, /* Upper 32 bits of size */
+ size_low, /* Lower 32 bits of size */
+ name);
+
+ errcode = GetLastError();
+ if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED)
+ {
+ /*
+ * On Windows, when the segment already exists, a handle for the
+ * existing segment is returned. We must close it before
+ * returning. However, if the existing segment is created by a
+ * service, then it returns ERROR_ACCESS_DENIED. We don't do
+ * _dosmaperr here, so errno won't be modified.
+ */
+ if (hmap)
+ CloseHandle(hmap);
+ return false;
+ }
+
+ if (!hmap)
+ {
+ _dosmaperr(errcode);
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not create shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+ else
+ {
+ hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ,
+ FALSE, /* do not inherit the handle */
+ name); /* name of mapping object */
+ if (!hmap)
+ {
+ _dosmaperr(GetLastError());
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not open shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+ }
+
+ /* Map it. */
+ address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ,
+ 0, 0, 0);
+ if (!address)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not map shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ /*
+ * VirtualQuery gives size in page_size units, which is 4K for Windows. We
+ * need size only when we are attaching, but it's better to get the size
+ * when creating new segment to keep size consistent both for
+ * DSM_OP_CREATE and DSM_OP_ATTACH.
+ */
+ if (VirtualQuery(address, &info, sizeof(info)) == 0)
+ {
+ int save_errno;
+
+ _dosmaperr(GetLastError());
+ /* Back out what's already been done. */
+ save_errno = errno;
+ UnmapViewOfFile(address);
+ CloseHandle(hmap);
+ errno = save_errno;
+
+ ereport(elevel,
+ (errcode_for_dynamic_shared_memory(),
+ errmsg("could not stat shared memory segment \"%s\": %m",
+ name)));
+ return false;
+ }
+
+ *mapped_address = address;
+ *mapped_size = info.RegionSize;
+ *impl_private = hmap;
+
+ return true;
+}
+#endif
+
+#ifdef USE_DSM_MMAP
+/*
+ * Operating system primitives to support mmap-based shared memory.
+ *
+ * Calling this "shared memory" is somewhat of a misnomer, because what
+ * we're really doing is creating a bunch of files and mapping them into
+ * our address space.  The operating system may feel obliged to
+ * synchronize the contents to disk even if nothing is being paged out,
+ * which will not serve us well.  The user can relocate the pg_dynshmem
+ * directory to a ramdisk to avoid this problem, if available.
+ *
+ * op selects create, attach, detach or destroy.  On a successful create or
+ * attach, *mapped_address and *mapped_size describe the new mapping; on
+ * detach or destroy they are reset.  impl_private is not used here.
+ * Returns true on success.  On failure returns false, reporting the problem
+ * at elevel except when a create operation found the file already present.
+ */
+static bool
+dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
+              void **impl_private, void **mapped_address, Size *mapped_size,
+              int elevel)
+{
+    char        name[64];
+    int         flags;
+    int         fd;
+    char       *address;
+
+    snprintf(name, sizeof(name),
+             PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u",
+             handle);
+
+    /* Handle teardown cases. */
+    if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY)
+    {
+        if (*mapped_address != NULL
+            && munmap(*mapped_address, *mapped_size) != 0)
+        {
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not unmap shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        *mapped_address = NULL;
+        *mapped_size = 0;
+        if (op == DSM_OP_DESTROY && unlink(name) != 0)
+        {
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not remove shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        return true;
+    }
+
+    /* Create new segment or open an existing one for attach. */
+    flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0);
+    if ((fd = OpenTransientFile(name, flags)) == -1)
+    {
+        if (op == DSM_OP_ATTACH || errno != EEXIST)
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not open shared memory segment \"%s\": %m",
+                            name)));
+        return false;
+    }
+
+    /*
+     * If we're attaching the segment, determine the current size; if we are
+     * creating the segment, set the size to the requested value.
+     */
+    if (op == DSM_OP_ATTACH)
+    {
+        struct stat st;
+
+        if (fstat(fd, &st) != 0)
+        {
+            int         save_errno;
+
+            /* Back out what's already been done. */
+            save_errno = errno;
+            CloseTransientFile(fd);
+            errno = save_errno;
+
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not stat shared memory segment \"%s\": %m",
+                            name)));
+            return false;
+        }
+        request_size = st.st_size;
+    }
+    else
+    {
+        /*
+         * Allocate a buffer full of zeros.
+         *
+         * Note: palloc zbuffer, instead of just using a local char array, to
+         * ensure it is reasonably well-aligned; this may save a few cycles
+         * transferring data to the kernel.
+         */
+        char       *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
+
+        /*
+         * Track the outstanding byte count with the same width as
+         * request_size.  A 32-bit counter would silently truncate requests of
+         * 4GB or more, leaving the file shorter than the region we are about
+         * to map.
+         */
+        Size        remaining = request_size;
+        bool        success = true;
+        int         write_errno = 0;
+
+        /*
+         * Zero-fill the file.  We have to do this the hard way to ensure that
+         * all the file space has really been allocated, so that we don't
+         * later seg fault when accessing the memory mapping.  This is pretty
+         * pessimal.
+         */
+        while (success && remaining > 0)
+        {
+            Size        goal = remaining;
+
+            if (goal > ZBUFFER_SIZE)
+                goal = ZBUFFER_SIZE;
+            pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE);
+
+            /*
+             * Clear errno first: a short write does not set it, and we must
+             * not mistake a stale value for the cause of the failure.
+             */
+            errno = 0;
+            if (write(fd, zbuffer, goal) == goal)
+                remaining -= goal;
+            else
+            {
+                write_errno = errno;
+                success = false;
+            }
+            pgstat_report_wait_end();
+        }
+
+        /* The zero buffer is no longer needed on either path. */
+        pfree(zbuffer);
+
+        if (!success)
+        {
+            /* Back out what's already been done. */
+            CloseTransientFile(fd);
+            unlink(name);
+
+            /* If write didn't set errno, assume we ran out of disk space. */
+            errno = write_errno ? write_errno : ENOSPC;
+
+            ereport(elevel,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m",
+                            name, request_size)));
+            return false;
+        }
+    }
+
+    /* Map it. */
+    address = mmap(NULL, request_size, PROT_READ | PROT_WRITE,
+                   MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0);
+    if (address == MAP_FAILED)
+    {
+        int         save_errno;
+
+        /* Back out what's already been done. */
+        save_errno = errno;
+        CloseTransientFile(fd);
+        if (op == DSM_OP_CREATE)
+            unlink(name);
+        errno = save_errno;
+
+        ereport(elevel,
+                (errcode_for_dynamic_shared_memory(),
+                 errmsg("could not map shared memory segment \"%s\": %m",
+                        name)));
+        return false;
+    }
+    *mapped_address = address;
+    *mapped_size = request_size;
+
+    if (CloseTransientFile(fd) != 0)
+    {
+        ereport(elevel,
+                (errcode_for_file_access(),
+                 errmsg("could not close shared memory segment \"%s\": %m",
+                        name)));
+        return false;
+    }
+
+    return true;
+}
+#endif
+
+/*
+ * Do whatever the active implementation requires so that a segment survives
+ * even while no backend has it attached.
+ *
+ * Only Windows needs any work: it destroys a mapping object as soon as the
+ * last reference goes away, so we duplicate our handle into the postmaster
+ * process.  The postmaster takes no action to receive the handle; Windows
+ * transfers it automatically.  Every other implementation does nothing.
+ */
+void
+dsm_impl_pin_segment(dsm_handle handle, void *impl_private,
+                     void **impl_private_pm_handle)
+{
+#ifdef USE_DSM_WINDOWS
+    if (dynamic_shared_memory_type == DSM_IMPL_WINDOWS)
+    {
+        HANDLE      pm_copy;
+
+        if (!DuplicateHandle(GetCurrentProcess(), impl_private,
+                             PostmasterHandle, &pm_copy, 0, FALSE,
+                             DUPLICATE_SAME_ACCESS))
+        {
+            char        segname[64];
+
+            snprintf(segname, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+            _dosmaperr(GetLastError());
+            ereport(ERROR,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not duplicate handle for \"%s\": %m",
+                            segname)));
+        }
+
+        /*
+         * Hand back the handle value that now exists in the postmaster.  It
+         * is not usable in any process other than the postmaster, but that
+         * is fine: we keep it only so that dsm_impl_unpin_segment can close
+         * it if the segment is unpinned later.
+         */
+        *impl_private_pm_handle = pm_copy;
+    }
+#endif
+}
+
+/*
+ * Undo dsm_impl_pin_segment, so that the segment is cleaned up once every
+ * backend has detached from it.
+ *
+ * Only Windows needs any work: we close the extra handle that pinning
+ * created in the postmaster's process space and clear *impl_private.  Every
+ * other implementation does nothing.
+ */
+void
+dsm_impl_unpin_segment(dsm_handle handle, void **impl_private)
+{
+#ifdef USE_DSM_WINDOWS
+    if (dynamic_shared_memory_type == DSM_IMPL_WINDOWS)
+    {
+        /* DUPLICATE_CLOSE_SOURCE closes the handle held by the postmaster. */
+        if (*impl_private &&
+            !DuplicateHandle(PostmasterHandle, *impl_private,
+                             NULL, NULL, 0, FALSE,
+                             DUPLICATE_CLOSE_SOURCE))
+        {
+            char        segname[64];
+
+            snprintf(segname, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle);
+            _dosmaperr(GetLastError());
+            ereport(ERROR,
+                    (errcode_for_dynamic_shared_memory(),
+                     errmsg("could not duplicate handle for \"%s\": %m",
+                            segname)));
+        }
+
+        *impl_private = NULL;
+    }
+#endif
+}
+
+/*
+ * Choose the SQLSTATE for a dynamic shared memory failure from the current
+ * errno: EFBIG and ENOMEM are reported as out-of-memory, anything else is
+ * classified the same way as an ordinary file access error.
+ */
+static int
+errcode_for_dynamic_shared_memory(void)
+{
+    bool        out_of_memory = (errno == EFBIG || errno == ENOMEM);
+
+    return out_of_memory ? errcode(ERRCODE_OUT_OF_MEMORY)
+        : errcode_for_file_access();
+}
diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c
new file mode 100644
index 0000000..4045d7d
--- /dev/null
+++ b/src/backend/storage/ipc/ipc.c
@@ -0,0 +1,435 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipc.c
+ * POSTGRES inter-process communication definitions.
+ *
+ * This file is misnamed, as it no longer has much of anything directly
+ * to do with IPC. The functionality here is concerned with managing
+ * exit-time cleanup for either a postmaster or a backend.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "miscadmin.h"
+#ifdef PROFILE_PID_DIR
+#include "postmaster/autovacuum.h"
+#endif
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "tcop/tcopprot.h"
+
+
+/*
+ * This flag is set during proc_exit() to change ereport()'s behavior,
+ * so that an ereport() from an on_proc_exit routine cannot get us out
+ * of the exit procedure. We do NOT want to go back to the idle loop...
+ */
+bool proc_exit_inprogress = false;
+
+/*
+ * Set when shmem_exit() is in progress.
+ */
+bool shmem_exit_inprogress = false;
+
+/*
+ * This flag tracks whether we've called atexit() in the current process
+ * (or in the parent postmaster).
+ */
+static bool atexit_callback_setup = false;
+
+/* local functions */
+static void proc_exit_prepare(int code);
+
+
+/* ----------------------------------------------------------------
+ * exit() handling stuff
+ *
+ * These functions are in generally the same spirit as atexit(),
+ * but provide some additional features we need --- in particular,
+ * we want to register callbacks to invoke when we are disconnecting
+ * from a broken shared-memory context but not exiting the postmaster.
+ *
+ * Callback functions can take zero, one, or two args: the first passed
+ * arg is the integer exitcode, the second is the Datum supplied when
+ * the callback was registered.
+ *
+ * Each of the three callback lists declared below is a fixed-size array
+ * of MAX_ON_EXITS entries, where a struct ONEXIT entry pairs a callback
+ * with its registered argument. The matching *_index variable counts
+ * the entries currently in use in that list.
+ * ----------------------------------------------------------------
+ */
+
+#define MAX_ON_EXITS 20
+
+struct ONEXIT
+{
+ pg_on_exit_callback function;
+ Datum arg;
+};
+
+static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS];
+static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS];
+static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS];
+
+static int on_proc_exit_index,
+ on_shmem_exit_index,
+ before_shmem_exit_index;
+
+
+/* ----------------------------------------------------------------
+ *      proc_exit
+ *
+ *      Run every registered exit callback (to free resources) and
+ *      then terminate the process with the given exit code.
+ *
+ *      This is meant to be the only caller of exit().  Add-on code
+ *      cannot be forced to obey that rule, so an atexit callback is
+ *      registered as well to make sure cleanup still happens when
+ *      somebody calls exit() directly.
+ * ----------------------------------------------------------------
+ */
+void
+proc_exit(int code)
+{
+    /* Run all cleanup callbacks before anything else. */
+    proc_exit_prepare(code);
+
+#ifdef PROFILE_PID_DIR
+    {
+        /*
+         * When profiling, gprof's mcleanup() is about to write ./gmon.out.
+         * Because that file name is fixed, each backend would overwrite the
+         * profiles written earlier.  To keep them apart we create a
+         * per-backend subdirectory (./gprof/pid) and chdir into it before
+         * calling exit(), giving e.g. $PGDATA/gprof/8829/gmon.out.
+         *
+         * Autovacuum workers all share a single subdirectory; otherwise an
+         * idle installation would still consume megabytes of disk space
+         * every few seconds.
+         *
+         * This is done here rather than in an on_proc_exit() callback so
+         * that it runs last and cannot interfere with other callbacks.  For
+         * the same reason it is not part of proc_exit_prepare, so a process
+         * that exits the "wrong way" will not drop its profile in a nice
+         * place.
+         */
+        char        profile_dir[32];
+
+        if (IsAutoVacuumWorkerProcess())
+            snprintf(profile_dir, sizeof(profile_dir), "gprof/avworker");
+        else
+            snprintf(profile_dir, sizeof(profile_dir), "gprof/%d",
+                     (int) getpid());
+
+        /*
+         * Plain mkdir() rather than MakePGDirectory(): these are not PG
+         * directories.
+         */
+        mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO);
+        mkdir(profile_dir, S_IRWXU | S_IRWXG | S_IRWXO);
+        chdir(profile_dir);
+    }
+#endif
+
+    elog(DEBUG3, "exit(%d)", code);
+
+    exit(code);
+}
+
+/*
+ * Cleanup work common to proc_exit and the atexit handler.  On a normal exit
+ * through proc_exit this runs twice, but the second invocation finds nothing
+ * left to do.
+ */
+static void
+proc_exit_prepare(int code)
+{
+    /*
+     * From here on we are committed to exiting: any ereport() returns
+     * control to this exit sequence rather than to the main loop.
+     */
+    proc_exit_inprogress = true;
+
+    /*
+     * Drop any pending cancel or die requests, since we are already shutting
+     * down.  The signal handlers will not raise these flags again now that
+     * proc_exit_inprogress is set.
+     */
+    InterruptPending = false;
+    ProcDiePending = false;
+    QueryCancelPending = false;
+    InterruptHoldoffCount = 1;
+    CritSectionCount = 0;
+
+    /*
+     * Empty the error context stack so that elog/ereport calls made during
+     * exit do not invoke error callbacks.  Their context is unlikely to be
+     * relevant, and they may well fail outright once things like transaction
+     * abort have happened.  (The stack is normally empty at this point, but
+     * not necessarily after elog(FATAL), for example.)
+     */
+    error_context_stack = NULL;
+    /* Likewise forget debug_query_string before it gets clobbered. */
+    debug_query_string = NULL;
+
+    /* Shared-memory cleanup goes first. */
+    shmem_exit(code);
+
+    elog(DEBUG3, "proc_exit(%d): %d callbacks to make",
+         code, on_proc_exit_index);
+
+    /*
+     * Invoke the registered callbacks, newest first.
+     *
+     * Each entry is popped before it is called, so if a callback throws
+     * ereport(ERROR) or ereport(FATAL) and control re-enters here, neither
+     * that callback nor the ones already completed run again.  An infinite
+     * loop is therefore not possible.
+     */
+    while (on_proc_exit_index > 0)
+    {
+        struct ONEXIT *entry = &on_proc_exit_list[--on_proc_exit_index];
+
+        entry->function(code, entry->arg);
+    }
+
+    on_proc_exit_index = 0;
+}
+
+/* ------------------
+ * Run all of the on_shmem_exit routines --- but don't actually exit.
+ * The postmaster uses this to re-initialize shared memory and semaphores
+ * after a backend dies horribly.  As in proc_exit(), every callback is
+ * removed from its list before being called, so an error inside a
+ * callback cannot cause an infinite loop.
+ * ------------------
+ */
+void
+shmem_exit(int code)
+{
+    shmem_exit_inprogress = true;
+
+    /*
+     * Stage 1: before_shmem_exit callbacks.
+     *
+     * These need most of the system to still be working (for instance
+     * cleanup of temp relations, which requires catalog access), or must
+     * complete because later cleanup steps depend on them (for instance
+     * releasing lwlocks).
+     */
+    elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make",
+         code, before_shmem_exit_index);
+    while (before_shmem_exit_index > 0)
+    {
+        struct ONEXIT *entry =
+            &before_shmem_exit_list[--before_shmem_exit_index];
+
+        entry->function(code, entry->arg);
+    }
+    before_shmem_exit_index = 0;
+
+    /*
+     * Stage 2: dynamic shared memory callbacks.
+     *
+     * These play the same role as the late callbacks, but for dynamic shared
+     * memory segments instead of the main segment.  dsm_backend_shutdown()
+     * uses the same progressive scheme as we do here: it unregisters each
+     * callback before invoking it, so an ERROR or FATAL thrown by a callback
+     * cannot trap us in an infinite loop.
+     *
+     * Calling it directly, rather than registering it as an on_shmem_exit
+     * callback, matters for exactly that reason: if one dynamic shared
+     * memory callback errors out, the remaining ones are still invoked.
+     * Hard-coding the call puts it on equal footing with the callbacks for
+     * the main shared memory segment.
+     */
+    dsm_backend_shutdown();
+
+    /*
+     * Stage 3: on_shmem_exit callbacks.
+     *
+     * These generally release low-level shared memory resources.  Some act
+     * as a backstop in case the early callbacks themselves failed and caused
+     * re-entry to this routine; others are cleanup that only happens at
+     * process exit.
+     */
+    elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make",
+         code, on_shmem_exit_index);
+    while (on_shmem_exit_index > 0)
+    {
+        struct ONEXIT *entry = &on_shmem_exit_list[--on_shmem_exit_index];
+
+        entry->function(code, entry->arg);
+    }
+    on_shmem_exit_index = 0;
+
+    shmem_exit_inprogress = false;
+}
+
+/* ----------------------------------------------------------------
+ *      atexit_callback
+ *
+ *      Safety net so that code calling exit() directly still gets
+ *      our cleanup run.
+ *
+ *      A truly uncooperative caller could use _exit() instead; for
+ *      that case there is a "dead man switch" that makes the
+ *      postmaster treat it as a crash --- see pmsignal.c.
+ * ----------------------------------------------------------------
+ */
+static void
+atexit_callback(void)
+{
+    /*
+     * Run the full cleanup sequence.  The real exit status is not available
+     * from inside an atexit handler, so -1 is passed instead.
+     */
+    proc_exit_prepare(-1);
+}
+
+/* ----------------------------------------------------------------
+ *      on_proc_exit
+ *
+ *      Append a callback, together with its argument, to the list
+ *      of functions that proc_exit() invokes.
+ * ----------------------------------------------------------------
+ */
+void
+on_proc_exit(pg_on_exit_callback function, Datum arg)
+{
+    struct ONEXIT *slot;
+
+    if (on_proc_exit_index >= MAX_ON_EXITS)
+        ereport(FATAL,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg_internal("out of on_proc_exit slots")));
+
+    /* Fill the slot completely before publishing it via the index. */
+    slot = &on_proc_exit_list[on_proc_exit_index];
+    slot->function = function;
+    slot->arg = arg;
+    on_proc_exit_index++;
+
+    /* Make sure the atexit backstop is installed exactly once. */
+    if (atexit_callback_setup)
+        return;
+    atexit(atexit_callback);
+    atexit_callback_setup = true;
+}
+
+/* ----------------------------------------------------------------
+ *      before_shmem_exit
+ *
+ *      Register an early callback for user-level cleanup (for
+ *      example transaction abort) that must run before the
+ *      low-level subsystems start shutting down.
+ * ----------------------------------------------------------------
+ */
+void
+before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+    struct ONEXIT *slot;
+
+    if (before_shmem_exit_index >= MAX_ON_EXITS)
+        ereport(FATAL,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg_internal("out of before_shmem_exit slots")));
+
+    /* Fill the slot completely before publishing it via the index. */
+    slot = &before_shmem_exit_list[before_shmem_exit_index];
+    slot->function = function;
+    slot->arg = arg;
+    before_shmem_exit_index++;
+
+    /* Make sure the atexit backstop is installed exactly once. */
+    if (atexit_callback_setup)
+        return;
+    atexit(atexit_callback);
+    atexit_callback_setup = true;
+}
+
+/* ----------------------------------------------------------------
+ *      on_shmem_exit
+ *
+ *      Register an ordinary callback for low-level shutdown work
+ *      (for example releasing our PGPROC).  These run after the
+ *      before_shmem_exit callbacks and before the on_proc_exit
+ *      callbacks.
+ * ----------------------------------------------------------------
+ */
+void
+on_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+    struct ONEXIT *slot;
+
+    if (on_shmem_exit_index >= MAX_ON_EXITS)
+        ereport(FATAL,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg_internal("out of on_shmem_exit slots")));
+
+    /* Fill the slot completely before publishing it via the index. */
+    slot = &on_shmem_exit_list[on_shmem_exit_index];
+    slot->function = function;
+    slot->arg = arg;
+    on_shmem_exit_index++;
+
+    /* Make sure the atexit backstop is installed exactly once. */
+    if (atexit_callback_setup)
+        return;
+    atexit(atexit_callback);
+    atexit_callback_setup = true;
+}
+
+/* ----------------------------------------------------------------
+ *      cancel_before_shmem_exit
+ *
+ *      Remove a previously registered before_shmem_exit callback.
+ *      Only the most recently added entry is considered, because
+ *      callers are expected to add and remove temporary
+ *      before_shmem_exit callbacks in strict LIFO order.
+ * ----------------------------------------------------------------
+ */
+void
+cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg)
+{
+    bool        is_latest = false;
+
+    /* Does the newest entry match the given callback and argument? */
+    if (before_shmem_exit_index > 0)
+    {
+        struct ONEXIT *top =
+            &before_shmem_exit_list[before_shmem_exit_index - 1];
+
+        is_latest = (top->function == function && top->arg == arg);
+    }
+
+    if (!is_latest)
+        elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry",
+             function, (long long) arg);
+    else
+        --before_shmem_exit_index;
+}
+
+/* ----------------------------------------------------------------
+ *      on_exit_reset
+ *
+ *      Forget every registered on_proc_exit(), on_shmem_exit() and
+ *      before_shmem_exit() callback, and reset the DSM detach
+ *      callbacks too.  Used right after forking a backend, so that
+ *      the child does not think it should run the postmaster's
+ *      on-exit routines when it exits.
+ * ----------------------------------------------------------------
+ */
+void
+on_exit_reset(void)
+{
+    /* Emptying a list is just a matter of zeroing its entry count. */
+    on_proc_exit_index = 0;
+    on_shmem_exit_index = 0;
+    before_shmem_exit_index = 0;
+
+    reset_on_dsm_detach();
+}
+
+/* ----------------------------------------------------------------
+ *      check_on_shmem_exit_lists_are_empty
+ *
+ *      Debugging aid: complain with FATAL if any shmem cleanup
+ *      handler has already been registered in the current process.
+ * ----------------------------------------------------------------
+ */
+void
+check_on_shmem_exit_lists_are_empty(void)
+{
+    if (before_shmem_exit_index != 0)
+        elog(FATAL, "before_shmem_exit has been called prematurely");
+
+    if (on_shmem_exit_index != 0)
+        elog(FATAL, "on_shmem_exit has been called prematurely");
+
+    /* Given the two checks above, looking at DSM detach state seems unnecessary. */
+}
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
new file mode 100644
index 0000000..3e4ec53
--- /dev/null
+++ b/src/backend/storage/ipc/ipci.c
@@ -0,0 +1,291 @@
+/*-------------------------------------------------------------------------
+ *
+ * ipci.c
+ * POSTGRES inter-process communication initialization code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/ipci.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/clog.h"
+#include "access/commit_ts.h"
+#include "access/heapam.h"
+#include "access/multixact.h"
+#include "access/nbtree.h"
+#include "access/subtrans.h"
+#include "access/syncscan.h"
+#include "access/twophase.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/autovacuum.h"
+#include "postmaster/bgworker_internals.h"
+#include "postmaster/bgwriter.h"
+#include "postmaster/postmaster.h"
+#include "replication/logicallauncher.h"
+#include "replication/origin.h"
+#include "replication/slot.h"
+#include "replication/walreceiver.h"
+#include "replication/walsender.h"
+#include "storage/bufmgr.h"
+#include "storage/dsm.h"
+#include "storage/ipc.h"
+#include "storage/pg_shmem.h"
+#include "storage/pmsignal.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/procsignal.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+#include "utils/snapmgr.h"
+
+/* GUCs */
+int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE;
+
+/*
+ * Optional hook; when non-NULL it is invoked as the final step of
+ * CreateSharedMemoryAndSemaphores().
+ */
+shmem_startup_hook_type shmem_startup_hook = NULL;
+
+/* Running total of the sizes accepted by RequestAddinShmemSpace() */
+static Size total_addin_request = 0;
+/* Set to false once CreateSharedMemoryAndSemaphores() has sized the segment */
+static bool addin_request_allowed = true;
+
+
+/*
+ * RequestAddinShmemSpace
+ * Request that extra shmem space be allocated for use by
+ * a loadable module.
+ *
+ * This is only useful if called from the _PG_init hook of a library that
+ * is loaded into the postmaster via shared_preload_libraries. Once
+ * shared memory has been allocated, calls will be ignored. (We could
+ * raise an error, but it seems better to make it a no-op, so that
+ * libraries containing such calls can be reloaded if needed.)
+ */
+void
+RequestAddinShmemSpace(Size size)
+{
+    /*
+     * Accumulate the request only while we are not under the postmaster and
+     * the total has not been frozen yet; otherwise it is too late and the
+     * call is silently a no-op.
+     */
+    if (!IsUnderPostmaster && addin_request_allowed)
+        total_addin_request = add_size(total_addin_request, size);
+}
+
+
+/*
+ * CreateSharedMemoryAndSemaphores
+ * Creates and initializes shared memory and semaphores.
+ *
+ * This is called by the postmaster or by a standalone backend.
+ * It is also called by a backend forked from the postmaster in the
+ * EXEC_BACKEND case. In the latter case, the shared memory segment
+ * already exists and has been physically attached to, but we have to
+ * initialize pointers in local memory that reference the shared structures,
+ * because we didn't inherit the correct pointer values from the postmaster
+ * as we do in the fork() scenario. The easiest way to do that is to run
+ * through the same code as before. (Note that the called routines mostly
+ * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case.
+ * This is a bit code-wasteful and could be cleaned up.)
+ */
+void
+CreateSharedMemoryAndSemaphores(void)
+{
+ /* Filled in by PGSharedMemoryCreate() below; later passed to dsm_postmaster_startup() */
+ PGShmemHeader *shim = NULL;
+
+ if (!IsUnderPostmaster)
+ {
+ PGShmemHeader *seghdr;
+ Size size;
+ int numSemas;
+
+ /* Compute number of semaphores we'll need */
+ numSemas = ProcGlobalSemas();
+ numSemas += SpinlockSemas();
+
+ /*
+ * Size of the Postgres shared-memory block is estimated via
+ * moderately-accurate estimates for the big hogs, plus 100K for the
+ * stuff that's too small to bother with estimating.
+ *
+ * We take some care during this phase to ensure that the total size
+ * request doesn't overflow size_t. If this gets through, we don't
+ * need to be so careful during the actual allocation phase.
+ */
+ size = 100000;
+ size = add_size(size, PGSemaphoreShmemSize(numSemas));
+ size = add_size(size, SpinlockSemaSize());
+ size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE,
+ sizeof(ShmemIndexEnt)));
+ size = add_size(size, dsm_estimate_size());
+ size = add_size(size, BufferShmemSize());
+ size = add_size(size, LockShmemSize());
+ size = add_size(size, PredicateLockShmemSize());
+ size = add_size(size, ProcGlobalShmemSize());
+ size = add_size(size, XLOGShmemSize());
+ size = add_size(size, CLOGShmemSize());
+ size = add_size(size, CommitTsShmemSize());
+ size = add_size(size, SUBTRANSShmemSize());
+ size = add_size(size, TwoPhaseShmemSize());
+ size = add_size(size, BackgroundWorkerShmemSize());
+ size = add_size(size, MultiXactShmemSize());
+ size = add_size(size, LWLockShmemSize());
+ size = add_size(size, ProcArrayShmemSize());
+ size = add_size(size, BackendStatusShmemSize());
+ size = add_size(size, SInvalShmemSize());
+ size = add_size(size, PMSignalShmemSize());
+ size = add_size(size, ProcSignalShmemSize());
+ size = add_size(size, CheckpointerShmemSize());
+ size = add_size(size, AutoVacuumShmemSize());
+ size = add_size(size, ReplicationSlotsShmemSize());
+ size = add_size(size, ReplicationOriginShmemSize());
+ size = add_size(size, WalSndShmemSize());
+ size = add_size(size, WalRcvShmemSize());
+ size = add_size(size, PgArchShmemSize());
+ size = add_size(size, ApplyLauncherShmemSize());
+ size = add_size(size, SnapMgrShmemSize());
+ size = add_size(size, BTreeShmemSize());
+ size = add_size(size, SyncScanShmemSize());
+ size = add_size(size, AsyncShmemSize());
+#ifdef EXEC_BACKEND
+ size = add_size(size, ShmemBackendArraySize());
+#endif
+
+ /* freeze the addin request size and include it */
+ addin_request_allowed = false;
+ size = add_size(size, total_addin_request);
+
+ /* might as well round it off to a multiple of a typical page size */
+ /* (a size that is already a multiple of 8192 still grows by a full 8192) */
+ size = add_size(size, 8192 - (size % 8192));
+
+ elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size);
+
+ /*
+ * Create the shmem segment
+ */
+ seghdr = PGSharedMemoryCreate(size, &shim);
+
+ InitShmemAccess(seghdr);
+
+ /*
+ * Create semaphores
+ */
+ PGReserveSemaphores(numSemas);
+
+ /*
+ * If spinlocks are disabled, initialize emulation layer (which
+ * depends on semaphores, so the order is important here).
+ */
+#ifndef HAVE_SPINLOCKS
+ SpinlockSemaInit();
+#endif
+ }
+ else
+ {
+ /*
+ * We are reattaching to an existing shared memory segment. This
+ * should only be reached in the EXEC_BACKEND case.
+ */
+#ifndef EXEC_BACKEND
+ elog(PANIC, "should be attached to shared memory already");
+#endif
+ }
+
+ /*
+ * Set up shared memory allocation mechanism
+ */
+ if (!IsUnderPostmaster)
+ InitShmemAllocation();
+
+ /*
+ * Now initialize LWLocks, which do shared memory allocation and are
+ * needed for InitShmemIndex.
+ */
+ CreateLWLocks();
+
+ /*
+ * Set up shmem.c index hashtable
+ */
+ InitShmemIndex();
+
+ dsm_shmem_init();
+
+ /*
+ * Set up xlog, clog, and buffers
+ */
+ XLOGShmemInit();
+ CLOGShmemInit();
+ CommitTsShmemInit();
+ SUBTRANSShmemInit();
+ MultiXactShmemInit();
+ InitBufferPool();
+
+ /*
+ * Set up lock manager
+ */
+ InitLocks();
+
+ /*
+ * Set up predicate lock manager
+ */
+ InitPredicateLocks();
+
+ /*
+ * Set up process table
+ */
+ if (!IsUnderPostmaster)
+ InitProcGlobal();
+ CreateSharedProcArray();
+ CreateSharedBackendStatus();
+ TwoPhaseShmemInit();
+ BackgroundWorkerShmemInit();
+
+ /*
+ * Set up shared-inval messaging
+ */
+ CreateSharedInvalidationState();
+
+ /*
+ * Set up interprocess signaling mechanisms
+ */
+ PMSignalShmemInit();
+ ProcSignalShmemInit();
+ CheckpointerShmemInit();
+ AutoVacuumShmemInit();
+ ReplicationSlotsShmemInit();
+ ReplicationOriginShmemInit();
+ WalSndShmemInit();
+ WalRcvShmemInit();
+ PgArchShmemInit();
+ ApplyLauncherShmemInit();
+
+ /*
+ * Set up other modules that need some shared memory space
+ */
+ SnapMgrInit();
+ BTreeShmemInit();
+ SyncScanShmemInit();
+ AsyncShmemInit();
+
+#ifdef EXEC_BACKEND
+
+ /*
+ * Alloc the win32 shared backend array
+ */
+ if (!IsUnderPostmaster)
+ ShmemBackendArrayAllocation();
+#endif
+
+ /* Initialize dynamic shared memory facilities. */
+ if (!IsUnderPostmaster)
+ dsm_postmaster_startup(shim);
+
+ /*
+ * Now give loadable modules a chance to set up their shmem allocations
+ */
+ if (shmem_startup_hook)
+ shmem_startup_hook();
+}
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
new file mode 100644
index 0000000..3427bcf
--- /dev/null
+++ b/src/backend/storage/ipc/latch.c
@@ -0,0 +1,2158 @@
+/*-------------------------------------------------------------------------
+ *
+ * latch.c
+ * Routines for inter-process latches
+ *
+ * The poll() implementation uses the so-called self-pipe trick to overcome the
+ * race condition involved with poll() and setting a global flag in the signal
+ * handler. When a latch is set and the current process is waiting for it, the
+ * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe.
+ * A signal by itself doesn't interrupt poll() on all platforms, and even on
+ * platforms where it does, a signal that arrives just before the poll() call
+ * does not prevent poll() from entering sleep. An incoming byte on a pipe
+ * however reliably interrupts the sleep, and causes poll() to return
+ * immediately even if the signal arrives before poll() begins.
+ *
+ * The epoll() implementation overcomes the race with a different technique: it
+ * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We
+ * don't need to register a signal handler or create our own self-pipe. We
+ * assume that any system that has Linux epoll() also has Linux signalfd().
+ *
+ * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL.
+ *
+ * The Windows implementation uses Windows events that are inherited by all
+ * postmaster child processes. There's no need for the self-pipe trick there.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/latch.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <limits.h>
+#include <signal.h>
+#include <unistd.h>
+#ifdef HAVE_SYS_EPOLL_H
+#include <sys/epoll.h>
+#endif
+#ifdef HAVE_SYS_EVENT_H
+#include <sys/event.h>
+#endif
+#ifdef HAVE_SYS_SIGNALFD_H
+#include <sys/signalfd.h>
+#endif
+#ifdef HAVE_POLL_H
+#include <poll.h>
+#endif
+
+#include "libpq/pqsignal.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "port/atomics.h"
+#include "portability/instr_time.h"
+#include "postmaster/postmaster.h"
+#include "storage/fd.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+#include "utils/memutils.h"
+
+/*
+ * Select the fd readiness primitive to use. Normally the "most modern"
+ * primitive supported by the OS will be used, but for testing it can be
+ * useful to manually specify the used primitive. If desired, just add a
+ * define somewhere before this block.
+ */
+#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \
+ defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32)
+/* don't overwrite manual choice */
+#elif defined(HAVE_SYS_EPOLL_H)
+#define WAIT_USE_EPOLL
+#elif defined(HAVE_KQUEUE)
+#define WAIT_USE_KQUEUE
+#elif defined(HAVE_POLL)
+#define WAIT_USE_POLL
+#elif WIN32
+#define WAIT_USE_WIN32
+#else
+#error "no wait set implementation available"
+#endif
+
+/*
+ * By default, we use a self-pipe with poll() and a signalfd with epoll(), if
+ * available. We avoid signalfd on illumos for now based on problem reports.
+ * For testing the choice can also be manually specified.
+ */
+#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL)
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+/* don't overwrite manual choice */
+#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \
+ !defined(__illumos__)
+#define WAIT_USE_SIGNALFD
+#else
+#define WAIT_USE_SELF_PIPE
+#endif
+#endif
+
+/* typedef in latch.h */
+struct WaitEventSet
+{
+ int nevents; /* number of registered events */
+ int nevents_space; /* maximum number of events in this set */
+
+ /*
+ * Array, of nevents_space length, storing the definition of events this
+ * set is waiting for.
+ */
+ WaitEvent *events;
+
+ /*
+ * If WL_LATCH_SET is specified in any wait event, latch is a pointer to
+ * said latch, and latch_pos the offset in the ->events array. This is
+ * useful because we check the state of the latch before doing syscalls
+ * related to waiting.
+ */
+ Latch *latch;
+ int latch_pos;
+
+ /*
+ * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag
+ * is set so that we'll exit immediately if postmaster death is detected,
+ * instead of returning.
+ */
+ bool exit_on_postmaster_death;
+
+#if defined(WAIT_USE_EPOLL)
+ int epoll_fd;
+ /* epoll_wait returns events in a user-provided array, allocate once */
+ struct epoll_event *epoll_ret_events;
+#elif defined(WAIT_USE_KQUEUE)
+ int kqueue_fd;
+ /* kevent returns events in a user-provided array, allocate once */
+ struct kevent *kqueue_ret_events;
+ bool report_postmaster_not_running;
+#elif defined(WAIT_USE_POLL)
+ /* poll expects events to be waited on every poll() call, prepare once */
+ struct pollfd *pollfds;
+#elif defined(WAIT_USE_WIN32)
+
+ /*
+ * Array of windows events. The first element always contains
+ * pgwin32_signal_event, so the remaining elements are offset by one (i.e.
+ * event->pos + 1).
+ */
+ HANDLE *handles;
+#endif
+};
+
+/* A common WaitEventSet used to implement WaitLatch() */
+static WaitEventSet *LatchWaitSet;
+
+/* The position of the latch in LatchWaitSet. */
+#define LatchWaitSetLatchPos 0
+
+#ifndef WIN32
+/* Are we currently in WaitLatch? The signal handler would like to know. */
+static volatile sig_atomic_t waiting = false;
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */
+static int signal_fd = -1;
+#endif
+
+#ifdef WAIT_USE_SELF_PIPE
+/* Read and write ends of the self-pipe */
+static int selfpipe_readfd = -1;
+static int selfpipe_writefd = -1;
+
+/* Process owning the self-pipe --- needed for checking purposes */
+static int selfpipe_owner_pid = 0;
+
+/* Private function prototypes */
+static void latch_sigurg_handler(SIGNAL_ARGS);
+static void sendSelfPipeByte(void);
+#endif
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+static void drain(void);
+#endif
+
+#if defined(WAIT_USE_EPOLL)
+static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action);
+#elif defined(WAIT_USE_KQUEUE)
+static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events);
+#elif defined(WAIT_USE_POLL)
+static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event);
+#elif defined(WAIT_USE_WIN32)
+static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event);
+#endif
+
+static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents);
+
+/*
+ * Initialize the process-local latch infrastructure.
+ *
+ * This must be called once during startup of any process that can wait on
+ * latches, before it issues any InitLatch() or OwnLatch() calls.
+ */
+void
+InitializeLatchSupport(void)
+{
+#if defined(WAIT_USE_SELF_PIPE)
+    int         pipefd[2];
+
+    if (IsUnderPostmaster)
+    {
+        /*
+         * We might have inherited connections to a self-pipe created by the
+         * postmaster.  It's critical that child processes create their own
+         * self-pipes, of course, and we really want them to close the
+         * inherited FDs for safety's sake.
+         */
+        if (selfpipe_owner_pid != 0)
+        {
+            /* Assert we go through here but once in a child process */
+            Assert(selfpipe_owner_pid != MyProcPid);
+            /* Release postmaster's pipe FDs; ignore any error */
+            (void) close(selfpipe_readfd);
+            (void) close(selfpipe_writefd);
+            /* Clean up, just for safety's sake; we'll set these below */
+            selfpipe_readfd = selfpipe_writefd = -1;
+            selfpipe_owner_pid = 0;
+            /* Keep fd.c's accounting straight */
+            ReleaseExternalFD();
+            ReleaseExternalFD();
+        }
+        else
+        {
+            /*
+             * Postmaster didn't create a self-pipe ... or else we're in an
+             * EXEC_BACKEND build, in which case it doesn't matter since the
+             * postmaster's pipe FDs were closed by the action of FD_CLOEXEC.
+             * fd.c won't have state to clean up, either.
+             */
+            Assert(selfpipe_readfd == -1);
+        }
+    }
+    else
+    {
+        /* In postmaster or standalone backend, assert we do this but once */
+        Assert(selfpipe_readfd == -1);
+        Assert(selfpipe_owner_pid == 0);
+    }
+
+    /*
+     * Set up the self-pipe that allows a signal handler to wake up the
+     * poll()/epoll_wait() in WaitLatch.  Make the write-end non-blocking, so
+     * that SetLatch won't block if the event has already been set many times
+     * filling the kernel buffer.  Make the read-end non-blocking too, so that
+     * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK.
+     * Also, make both FDs close-on-exec, since we surely do not want any
+     * child processes messing with them.
+     */
+    if (pipe(pipefd) < 0)
+        elog(FATAL, "pipe() failed: %m");
+    if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1)
+        elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m");
+    if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1)
+        elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m");
+    if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1)
+        elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m");
+    if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1)
+        elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m");
+
+    selfpipe_readfd = pipefd[0];
+    selfpipe_writefd = pipefd[1];
+    selfpipe_owner_pid = MyProcPid;
+
+    /* Tell fd.c about these two long-lived FDs */
+    ReserveExternalFD();
+    ReserveExternalFD();
+
+    pqsignal(SIGURG, latch_sigurg_handler);
+#endif
+
+#ifdef WAIT_USE_SIGNALFD
+    sigset_t    signalfd_mask;
+
+    /* Block SIGURG, because we'll receive it through a signalfd. */
+    sigaddset(&UnBlockSig, SIGURG);
+
+    /* Set up the signalfd to receive SIGURG notifications. */
+    sigemptyset(&signalfd_mask);
+    sigaddset(&signalfd_mask, SIGURG);
+    signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC);
+    if (signal_fd < 0)
+    {
+        /* Report errno, as every other syscall failure in this function does */
+        elog(FATAL, "signalfd() failed: %m");
+    }
+    /* Tell fd.c about this long-lived FD */
+    ReserveExternalFD();
+#endif
+
+#ifdef WAIT_USE_KQUEUE
+    /* Ignore SIGURG, because we'll receive it via kqueue. */
+    pqsignal(SIGURG, SIG_IGN);
+#endif
+}
+
+void
+InitializeLatchWaitSet(void)
+{
+    int         pos PG_USED_FOR_ASSERTS_ONLY;
+
+    /* The shared wait set must not have been built already. */
+    Assert(LatchWaitSet == NULL);
+
+    /*
+     * Build the WaitEventSet that WaitLatch() reuses: room for two events,
+     * with MyLatch registered first so that it lands at the expected slot.
+     */
+    LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2);
+    pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET,
+                            MyLatch, NULL);
+    Assert(pos == LatchWaitSetLatchPos);
+
+    /* Under the postmaster, also register the exit-on-postmaster-death event. */
+    if (IsUnderPostmaster)
+        AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH,
+                          PGINVALID_SOCKET, NULL, NULL);
+}
+
+/*
+ * Tear down what InitializeLatchSupport() and InitializeLatchWaitSet() set
+ * up in this process.
+ *
+ * Frees the WaitLatch() wait set, closes the wakeup descriptors, returns
+ * their reservations to fd.c, and restores the "not initialized" values of
+ * the file-level state so that InitializeLatchSupport() sees a clean slate.
+ */
+void
+ShutdownLatchSupport(void)
+{
+#if defined(WAIT_USE_SELF_PIPE)
+    /*
+     * latch_sigurg_handler is installed whenever the self-pipe is in use
+     * (not only with poll()), so detach it under the same condition, before
+     * the pipe it writes to goes away.
+     */
+    pqsignal(SIGURG, SIG_IGN);
+#endif
+
+    if (LatchWaitSet)
+    {
+        FreeWaitEventSet(LatchWaitSet);
+        LatchWaitSet = NULL;
+    }
+
+#if defined(WAIT_USE_SELF_PIPE)
+    if (selfpipe_readfd >= 0)
+    {
+        (void) close(selfpipe_readfd);
+        (void) close(selfpipe_writefd);
+        /* Balance the two ReserveExternalFD() calls made at initialization */
+        ReleaseExternalFD();
+        ReleaseExternalFD();
+    }
+    selfpipe_readfd = -1;
+    selfpipe_writefd = -1;
+    /* 0, not InvalidPid: 0 is the "no owner" value InitializeLatchSupport tests */
+    selfpipe_owner_pid = 0;
+#endif
+
+#if defined(WAIT_USE_SIGNALFD)
+    if (signal_fd >= 0)
+    {
+        (void) close(signal_fd);
+        /* Balance the ReserveExternalFD() call made at initialization */
+        ReleaseExternalFD();
+    }
+    signal_fd = -1;
+#endif
+}
+
+/*
+ * Initialize a process-local latch.
+ */
+void
+InitLatch(Latch *latch)
+{
+ /* Start out unset, with no sleeper, owned by this process, and not shared */
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = MyProcPid;
+ latch->is_shared = false;
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#elif defined(WAIT_USE_WIN32)
+ /* Windows: create the latch's event object with default (NULL) security attributes */
+ latch->event = CreateEvent(NULL, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif /* WIN32 */
+}
+
+/*
+ * Initialize a shared latch that can be set from other processes. The latch
+ * is initially owned by no-one; use OwnLatch to associate it with the
+ * current process.
+ *
+ * InitSharedLatch needs to be called in postmaster before forking child
+ * processes, usually right after allocating the shared memory block
+ * containing the latch with ShmemInitStruct. (The Unix implementation
+ * doesn't actually require that, but the Windows one does.) Because of
+ * this restriction, we have no concurrency issues to worry about here.
+ *
+ * Note that other handles created in this module are never marked as
+ * inheritable. Thus we do not need to worry about cleaning up child
+ * process references to postmaster-private latches or WaitEventSets.
+ */
+void
+InitSharedLatch(Latch *latch)
+{
+#ifdef WIN32
+ SECURITY_ATTRIBUTES sa;
+
+ /*
+ * Set up security attributes to specify that the events are inherited.
+ */
+ ZeroMemory(&sa, sizeof(sa));
+ sa.nLength = sizeof(sa);
+ sa.bInheritHandle = TRUE;
+
+ /* Create the event with those attributes so child processes inherit it */
+ latch->event = CreateEvent(&sa, TRUE, FALSE, NULL);
+ if (latch->event == NULL)
+ elog(ERROR, "CreateEvent failed: error code %lu", GetLastError());
+#endif
+
+ /* Start out unset, with no sleeper and no owner (owner_pid 0), marked shared */
+ latch->is_set = false;
+ latch->maybe_sleeping = false;
+ latch->owner_pid = 0;
+ latch->is_shared = true;
+}
+
+/*
+ * Associate a shared latch with the current process, allowing it to
+ * wait on the latch.
+ *
+ * Although there is a sanity check for latch-already-owned, we don't do
+ * any sort of locking here, meaning that we could fail to detect the error
+ * if two processes try to own the same latch at about the same time. If
+ * there is any risk of that, caller must provide an interlock to prevent it.
+ */
+void
+OwnLatch(Latch *latch)
+{
+ /* Sanity checks */
+ Assert(latch->is_shared);
+
+#if defined(WAIT_USE_SELF_PIPE)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid);
+#elif defined(WAIT_USE_SIGNALFD)
+ /* Assert InitializeLatchSupport has been called in this process */
+ Assert(signal_fd >= 0);
+#endif
+
+ /* A nonzero owner_pid means some process already owns the latch */
+ if (latch->owner_pid != 0)
+ elog(ERROR, "latch already owned");
+
+ /* Record this process as the owner */
+ latch->owner_pid = MyProcPid;
+}
+
+/*
+ * Disown a shared latch currently owned by the current process.
+ */
+void
+DisownLatch(Latch *latch)
+{
+ /* Only a shared latch owned by this very process may be disowned */
+ Assert(latch->is_shared);
+ Assert(latch->owner_pid == MyProcPid);
+
+ /* owner_pid 0 marks the latch as unowned, so OwnLatch can claim it again */
+ latch->owner_pid = 0;
+}
+
+/*
+ * Wait for a given latch to be set, or for postmaster death, or until timeout
+ * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events
+ * to wait for. If the latch is already set (and WL_LATCH_SET is given), the
+ * function returns immediately.
+ *
+ * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag
+ * is given. Although it is declared as "long", we don't actually support
+ * timeouts longer than INT_MAX milliseconds. Note that some extra overhead
+ * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible.
+ *
+ * The latch must be owned by the current process, ie. it must be a
+ * process-local latch initialized with InitLatch, or a shared latch
+ * associated with the current process by calling OwnLatch.
+ *
+ * Returns bit mask indicating which condition(s) caused the wake-up. Note
+ * that if multiple wake-up conditions are true, there is no guarantee that
+ * we return all of them in one call, but we will return at least one.
+ */
+int
+WaitLatch(Latch *latch, int wakeEvents, long timeout,
+          uint32 wait_event_info)
+{
+    WaitEvent   occurred;
+    long        wait_timeout;
+    int         nready;
+
+    /* Postmaster-managed callers must handle postmaster death somehow. */
+    Assert(!IsUnderPostmaster ||
+           (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)));
+
+    /*
+     * Re-point the shared wait set at this call's latch (or at no latch when
+     * WL_LATCH_SET was not requested) and refresh its postmaster-death
+     * policy.  Both are cheap, so they are simply redone on every call.
+     */
+    ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET,
+                    (wakeEvents & WL_LATCH_SET) ? latch : NULL);
+    LatchWaitSet->exit_on_postmaster_death =
+        (wakeEvents & WL_EXIT_ON_PM_DEATH) ? true : false;
+
+    /* Without WL_TIMEOUT, pass -1 rather than the caller's timeout value. */
+    wait_timeout = (wakeEvents & WL_TIMEOUT) ? timeout : -1;
+
+    nready = WaitEventSetWait(LatchWaitSet, wait_timeout,
+                              &occurred, 1, wait_event_info);
+
+    /* No event reported means the wait timed out. */
+    if (nready == 0)
+        return WL_TIMEOUT;
+
+    return occurred.events;
+}
+
+/*
+ * Like WaitLatch, but with an extra socket argument for WL_SOCKET_*
+ * conditions.
+ *
+ * When waiting on a socket, EOF and error conditions always cause the socket
+ * to be reported as readable/writable/connected, so that the caller can deal
+ * with the condition.
+ *
+ * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit
+ * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the
+ * return value if the postmaster dies. The latter is useful for rare cases
+ * where some behavior other than immediate exit is needed.
+ *
+ * NB: These days this is just a wrapper around the WaitEventSet API. When
+ * using a latch very frequently, consider creating a longer living
+ * WaitEventSet instead; that's more efficient.
+ */
+int
+WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
+                  long timeout, uint32 wait_event_info)
+{
+    WaitEventSet *set;
+    WaitEvent   occurred;
+    int         result;
+
+    /* A throwaway wait set with room for latch, postmaster and socket. */
+    set = CreateWaitEventSet(CurrentMemoryContext, 3);
+
+    /* Without WL_TIMEOUT, pass -1 rather than the caller's timeout value. */
+    if (!(wakeEvents & WL_TIMEOUT))
+        timeout = -1;
+    else
+        Assert(timeout >= 0);
+
+    if (wakeEvents & WL_LATCH_SET)
+        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+                          latch, NULL);
+
+    /* Postmaster-managed callers must handle postmaster death somehow. */
+    Assert(!IsUnderPostmaster ||
+           (wakeEvents & (WL_EXIT_ON_PM_DEATH | WL_POSTMASTER_DEATH)));
+
+    /* Postmaster-death events are only registered under a postmaster. */
+    if (IsUnderPostmaster)
+    {
+        if (wakeEvents & WL_POSTMASTER_DEATH)
+            AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+                              NULL, NULL);
+
+        if (wakeEvents & WL_EXIT_ON_PM_DEATH)
+            AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+                              NULL, NULL);
+    }
+
+    /* Register whichever socket conditions the caller asked for. */
+    if (wakeEvents & WL_SOCKET_MASK)
+        AddWaitEventToSet(set, wakeEvents & WL_SOCKET_MASK, sock, NULL, NULL);
+
+    /* Wait for a single event; zero events reported means timeout. */
+    if (WaitEventSetWait(set, timeout, &occurred, 1, wait_event_info) == 0)
+        result = WL_TIMEOUT;
+    else
+        result = occurred.events & (WL_LATCH_SET |
+                                    WL_POSTMASTER_DEATH |
+                                    WL_SOCKET_MASK);
+
+    FreeWaitEventSet(set);
+
+    return result;
+}
+
+/*
+ * Sets a latch and wakes up anyone waiting on it.
+ *
+ * This is cheap if the latch is already set, otherwise not so much.
+ *
+ * NB: when calling this in a signal handler, be sure to save and restore
+ * errno around it. (That's standard practice in most signal handlers, of
+ * course, but we used to omit it in handlers that only set a flag.)
+ *
+ * NB: this function is called from critical sections and signal handlers so
+ * throwing an error is not a good idea.
+ */
+void
+SetLatch(Latch *latch)
+{
+#ifndef WIN32
+ pid_t owner_pid;
+#else
+ HANDLE handle;
+#endif
+
+ /*
+ * The memory barrier has to be placed here to ensure that any flag
+ * variables possibly changed by this process have been flushed to main
+ * memory, before we check/set is_set.
+ */
+ pg_memory_barrier();
+
+ /* Quick exit if already set */
+ if (latch->is_set)
+ return;
+
+ latch->is_set = true;
+
+ /*
+ * The barrier separates the is_set store above from the maybe_sleeping
+ * read below. If maybe_sleeping is not set, no wakeup is sent at all.
+ */
+ pg_memory_barrier();
+ if (!latch->maybe_sleeping)
+ return;
+
+#ifndef WIN32
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+ * we're in a signal handler. We use the self-pipe or SIGURG to ourselves
+ * to wake up WaitEventSetWaitBlock() without races in that case. If it's
+ * another process, send a signal.
+ *
+ * Fetch owner_pid only once, in case the latch is concurrently getting
+ * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't
+ * guaranteed to be true! In practice, the effective range of pid_t fits
+ * in a 32 bit integer, and so should be atomic. In the worst case, we
+ * might end up signaling the wrong process. Even then, you're very
+ * unlucky if a process with that bogus pid exists and belongs to
+ * Postgres; and PG database processes should handle excess SIGURG
+ * interrupts without a problem anyhow.
+ *
+ * Another sort of race condition that's possible here is for a new
+ * process to own the latch immediately after we look, so we don't signal
+ * it. This is okay so long as all callers of ResetLatch/WaitLatch follow
+ * the standard coding convention of waiting at the bottom of their loops,
+ * not the top, so that they'll correctly process latch-setting events
+ * that happen before they enter the loop.
+ */
+ owner_pid = latch->owner_pid;
+ if (owner_pid == 0)
+ return;
+ else if (owner_pid == MyProcPid)
+ {
+ /* We own the latch: a wakeup is only sent while the "waiting" flag is set */
+#if defined(WAIT_USE_SELF_PIPE)
+ if (waiting)
+ sendSelfPipeByte();
+#else
+ if (waiting)
+ kill(MyProcPid, SIGURG);
+#endif
+ }
+ else
+ kill(owner_pid, SIGURG);
+
+#else
+
+ /*
+ * See if anyone's waiting for the latch. It can be the current process if
+ * we're in a signal handler.
+ *
+ * Use a local variable here just in case somebody changes the event field
+ * concurrently (which really should not happen).
+ */
+ handle = latch->event;
+ if (handle)
+ {
+ SetEvent(handle);
+
+ /*
+ * Note that we silently ignore any errors. We might be in a signal
+ * handler or other critical path where it's not safe to call elog().
+ */
+ }
+#endif
+
+}
+
+/*
+ * Clear the latch. Calling WaitLatch after this will sleep, unless
+ * the latch is set again before the WaitLatch call.
+ */
+void
+ResetLatch(Latch *latch)
+{
+ /* Only the owner should reset the latch */
+ Assert(latch->owner_pid == MyProcPid);
+ /* ... and maybe_sleeping is expected to be clear whenever we get here */
+ Assert(latch->maybe_sleeping == false);
+
+ latch->is_set = false;
+
+ /*
+ * Ensure that the write to is_set gets flushed to main memory before we
+ * examine any flag variables. Otherwise a concurrent SetLatch might
+ * falsely conclude that it needn't signal us, even though we have missed
+ * seeing some flag updates that SetLatch was supposed to inform us of.
+ */
+ pg_memory_barrier();
+}
+
+/*
+ * Create a WaitEventSet with space for nevents different events to wait for.
+ *
+ * These events can then be efficiently waited upon together, using
+ * WaitEventSetWait().
+ */
+WaitEventSet *
+CreateWaitEventSet(MemoryContext context, int nevents)
+{
+    WaitEventSet *set;
+    char       *data;
+    Size        sz = 0;
+
+    /*
+     * The set header, the event definitions and the implementation-specific
+     * array are carved out of one zeroed allocation made in "context".
+     *
+     * Use MAXALIGN size/alignment to guarantee that later uses of memory are
+     * aligned correctly. E.g. epoll_event might need 8 byte alignment on some
+     * platforms, but earlier allocations like WaitEventSet and WaitEvent
+     * might not be sized to guarantee that when purely using sizeof().
+     */
+    sz += MAXALIGN(sizeof(WaitEventSet));
+    sz += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+    sz += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+    sz += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+    sz += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+    /* need space for the pgwin32_signal_event */
+    sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
+#endif
+
+    data = (char *) MemoryContextAllocZero(context, sz);
+
+    /* Hand out the sub-arrays in the same order they were sized above. */
+    set = (WaitEventSet *) data;
+    data += MAXALIGN(sizeof(WaitEventSet));
+
+    set->events = (WaitEvent *) data;
+    data += MAXALIGN(sizeof(WaitEvent) * nevents);
+
+#if defined(WAIT_USE_EPOLL)
+    set->epoll_ret_events = (struct epoll_event *) data;
+    data += MAXALIGN(sizeof(struct epoll_event) * nevents);
+#elif defined(WAIT_USE_KQUEUE)
+    set->kqueue_ret_events = (struct kevent *) data;
+    data += MAXALIGN(sizeof(struct kevent) * nevents);
+#elif defined(WAIT_USE_POLL)
+    set->pollfds = (struct pollfd *) data;
+    data += MAXALIGN(sizeof(struct pollfd) * nevents);
+#elif defined(WAIT_USE_WIN32)
+    /*
+     * handles is an array of HANDLE, so cast to HANDLE * and step over all
+     * nevents + 1 slots that were reserved for it above.
+     */
+    set->handles = (HANDLE *) data;
+    data += MAXALIGN(sizeof(HANDLE) * (nevents + 1));
+#endif
+
+    set->latch = NULL;
+    set->nevents_space = nevents;
+    set->exit_on_postmaster_death = false;
+
+#if defined(WAIT_USE_EPOLL)
+    if (!AcquireExternalFD())
+    {
+        /* treat this as though epoll_create1 itself returned EMFILE */
+        elog(ERROR, "epoll_create1 failed: %m");
+    }
+    set->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
+    if (set->epoll_fd < 0)
+    {
+        ReleaseExternalFD();
+        elog(ERROR, "epoll_create1 failed: %m");
+    }
+#elif defined(WAIT_USE_KQUEUE)
+    if (!AcquireExternalFD())
+    {
+        /* treat this as though kqueue itself returned EMFILE */
+        elog(ERROR, "kqueue failed: %m");
+    }
+    set->kqueue_fd = kqueue();
+    if (set->kqueue_fd < 0)
+    {
+        ReleaseExternalFD();
+        elog(ERROR, "kqueue failed: %m");
+    }
+    if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1)
+    {
+        /* preserve fcntl's errno across the cleanup calls for %m */
+        int         save_errno = errno;
+
+        close(set->kqueue_fd);
+        ReleaseExternalFD();
+        errno = save_errno;
+        elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m");
+    }
+    set->report_postmaster_not_running = false;
+#elif defined(WAIT_USE_WIN32)
+
+    /*
+     * To handle signals while waiting, we need to add a win32 specific event.
+     * We accounted for the additional event at the top of this routine. See
+     * port/win32/signal.c for more details.
+     *
+     * Note: pgwin32_signal_event should be first to ensure that it will be
+     * reported when multiple events are set.  We want to guarantee that
+     * pending signals are serviced.
+     */
+    set->handles[0] = pgwin32_signal_event;
+    StaticAssertStmt(WSA_INVALID_EVENT == NULL, "");
+#endif
+
+    return set;
+}
+
+/*
+ * Release a WaitEventSet obtained from CreateWaitEventSet().
+ *
+ * Note: ideally nothing released here could be inherited across an exec(),
+ * because such resources would be leaked in many scenarios.  The epoll
+ * descriptor is created with EPOLL_CLOEXEC for that reason, and on Windows
+ * the handles involved are assumed to be non-inheritable.
+ */
+void
+FreeWaitEventSet(WaitEventSet *set)
+{
+#if defined(WAIT_USE_EPOLL)
+	close(set->epoll_fd);
+	ReleaseExternalFD();
+#elif defined(WAIT_USE_KQUEUE)
+	close(set->kqueue_fd);
+	ReleaseExternalFD();
+#elif defined(WAIT_USE_WIN32)
+	int			i;
+
+	for (i = 0; i < set->nevents; i++)
+	{
+		WaitEvent  *ev = &set->events[i];
+
+		/*
+		 * Latch entries use the latch's HANDLE and postmaster-death entries
+		 * use PostmasterHandle, so there is nothing to clean up for them.
+		 */
+		if (ev->events & (WL_LATCH_SET | WL_POSTMASTER_DEATH))
+			continue;
+
+		/* Clean up the event object we created for the socket */
+		WSAEventSelect(ev->fd, NULL, 0);
+		WSACloseEvent(set->handles[ev->pos + 1]);
+	}
+#endif
+
+	pfree(set);
+}
+
+/* ---
+ * Add an event to the set. Possible events are:
+ * - WL_LATCH_SET: Wait for the latch to be set
+ * - WL_POSTMASTER_DEATH: Wait for postmaster to die
+ * - WL_SOCKET_READABLE: Wait for socket to become readable,
+ * can be combined in one event with other WL_SOCKET_* events
+ * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable,
+ * can be combined with other WL_SOCKET_* events
+ * - WL_SOCKET_CONNECTED: Wait for socket connection to be established,
+ * can be combined with other WL_SOCKET_* events (on non-Windows
+ * platforms, this is the same as WL_SOCKET_WRITEABLE)
+ * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies
+ *
+ * Returns the offset in WaitEventSet->events (starting from 0), which can be
+ * used to modify previously added wait events using ModifyWaitEvent().
+ *
+ * In the WL_LATCH_SET case the latch must be owned by the current process,
+ * i.e. it must be a process-local latch initialized with InitLatch, or a
+ * shared latch associated with the current process by calling OwnLatch.
+ *
+ * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error
+ * conditions cause the socket to be reported as readable/writable/connected,
+ * so that the caller can deal with the condition.
+ *
+ * The user_data pointer specified here will be set for the events returned
+ * by WaitEventSetWait(), allowing to easily associate additional data with
+ * events.
+ *
+ * The set must still have a free slot: exceeding the capacity requested from
+ * CreateWaitEventSet() is caught only by an assertion. Passing exactly
+ * WL_EXIT_ON_PM_DEATH registers a WL_POSTMASTER_DEATH event and turns on the
+ * set's exit_on_postmaster_death flag. A latch owned by another process, a
+ * second latch, a latch combined with a non-latch mask, WL_LATCH_SET without
+ * a latch, and a socket event without a socket are all rejected with
+ * elog(ERROR).
+ */
+int
+AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch,
+ void *user_data)
+{
+ WaitEvent *event;
+
+ /* not enough space */
+ Assert(set->nevents < set->nevents_space);
+
+ if (events == WL_EXIT_ON_PM_DEATH)
+ {
+ events = WL_POSTMASTER_DEATH;
+ set->exit_on_postmaster_death = true;
+ }
+
+ if (latch)
+ {
+ if (latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ if (set->latch)
+ elog(ERROR, "cannot wait on more than one latch");
+ if ((events & WL_LATCH_SET) != WL_LATCH_SET)
+ elog(ERROR, "latch events only support being set");
+ }
+ else
+ {
+ if (events & WL_LATCH_SET)
+ elog(ERROR, "cannot wait on latch without a specified latch");
+ }
+
+ /* waiting for socket readiness without a socket indicates a bug */
+ if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK))
+ elog(ERROR, "cannot wait on socket event without a socket");
+
+ event = &set->events[set->nevents]; /* claim the next free slot */
+ event->pos = set->nevents++;
+ event->fd = fd;
+ event->events = events;
+ event->user_data = user_data;
+#ifdef WIN32
+ event->reset = false;
+#endif
+
+ if (events == WL_LATCH_SET)
+ {
+ set->latch = latch;
+ set->latch_pos = event->pos;
+#if defined(WAIT_USE_SELF_PIPE)
+ event->fd = selfpipe_readfd;
+#elif defined(WAIT_USE_SIGNALFD)
+ event->fd = signal_fd;
+#else
+ event->fd = PGINVALID_SOCKET;
+#ifdef WAIT_USE_EPOLL
+ return event->pos; /* no descriptor to register with epoll */
+#endif
+#endif
+ }
+ else if (events == WL_POSTMASTER_DEATH)
+ {
+#ifndef WIN32
+ event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH];
+#endif
+ }
+
+ /* perform wait primitive specific initialization, if needed */
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, 0); /* 0: no filters registered yet */
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+
+ return event->pos;
+}
+
+/*
+ * Change the event mask and, in the WL_LATCH_SET case, the latch associated
+ * with the WaitEvent. The latch may be changed to NULL to disable the latch
+ * temporarily, and then set back to a latch later.
+ *
+ * 'pos' is the id returned by AddWaitEventToSet.
+ *
+ * Nothing is done when neither the mask nor the latch changes. Changing the
+ * mask of a latch event, or modifying a postmaster death event at all, is
+ * rejected with elog(ERROR), as is a latch owned by another process. 'pos'
+ * is range-checked only by an assertion.
+ */
+void
+ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch)
+{
+ WaitEvent *event;
+#if defined(WAIT_USE_KQUEUE)
+ int old_events; /* mask before this call, needed to diff kqueue filters */
+#endif
+
+ Assert(pos < set->nevents);
+
+ event = &set->events[pos];
+#if defined(WAIT_USE_KQUEUE)
+ old_events = event->events;
+#endif
+
+ /*
+ * If neither the event mask nor the associated latch changes, return
+ * early. That's an important optimization for some sockets, where
+ * ModifyWaitEvent is frequently used to switch from waiting for reads to
+ * waiting on writes.
+ */
+ if (events == event->events &&
+ (!(event->events & WL_LATCH_SET) || set->latch == latch))
+ return;
+
+ if (event->events & WL_LATCH_SET &&
+ events != event->events)
+ {
+ elog(ERROR, "cannot modify latch event");
+ }
+
+ if (event->events & WL_POSTMASTER_DEATH)
+ {
+ elog(ERROR, "cannot modify postmaster death event");
+ }
+
+ /* FIXME: validate event mask */
+ event->events = events;
+
+ if (events == WL_LATCH_SET)
+ {
+ if (latch && latch->owner_pid != MyProcPid)
+ elog(ERROR, "cannot wait on a latch owned by another process");
+ set->latch = latch;
+
+ /*
+ * On Unix, we don't need to modify the kernel object because the
+ * underlying pipe (if there is one) is the same for all latches so we
+ * can return immediately. On Windows, we need to update our array of
+ * handles, but we leave the old one in place and tolerate spurious
+ * wakeups if the latch is disabled.
+ */
+#if defined(WAIT_USE_WIN32)
+ if (!latch)
+ return;
+#else
+ return;
+#endif
+ }
+
+ /* push the new mask (or, on Windows, the new latch handle) to the wait primitive */
+#if defined(WAIT_USE_EPOLL)
+ WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD);
+#elif defined(WAIT_USE_KQUEUE)
+ WaitEventAdjustKqueue(set, event, old_events);
+#elif defined(WAIT_USE_POLL)
+ WaitEventAdjustPoll(set, event);
+#elif defined(WAIT_USE_WIN32)
+ WaitEventAdjustWin32(set, event);
+#endif
+}
+
+#if defined(WAIT_USE_EPOLL)
+/*
+ * Apply 'action' for 'event' to the set's epoll instance, translating the
+ * WL_* mask into the matching EPOLL* interest bits.
+ *
+ * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL
+ */
+static void
+WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action)
+{
+	struct epoll_event ev;
+
+	/* errors and hangups are always of interest */
+	ev.events = EPOLLERR | EPOLLHUP;
+	/* epoll_wait hands this pointer back, identifying our event */
+	ev.data.ptr = event;
+
+	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+
+	if (event->events == WL_LATCH_SET ||
+		event->events == WL_POSTMASTER_DEATH)
+	{
+		/* both are watched for readability of their descriptor */
+		ev.events |= EPOLLIN;
+	}
+	else
+	{
+		Assert(event->fd != PGINVALID_SOCKET);
+		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+
+		if (event->events & WL_SOCKET_READABLE)
+			ev.events |= EPOLLIN;
+		if (event->events & WL_SOCKET_WRITEABLE)
+			ev.events |= EPOLLOUT;
+	}
+
+	/*
+	 * The epoll_event is handed over even for EPOLL_CTL_DEL, where it is
+	 * unused: an old epoll bug required that, and it keeps the code simple.
+	 */
+	if (epoll_ctl(set->epoll_fd, action, event->fd, &ev) < 0)
+		ereport(ERROR,
+				(errcode_for_socket_access(),
+				 errmsg("%s() failed: %m",
+						"epoll_ctl")));
+}
+#endif
+
+#if defined(WAIT_USE_POLL)
+/*
+ * Fill in the pollfd slot belonging to 'event' (same index as event->pos),
+ * translating the WL_* mask into POLLIN/POLLOUT.
+ */
+static void
+WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event)
+{
+	struct pollfd *slot = &set->pollfds[event->pos];
+
+	slot->fd = event->fd;
+	slot->revents = 0;
+	slot->events = 0;
+
+	Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+
+	if (event->events == WL_LATCH_SET ||
+		event->events == WL_POSTMASTER_DEATH)
+	{
+		/* both are watched for readability of their descriptor */
+		slot->events = POLLIN;
+	}
+	else
+	{
+		Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE));
+
+		if (event->events & WL_SOCKET_READABLE)
+			slot->events |= POLLIN;
+		if (event->events & WL_SOCKET_WRITEABLE)
+			slot->events |= POLLOUT;
+	}
+
+	Assert(event->fd != PGINVALID_SOCKET);
+}
+#endif
+
+#if defined(WAIT_USE_KQUEUE)
+
+/*
+ * On most BSD family systems, the udata member of struct kevent is of type
+ * void *, so we could directly convert to/from WaitEvent *. Unfortunately,
+ * NetBSD has it as intptr_t, so here we wallpaper over that difference with
+ * an lvalue cast.
+ */
+#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata)))
+
+/*
+ * Fill in *k_ev so that it applies 'action' to 'filter' on the event's
+ * descriptor, with the WaitEvent attached as user data.
+ */
+static inline void
+WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action,
+						 WaitEvent *event)
+{
+	AccessWaitEvent(k_ev) = event;
+	k_ev->data = 0;
+	k_ev->fflags = 0;
+	k_ev->flags = action;
+	k_ev->filter = filter;
+	k_ev->ident = event->fd;
+}
+
+/*
+ * Fill in *k_ev to watch the postmaster process for exit.
+ */
+static inline void
+WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event)
+{
+	/* For now postmaster death can only be added, not removed. */
+	WaitEventAdjustKqueueAdd(k_ev, EVFILT_PROC, EV_ADD, event);
+
+	/* a process is watched here, not the event's descriptor */
+	k_ev->ident = PostmasterPid;
+	k_ev->fflags = NOTE_EXIT;
+}
+
+/*
+ * Fill in *k_ev to watch for delivery of SIGURG, which is how a latch wakeup
+ * is detected with kqueue.
+ */
+static inline void
+WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event)
+{
+	/* For now latch can only be added, not removed. */
+	WaitEventAdjustKqueueAdd(k_ev, EVFILT_SIGNAL, EV_ADD, event);
+
+	/* a signal number is watched here, not the event's descriptor */
+	k_ev->ident = SIGURG;
+}
+
+/*
+ * Bring the set's kqueue registration for 'event' in line with event->events.
+ *
+ * old_events is the previous event mask, used to compute what has changed.
+ * Pass 0 when the event is being added for the first time. Nothing is done
+ * if the mask is unchanged.
+ *
+ * If the postmaster turns out to be gone already while its pid is being
+ * registered, set->report_postmaster_not_running is set instead of raising
+ * an error; any other kevent() failure is reported with ereport(ERROR).
+ */
+static void
+WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events)
+{
+ int rc;
+ struct kevent k_ev[2]; /* at most one read and one write change */
+ int count = 0;
+ bool new_filt_read = false;
+ bool old_filt_read = false;
+ bool new_filt_write = false;
+ bool old_filt_write = false;
+
+ if (old_events == event->events)
+ return;
+
+ Assert(event->events != WL_LATCH_SET || set->latch != NULL);
+ Assert(event->events == WL_LATCH_SET ||
+ event->events == WL_POSTMASTER_DEATH ||
+ (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)));
+
+ if (event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Unlike all the other implementations, we detect postmaster death
+ * using process notification instead of waiting on the postmaster
+ * alive pipe.
+ */
+ WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event);
+ }
+ else if (event->events == WL_LATCH_SET)
+ {
+ /* We detect latch wakeup using a signal event. */
+ WaitEventAdjustKqueueAddLatch(&k_ev[count++], event);
+ }
+ else
+ {
+ /*
+ * We need to compute the adds and deletes required to get from the
+ * old event mask to the new event mask, since kevent treats readable
+ * and writable as separate events.
+ */
+ if (old_events & WL_SOCKET_READABLE)
+ old_filt_read = true;
+ if (event->events & WL_SOCKET_READABLE)
+ new_filt_read = true;
+ if (old_events & WL_SOCKET_WRITEABLE)
+ old_filt_write = true;
+ if (event->events & WL_SOCKET_WRITEABLE)
+ new_filt_write = true;
+ if (old_filt_read && !new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE,
+ event);
+ else if (!old_filt_read && new_filt_read)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD,
+ event);
+ if (old_filt_write && !new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE,
+ event);
+ else if (!old_filt_write && new_filt_write)
+ WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD,
+ event);
+ }
+
+ Assert(count > 0);
+ Assert(count <= 2);
+
+ rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL); /* apply changes only, fetch no events */
+
+ /*
+ * When adding the postmaster's pid, we have to consider that it might
+ * already have exited and perhaps even been replaced by another process
+ * with the same pid. If so, we have to defer reporting this as an event
+ * until the next call to WaitEventSetWaitBlock().
+ */
+
+ if (rc < 0)
+ {
+ if (event->events == WL_POSTMASTER_DEATH &&
+ (errno == ESRCH || errno == EACCES))
+ set->report_postmaster_not_running = true;
+ else
+ ereport(ERROR,
+ (errcode_for_socket_access(),
+ errmsg("%s() failed: %m",
+ "kevent")));
+ }
+ else if (event->events == WL_POSTMASTER_DEATH &&
+ PostmasterPid != getppid() &&
+ !PostmasterIsAlive())
+ {
+ /*
+ * The extra PostmasterIsAlive() check prevents false alarms on
+ * systems that give a different value for getppid() while being
+ * traced by a debugger.
+ */
+ set->report_postmaster_not_running = true;
+ }
+}
+
+#endif
+
+#if defined(WAIT_USE_WIN32)
+/*
+ * Set up the HANDLE slot belonging to 'event'.  Slot 0 of set->handles is
+ * taken by the signal event, so the event at position pos uses slot pos + 1.
+ */
+static void
+WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event)
+{
+	HANDLE	   *slot = &set->handles[event->pos + 1];
+	int			netflags;
+
+	if (event->events == WL_LATCH_SET)
+	{
+		Assert(set->latch != NULL);
+		*slot = set->latch->event;
+		return;
+	}
+
+	if (event->events == WL_POSTMASTER_DEATH)
+	{
+		*slot = PostmasterHandle;
+		return;
+	}
+
+	/* Socket event: always check for errors/EOF */
+	netflags = FD_CLOSE;
+	if (event->events & WL_SOCKET_READABLE)
+		netflags |= FD_READ;
+	if (event->events & WL_SOCKET_WRITEABLE)
+		netflags |= FD_WRITE;
+	if (event->events & WL_SOCKET_CONNECTED)
+		netflags |= FD_CONNECT;
+
+	/* create the event object on first use, then reuse it */
+	if (*slot == WSA_INVALID_EVENT)
+	{
+		*slot = WSACreateEvent();
+		if (*slot == WSA_INVALID_EVENT)
+			elog(ERROR, "failed to create event for socket: error code %d",
+				 WSAGetLastError());
+	}
+
+	if (WSAEventSelect(event->fd, *slot, netflags) != 0)
+		elog(ERROR, "failed to set up event for socket: error code %d",
+			 WSAGetLastError());
+
+	Assert(event->fd != PGINVALID_SOCKET);
+}
+#endif
+
+/*
+ * Wait for events added to the set to happen, or until the timeout is
+ * reached. At most nevents occurred events are returned.
+ *
+ * If timeout = -1, block until an event occurs; if 0, check sockets for
+ * readiness, but don't block; if > 0, block for at most timeout milliseconds.
+ *
+ * Returns the number of events occurred, or 0 if the timeout was reached.
+ *
+ * Returned events will have the fd, pos, user_data fields set to the
+ * values associated with the registered event.
+ *
+ * nevents must be positive and a non-negative timeout must not exceed
+ * INT_MAX; both are checked only by assertions. wait_event_info is handed
+ * to pgstat_report_wait_start() for the duration of the wait. If the set's
+ * latch is already set on entry, only that WL_LATCH_SET event is reported.
+ */
+int
+WaitEventSetWait(WaitEventSet *set, long timeout,
+ WaitEvent *occurred_events, int nevents,
+ uint32 wait_event_info)
+{
+ int returned_events = 0;
+ instr_time start_time;
+ instr_time cur_time;
+ long cur_timeout = -1; /* -1 means block indefinitely */
+
+ Assert(nevents > 0);
+
+ /*
+ * Initialize timeout if requested. We must record the current time so
+ * that we can determine the remaining timeout if interrupted.
+ */
+ if (timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(start_time);
+ Assert(timeout >= 0 && timeout <= INT_MAX);
+ cur_timeout = timeout;
+ }
+
+ pgstat_report_wait_start(wait_event_info);
+
+#ifndef WIN32
+ waiting = true;
+#else
+ /* Ensure that signals are serviced even if latch is already set */
+ pgwin32_dispatch_queued_signals();
+#endif
+ while (returned_events == 0)
+ {
+ int rc;
+
+ /*
+ * Check if the latch is set already. If so, leave the loop
+ * immediately, avoid blocking again. We don't attempt to report any
+ * other events that might also be satisfied.
+ *
+ * If someone sets the latch between this and the
+ * WaitEventSetWaitBlock() below, the setter will write a byte to the
+ * pipe (or signal us and the signal handler will do that), and the
+ * readiness routine will return immediately.
+ *
+ * On unix, if there's a pending byte in the self pipe, we'll notice
+ * whenever blocking. Only clearing the pipe in that case avoids
+ * having to drain it every time WaitLatchOrSocket() is used. Should
+ * the pipe-buffer fill up we're still ok, because the pipe is in
+ * nonblocking mode. It's unlikely for that to happen, because the
+ * self pipe isn't filled unless we're blocking (waiting = true), or
+ * from inside a signal handler in latch_sigurg_handler().
+ *
+ * On windows, we'll also notice if there's a pending event for the
+ * latch when blocking, but there's no danger of anything filling up,
+ * as "Setting an event that is already set has no effect.".
+ *
+ * Note: we assume that the kernel calls involved in latch management
+ * will provide adequate synchronization on machines with weak memory
+ * ordering, so that we cannot miss seeing is_set if a notification
+ * has already been queued.
+ */
+ if (set->latch && !set->latch->is_set)
+ {
+ /* about to sleep on a latch */
+ set->latch->maybe_sleeping = true;
+ pg_memory_barrier();
+ /* and recheck */
+ }
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->pos = set->latch_pos;
+ occurred_events->user_data =
+ set->events[set->latch_pos].user_data;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+
+ /* could have been set above */
+ set->latch->maybe_sleeping = false;
+
+ break;
+ }
+
+ /*
+ * Wait for events using the readiness primitive chosen at the top of
+ * this file. If -1 is returned, a timeout has occurred, if 0 we have
+ * to retry, everything >= 1 is the number of returned events.
+ */
+ rc = WaitEventSetWaitBlock(set, cur_timeout,
+ occurred_events, nevents);
+
+ if (set->latch)
+ {
+ Assert(set->latch->maybe_sleeping);
+ set->latch->maybe_sleeping = false;
+ }
+
+ if (rc == -1)
+ break; /* timeout occurred */
+ else
+ returned_events = rc;
+
+ /* If we're not done, update cur_timeout for next iteration */
+ if (returned_events == 0 && timeout >= 0)
+ {
+ INSTR_TIME_SET_CURRENT(cur_time);
+ INSTR_TIME_SUBTRACT(cur_time, start_time); /* cur_time now holds the elapsed time */
+ cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time);
+ if (cur_timeout <= 0)
+ break;
+ }
+ }
+#ifndef WIN32
+ waiting = false;
+#endif
+
+ pgstat_report_wait_end();
+
+ return returned_events;
+}
+
+
+#if defined(WAIT_USE_EPOLL)
+
+/*
+ * Wait using linux's epoll_wait(2).
+ *
+ * This is the preferable wait method, as several readiness notifications are
+ * delivered, without having to iterate through all of set->events. The return
+ * epoll_event struct contain a pointer to our events, making association
+ * easy.
+ *
+ * Returns -1 if the timeout expired, 0 if nothing reportable happened and the
+ * caller should retry (the call was interrupted, or every wakeup turned out
+ * to be spurious), otherwise the number of entries stored in
+ * occurred_events.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+					  WaitEvent *occurred_events, int nevents)
+{
+	int			returned_events = 0;
+	int			max_kernel_events;
+	int			rc;
+	WaitEvent  *cur_event;
+	struct epoll_event *cur_epoll_event;
+
+	/*
+	 * The kernel writes into set->epoll_ret_events, which has room for
+	 * set->nevents_space entries, and the results are then copied into the
+	 * caller's array of nevents entries.  Ask for no more than fits in both,
+	 * otherwise a caller passing nevents > nevents_space would let
+	 * epoll_wait() write past the end of our buffer.
+	 */
+	max_kernel_events = (nevents < set->nevents_space) ?
+		nevents : set->nevents_space;
+
+	/* Sleep */
+	rc = epoll_wait(set->epoll_fd, set->epoll_ret_events,
+					max_kernel_events, cur_timeout);
+
+	/* Check return code */
+	if (rc < 0)
+	{
+		/* EINTR is okay, otherwise complain */
+		if (errno != EINTR)
+		{
+			waiting = false;
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("%s() failed: %m",
+							"epoll_wait")));
+		}
+		return 0;
+	}
+	else if (rc == 0)
+	{
+		/* timeout exceeded */
+		return -1;
+	}
+
+	/*
+	 * At least one event occurred, iterate over the returned epoll events
+	 * until they're either all processed, or we've returned all the events
+	 * the caller desired.
+	 */
+	for (cur_epoll_event = set->epoll_ret_events;
+		 cur_epoll_event < (set->epoll_ret_events + rc) &&
+		 returned_events < nevents;
+		 cur_epoll_event++)
+	{
+		/* epoll's data pointer is set to the associated WaitEvent */
+		cur_event = (WaitEvent *) cur_epoll_event->data.ptr;
+
+		occurred_events->pos = cur_event->pos;
+		occurred_events->user_data = cur_event->user_data;
+		occurred_events->events = 0;
+
+		if (cur_event->events == WL_LATCH_SET &&
+			cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+		{
+			/* Drain the signalfd. */
+			drain();
+
+			/* report the latch only if it really is set */
+			if (set->latch && set->latch->is_set)
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_LATCH_SET;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events == WL_POSTMASTER_DEATH &&
+				 cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))
+		{
+			/*
+			 * We expect an EPOLLHUP when the remote end is closed, but
+			 * because we don't expect the pipe to become readable or to have
+			 * any errors either, treat those cases as postmaster death, too.
+			 *
+			 * Be paranoid about a spurious event signaling the postmaster as
+			 * being dead.  There have been reports about that happening with
+			 * older primitives (select(2) to be specific), and a spurious
+			 * WL_POSTMASTER_DEATH event would be painful.  Re-checking
+			 * doesn't cost much.
+			 */
+			if (!PostmasterIsAliveInternal())
+			{
+				if (set->exit_on_postmaster_death)
+					proc_exit(1);
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_POSTMASTER_DEATH;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+		{
+			Assert(cur_event->fd != PGINVALID_SOCKET);
+
+			if ((cur_event->events & WL_SOCKET_READABLE) &&
+				(cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)))
+			{
+				/* data available in socket, or EOF */
+				occurred_events->events |= WL_SOCKET_READABLE;
+			}
+
+			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+				(cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP)))
+			{
+				/* writable, or EOF */
+				occurred_events->events |= WL_SOCKET_WRITEABLE;
+			}
+
+			if (occurred_events->events != 0)
+			{
+				occurred_events->fd = cur_event->fd;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+	}
+
+	return returned_events;
+}
+
+#elif defined(WAIT_USE_KQUEUE)
+
+/*
+ * Wait using kevent(2) on BSD-family systems and macOS.
+ *
+ * For now this mirrors the epoll code, but in future it could modify the fd
+ * set in the same call to kevent as it uses for waiting instead of doing that
+ * with separate system calls.
+ *
+ * Returns -1 if the timeout expired, 0 if nothing reportable happened and the
+ * caller should retry, otherwise the number of entries stored in
+ * occurred_events.
+ */
+static int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+					  WaitEvent *occurred_events, int nevents)
+{
+	int			returned_events = 0;
+	int			max_kernel_events;
+	int			rc;
+	WaitEvent  *cur_event;
+	struct kevent *cur_kqueue_event;
+	struct timespec timeout;
+	struct timespec *timeout_p;
+
+	/* A negative timeout means block indefinitely; else convert from ms. */
+	if (cur_timeout < 0)
+		timeout_p = NULL;
+	else
+	{
+		timeout.tv_sec = cur_timeout / 1000;
+		timeout.tv_nsec = (cur_timeout % 1000) * 1000000;
+		timeout_p = &timeout;
+	}
+
+	/*
+	 * Report postmaster events discovered by WaitEventAdjustKqueue() or an
+	 * earlier call to WaitEventSetWait().
+	 *
+	 * NOTE(review): pos and user_data of the returned entry are not filled
+	 * in on this path.
+	 */
+	if (unlikely(set->report_postmaster_not_running))
+	{
+		if (set->exit_on_postmaster_death)
+			proc_exit(1);
+		occurred_events->fd = PGINVALID_SOCKET;
+		occurred_events->events = WL_POSTMASTER_DEATH;
+		return 1;
+	}
+
+	/*
+	 * The kernel writes into set->kqueue_ret_events, which has room for
+	 * set->nevents_space entries, and the results are then copied into the
+	 * caller's array of nevents entries.  Ask for no more than fits in both,
+	 * otherwise a caller passing nevents > nevents_space would let kevent()
+	 * write past the end of our buffer.
+	 */
+	max_kernel_events = (nevents < set->nevents_space) ?
+		nevents : set->nevents_space;
+
+	/* Sleep */
+	rc = kevent(set->kqueue_fd, NULL, 0,
+				set->kqueue_ret_events, max_kernel_events,
+				timeout_p);
+
+	/* Check return code */
+	if (rc < 0)
+	{
+		/* EINTR is okay, otherwise complain */
+		if (errno != EINTR)
+		{
+			waiting = false;
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("%s() failed: %m",
+							"kevent")));
+		}
+		return 0;
+	}
+	else if (rc == 0)
+	{
+		/* timeout exceeded */
+		return -1;
+	}
+
+	/*
+	 * At least one event occurred, iterate over the returned kqueue events
+	 * until they're either all processed, or we've returned all the events
+	 * the caller desired.
+	 */
+	for (cur_kqueue_event = set->kqueue_ret_events;
+		 cur_kqueue_event < (set->kqueue_ret_events + rc) &&
+		 returned_events < nevents;
+		 cur_kqueue_event++)
+	{
+		/* kevent's udata points to the associated WaitEvent */
+		cur_event = AccessWaitEvent(cur_kqueue_event);
+
+		occurred_events->pos = cur_event->pos;
+		occurred_events->user_data = cur_event->user_data;
+		occurred_events->events = 0;
+
+		if (cur_event->events == WL_LATCH_SET &&
+			cur_kqueue_event->filter == EVFILT_SIGNAL)
+		{
+			/* report the latch only if it really is set */
+			if (set->latch && set->latch->is_set)
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_LATCH_SET;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events == WL_POSTMASTER_DEATH &&
+				 cur_kqueue_event->filter == EVFILT_PROC &&
+				 (cur_kqueue_event->fflags & NOTE_EXIT) != 0)
+		{
+			/*
+			 * The kernel will tell this kqueue object only once about the
+			 * exit of the postmaster, so let's remember that for next time so
+			 * that we provide level-triggered semantics.
+			 */
+			set->report_postmaster_not_running = true;
+
+			if (set->exit_on_postmaster_death)
+				proc_exit(1);
+			occurred_events->fd = PGINVALID_SOCKET;
+			occurred_events->events = WL_POSTMASTER_DEATH;
+			occurred_events++;
+			returned_events++;
+		}
+		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+		{
+			Assert(cur_event->fd >= 0);
+
+			if ((cur_event->events & WL_SOCKET_READABLE) &&
+				(cur_kqueue_event->filter == EVFILT_READ))
+			{
+				/* readable, or EOF */
+				occurred_events->events |= WL_SOCKET_READABLE;
+			}
+
+			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+				(cur_kqueue_event->filter == EVFILT_WRITE))
+			{
+				/* writable, or EOF */
+				occurred_events->events |= WL_SOCKET_WRITEABLE;
+			}
+
+			if (occurred_events->events != 0)
+			{
+				occurred_events->fd = cur_event->fd;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+	}
+
+	return returned_events;
+}
+
+#elif defined(WAIT_USE_POLL)
+
+/*
+ * Wait using poll(2).
+ *
+ * This allows to receive readiness notifications for several events at once,
+ * but requires iterating through all of set->pollfds.
+ *
+ * Returns -1 if the timeout expired, 0 if nothing reportable happened and the
+ * caller should retry, otherwise the number of entries stored in
+ * occurred_events.
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+					  WaitEvent *occurred_events, int nevents)
+{
+	int			returned_events = 0;
+	int			rc;
+	WaitEvent  *cur_event;
+	struct pollfd *cur_pollfd;
+
+	/* Sleep */
+	rc = poll(set->pollfds, set->nevents, (int) cur_timeout);
+
+	/* Check return code */
+	if (rc < 0)
+	{
+		/* EINTR is okay, otherwise complain */
+		if (errno != EINTR)
+		{
+			waiting = false;
+			ereport(ERROR,
+					(errcode_for_socket_access(),
+					 errmsg("%s() failed: %m",
+							"poll")));
+		}
+		return 0;
+	}
+	else if (rc == 0)
+	{
+		/* timeout exceeded */
+		return -1;
+	}
+
+	/*
+	 * set->events and set->pollfds are parallel arrays; walk them together
+	 * until every slot was examined or the caller's array is full.
+	 */
+	for (cur_event = set->events, cur_pollfd = set->pollfds;
+		 cur_event < (set->events + set->nevents) &&
+		 returned_events < nevents;
+		 cur_event++, cur_pollfd++)
+	{
+		/* no activity on this FD, skip */
+		if (cur_pollfd->revents == 0)
+			continue;
+
+		occurred_events->pos = cur_event->pos;
+		occurred_events->user_data = cur_event->user_data;
+		occurred_events->events = 0;
+
+		if (cur_event->events == WL_LATCH_SET &&
+			(cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+		{
+			/* There's data in the self-pipe, clear it. */
+			drain();
+
+			/* report the latch only if it really is set */
+			if (set->latch && set->latch->is_set)
+			{
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_LATCH_SET;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events == WL_POSTMASTER_DEATH &&
+				 (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL)))
+		{
+			/*
+			 * We expect a POLLHUP when the remote end is closed, but because
+			 * we don't expect the pipe to become readable or to have any
+			 * errors either, treat those cases as postmaster death, too.
+			 *
+			 * Be paranoid about a spurious event signaling the postmaster as
+			 * being dead.  There have been reports about that happening with
+			 * older primitives (select(2) to be specific), and a spurious
+			 * WL_POSTMASTER_DEATH event would be painful.  Re-checking
+			 * doesn't cost much.
+			 */
+			if (!PostmasterIsAliveInternal())
+			{
+				if (set->exit_on_postmaster_death)
+					proc_exit(1);
+				occurred_events->fd = PGINVALID_SOCKET;
+				occurred_events->events = WL_POSTMASTER_DEATH;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+		else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))
+		{
+			int			errflags = POLLHUP | POLLERR | POLLNVAL;
+
+			/*
+			 * A socket event must carry a real socket.  (Comparing with >=
+			 * here would also accept PGINVALID_SOCKET itself.)
+			 */
+			Assert(cur_event->fd != PGINVALID_SOCKET);
+
+			if ((cur_event->events & WL_SOCKET_READABLE) &&
+				(cur_pollfd->revents & (POLLIN | errflags)))
+			{
+				/* data available in socket, or EOF */
+				occurred_events->events |= WL_SOCKET_READABLE;
+			}
+
+			if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+				(cur_pollfd->revents & (POLLOUT | errflags)))
+			{
+				/* writeable, or EOF */
+				occurred_events->events |= WL_SOCKET_WRITEABLE;
+			}
+
+			if (occurred_events->events != 0)
+			{
+				occurred_events->fd = cur_event->fd;
+				occurred_events++;
+				returned_events++;
+			}
+		}
+	}
+	return returned_events;
+}
+
+#elif defined(WAIT_USE_WIN32)
+
+/*
+ * Wait using Windows' WaitForMultipleObjects().
+ *
+ * Unfortunately this will only ever return a single readiness notification at
+ * a time. Note that while the official documentation for
+ * WaitForMultipleObjects is ambiguous about multiple events being "consumed"
+ * with a single bWaitAll = FALSE call,
+ * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms
+ * that only one event is "consumed".
+ */
+static inline int
+WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout,
+ WaitEvent *occurred_events, int nevents)
+{
+ int returned_events = 0;
+ DWORD rc;
+ WaitEvent *cur_event;
+
+ /* Reset any wait events that need it */
+ for (cur_event = set->events;
+ cur_event < (set->events + set->nevents);
+ cur_event++)
+ {
+ if (cur_event->reset)
+ {
+ WaitEventAdjustWin32(set, cur_event);
+ cur_event->reset = false;
+ }
+
+ /*
+ * Windows does not guarantee to log an FD_WRITE network event
+ * indicating that more data can be sent unless the previous send()
+ * failed with WSAEWOULDBLOCK. While our caller might well have made
+ * such a call, we cannot assume that here. Therefore, if waiting for
+ * write-ready, force the issue by doing a dummy send(). If the dummy
+ * send() succeeds, assume that the socket is in fact write-ready, and
+ * return immediately. Also, if it fails with something other than
+ * WSAEWOULDBLOCK, return a write-ready indication to let our caller
+ * deal with the error condition.
+ */
+ if (cur_event->events & WL_SOCKET_WRITEABLE)
+ {
+ char c;
+ WSABUF buf;
+ DWORD sent;
+ int r;
+
+ buf.buf = &c;
+ buf.len = 0;
+
+ r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL);
+ if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK)
+ {
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = WL_SOCKET_WRITEABLE;
+ occurred_events->fd = cur_event->fd;
+ return 1;
+ }
+ }
+ }
+
+ /*
+ * Sleep.
+ *
+ * Need to wait for ->nevents + 1, because signal handle is in [0].
+ */
+ rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE,
+ cur_timeout);
+
+ /* Check return code */
+ if (rc == WAIT_FAILED)
+ elog(ERROR, "WaitForMultipleObjects() failed: error code %lu",
+ GetLastError());
+ else if (rc == WAIT_TIMEOUT)
+ {
+ /* timeout exceeded */
+ return -1;
+ }
+
+ if (rc == WAIT_OBJECT_0)
+ {
+ /* Service newly-arrived signals */
+ pgwin32_dispatch_queued_signals();
+ return 0; /* retry */
+ }
+
+ /*
+ * With an offset of one, due to the always present pgwin32_signal_event,
+ * the handle offset directly corresponds to a wait event.
+ */
+ cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1];
+
+ occurred_events->pos = cur_event->pos;
+ occurred_events->user_data = cur_event->user_data;
+ occurred_events->events = 0;
+
+ if (cur_event->events == WL_LATCH_SET)
+ {
+ /*
+ * We cannot use set->latch->event to reset the fired event if we
+ * aren't waiting on this latch now.
+ */
+ if (!ResetEvent(set->handles[cur_event->pos + 1]))
+ elog(ERROR, "ResetEvent failed: error code %lu", GetLastError());
+
+ if (set->latch && set->latch->is_set)
+ {
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_LATCH_SET;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events == WL_POSTMASTER_DEATH)
+ {
+ /*
+ * Postmaster apparently died. Since the consequences of falsely
+ * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take
+ * the trouble to positively verify this with PostmasterIsAlive(),
+ * even though there is no known reason to think that the event could
+ * be falsely set on Windows.
+ */
+ if (!PostmasterIsAliveInternal())
+ {
+ if (set->exit_on_postmaster_death)
+ proc_exit(1);
+ occurred_events->fd = PGINVALID_SOCKET;
+ occurred_events->events = WL_POSTMASTER_DEATH;
+ occurred_events++;
+ returned_events++;
+ }
+ }
+ else if (cur_event->events & WL_SOCKET_MASK)
+ {
+ WSANETWORKEVENTS resEvents;
+ HANDLE handle = set->handles[cur_event->pos + 1];
+
+ Assert(cur_event->fd);
+
+ occurred_events->fd = cur_event->fd;
+
+ ZeroMemory(&resEvents, sizeof(resEvents));
+ if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0)
+ elog(ERROR, "failed to enumerate network events: error code %d",
+ WSAGetLastError());
+ if ((cur_event->events & WL_SOCKET_READABLE) &&
+ (resEvents.lNetworkEvents & FD_READ))
+ {
+ /* data available in socket */
+ occurred_events->events |= WL_SOCKET_READABLE;
+
+ /*------
+ * WaitForMultipleObjects doesn't guarantee that a read event will
+ * be returned if the latch is set at the same time. Even if it
+ * did, the caller might drop that event expecting it to reoccur
+ * on next call. So, we must force the event to be reset if this
+ * WaitEventSet is used again in order to avoid an indefinite
+ * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx
+ * for the behavior of socket events.
+ *------
+ */
+ cur_event->reset = true;
+ }
+ if ((cur_event->events & WL_SOCKET_WRITEABLE) &&
+ (resEvents.lNetworkEvents & FD_WRITE))
+ {
+ /* writeable */
+ occurred_events->events |= WL_SOCKET_WRITEABLE;
+ }
+ if ((cur_event->events & WL_SOCKET_CONNECTED) &&
+ (resEvents.lNetworkEvents & FD_CONNECT))
+ {
+ /* connected */
+ occurred_events->events |= WL_SOCKET_CONNECTED;
+ }
+ if (resEvents.lNetworkEvents & FD_CLOSE)
+ {
+ /* EOF/error, so signal all caller-requested socket flags */
+ occurred_events->events |= (cur_event->events & WL_SOCKET_MASK);
+ }
+
+ if (occurred_events->events != 0)
+ {
+ occurred_events++;
+ returned_events++;
+ }
+ }
+
+ return returned_events;
+}
+#endif
+
+/*
+ * GetNumRegisteredWaitEvents
+ *
+ * Report how many wait events are currently registered in 'set', as
+ * recorded in the set's nevents counter.
+ */
+int
+GetNumRegisteredWaitEvents(WaitEventSet *set)
+{
+    int         registered = set->nevents;
+
+    return registered;
+}
+
+#if defined(WAIT_USE_SELF_PIPE)
+
+/*
+ * latch_sigurg_handler
+ *
+ * Handler for the SIGURG that SetLatch sends to wake up a process waiting
+ * on a latch.  If we are currently marked as waiting, push a byte into the
+ * self-pipe so that WaitLatch wakes up.
+ *
+ * errno is captured on entry and put back on exit, so the interrupted code
+ * does not see it change.
+ */
+static void
+latch_sigurg_handler(SIGNAL_ARGS)
+{
+    const int   errno_on_entry = errno;
+
+    if (waiting)
+    {
+        sendSelfPipeByte();
+    }
+
+    errno = errno_on_entry;
+}
+
+/*
+ * sendSelfPipeByte
+ *
+ * Write a single byte to the self-pipe in order to wake up WaitLatch.
+ *
+ * A write interrupted by a signal (EINTR) is simply repeated.  Every other
+ * outcome ends the attempt without reporting anything.
+ */
+static void
+sendSelfPipeByte(void)
+{
+    char        wakeup_byte = 0;
+
+    for (;;)
+    {
+        int         written = write(selfpipe_writefd, &wakeup_byte, 1);
+
+        /* The write went through; nothing more to do. */
+        if (written >= 0)
+            return;
+
+        /* Interrupted by a signal: just try again. */
+        if (errno == EINTR)
+            continue;
+
+        /*
+         * EAGAIN/EWOULDBLOCK means the pipe is full, and what is already in
+         * it is enough to wake up WaitLatch.  For any other failure we
+         * might be running inside a signal handler, where elog() is not
+         * safe, so the error is silently ignored as well.
+         */
+        return;
+    }
+}
+
+#endif
+
+#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD)
+
+/*
+ * Read all available data from self-pipe or signalfd.
+ *
+ * Note: this is only called when waiting = true. If it fails and doesn't
+ * return, it must reset that flag first (though ideally, this will never
+ * happen).
+ *
+ * The descriptor read is selfpipe_readfd under WAIT_USE_SELF_PIPE and
+ * signal_fd otherwise. Reading stops as soon as read() reports
+ * EAGAIN/EWOULDBLOCK or returns fewer bytes than the local buffer holds;
+ * EINTR is retried. Any other read() failure, and EOF, first clear
+ * "waiting" and then report an ERROR.
+ */
+static void
+drain(void)
+{
+ char buf[1024];
+ int rc;
+ int fd;
+
+#ifdef WAIT_USE_SELF_PIPE
+ fd = selfpipe_readfd;
+#else
+ fd = signal_fd;
+#endif
+
+ for (;;)
+ {
+ rc = read(fd, buf, sizeof(buf)); /* contents are discarded; only the count matters */
+ if (rc < 0)
+ {
+ if (errno == EAGAIN || errno == EWOULDBLOCK)
+ break; /* the descriptor is empty */
+ else if (errno == EINTR)
+ continue; /* retry */
+ else
+ {
+ waiting = false; /* reset before raising the error, per the header note */
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "read() on self-pipe failed: %m");
+#else
+ elog(ERROR, "read() on signalfd failed: %m");
+#endif
+ }
+ }
+ else if (rc == 0)
+ {
+ waiting = false; /* reset before raising the error, per the header note */
+#ifdef WAIT_USE_SELF_PIPE
+ elog(ERROR, "unexpected EOF on self-pipe");
+#else
+ elog(ERROR, "unexpected EOF on signalfd");
+#endif
+ }
+ else if (rc < sizeof(buf)) /* rc > 0 here, so the signed/unsigned comparison is safe */
+ {
+ /* we successfully drained the pipe; no need to read() again */
+ break;
+ }
+ /* else buffer wasn't big enough, so read again */
+ }
+}
+
+#endif
diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c
new file mode 100644
index 0000000..280c239
--- /dev/null
+++ b/src/backend/storage/ipc/pmsignal.c
@@ -0,0 +1,430 @@
+/*-------------------------------------------------------------------------
+ *
+ * pmsignal.c
+ * routines for signaling between the postmaster and its child processes
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/pmsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#ifdef HAVE_SYS_PRCTL_H
+#include <sys/prctl.h>
+#endif
+
+#include "miscadmin.h"
+#include "postmaster/postmaster.h"
+#include "replication/walsender.h"
+#include "storage/pmsignal.h"
+#include "storage/shmem.h"
+
+
+/*
+ * The postmaster is signaled by its children by sending SIGUSR1. The
+ * specific reason is communicated via flags in shared memory. We keep
+ * a boolean flag for each possible "reason", so that different reasons
+ * can be signaled by different backends at the same time. (However,
+ * if the same reason is signaled more than once simultaneously, the
+ * postmaster will observe it only once.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * In addition to the per-reason flags, we store a set of per-child-process
+ * flags that are currently used only for detecting whether a backend has
+ * exited without performing proper shutdown. The per-child-process flags
+ * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is
+ * available for assignment. An ASSIGNED slot is associated with a postmaster
+ * child process, but either the process has not touched shared memory yet,
+ * or it has successfully cleaned up after itself. An ACTIVE slot means the
+ * process is actively using shared memory. The slots are assigned to
+ * child processes at random, and postmaster.c is responsible for tracking
+ * which one goes with which PID.
+ *
+ * Actually there is a fourth state, WALSENDER. This is just like ACTIVE,
+ * but carries the extra information that the child is a WAL sender.
+ * WAL senders too start in ACTIVE state, but switch to WALSENDER once they
+ * start streaming the WAL (and they never go back to ACTIVE after that).
+ *
+ * We also have a shared-memory field that is used for communication in
+ * the opposite direction, from postmaster to children: it tells why the
+ * postmaster has broadcast SIGQUIT signals, if indeed it has done so.
+ */
+
+#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */
+#define PM_CHILD_ASSIGNED 1
+#define PM_CHILD_ACTIVE 2
+#define PM_CHILD_WALSENDER 3
+
+/*
+ * Shared-memory state used for signaling between the postmaster and its
+ * children.
+ *
+ * "typedef struct PMSignalData PMSignalData" appears in pmsignal.h
+ *
+ * The struct ends in a flexible array; PMSignalShmemSize() computes the
+ * allocation size for MaxLivePostmasterChildren() PMChildFlags entries.
+ */
+struct PMSignalData
+{
+ /* per-reason flags for signaling the postmaster; indexed by PMSignalReason */
+ sig_atomic_t PMSignalFlags[NUM_PMSIGNALS];
+ /* global flags for signals from postmaster to children */
+ QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */
+ /* per-child-process flags */
+ int num_child_flags; /* # of entries in PMChildFlags[] */
+ int next_child_flag; /* next slot to try to assign */
+ sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER]; /* one PM_CHILD_* state per slot */
+};
+
+NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL;
+
+/*
+ * Signal handler to be notified if postmaster dies.
+ *
+ * Only compiled when USE_POSTMASTER_DEATH_SIGNAL is defined. The handler
+ * does nothing but set postmaster_possibly_dead; that flag is also set by
+ * PostmasterDeathSignalInit() and is cleared at the start of
+ * PostmasterIsAliveInternal(), which performs the actual liveness check.
+ */
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+volatile sig_atomic_t postmaster_possibly_dead = false;
+
+static void
+postmaster_death_handler(int signo)
+{
+ postmaster_possibly_dead = true; /* signo is not examined */
+}
+
+/*
+ * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already
+ * used for other things, so choose another one.
+ *
+ * Currently, we assume that we can always find a signal to use. That
+ * seems like a reasonable assumption for all platforms that are modern
+ * enough to have a parent-death signaling mechanism.
+ *
+ * SIGINFO is preferred when the platform defines it, then SIGPWR; if
+ * neither exists the build fails with an #error.
+ */
+#if defined(SIGINFO)
+#define POSTMASTER_DEATH_SIGNAL SIGINFO
+#elif defined(SIGPWR)
+#define POSTMASTER_DEATH_SIGNAL SIGPWR
+#else
+#error "cannot find a signal to use for postmaster death"
+#endif
+
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+
+/*
+ * PMSignalShmemSize
+ *		Report how much shared memory pmsignal.c needs.
+ *
+ * That is the fixed part of PMSignalData up to the PMChildFlags array, plus
+ * one sig_atomic_t per possible live postmaster child.  The overflow-checked
+ * add_size()/mul_size() helpers are used for the arithmetic.
+ */
+Size
+PMSignalShmemSize(void)
+{
+    Size        child_flags_bytes;
+
+    child_flags_bytes = mul_size(MaxLivePostmasterChildren(),
+                                 sizeof(sig_atomic_t));
+
+    return add_size(offsetof(PMSignalData, PMChildFlags), child_flags_bytes);
+}
+
+/*
+ * PMSignalShmemInit
+ *		Create or attach to the "PMSignalState" shared-memory struct.
+ *
+ * If the struct did not exist yet, it is zero-filled and its
+ * num_child_flags field is set to MaxLivePostmasterChildren().
+ */
+void
+PMSignalShmemInit(void)
+{
+    Size        state_size = PMSignalShmemSize();
+    bool        already_present;
+
+    PMSignalState = (PMSignalData *)
+        ShmemInitStruct("PMSignalState", state_size, &already_present);
+
+    if (already_present)
+        return;
+
+    /* First-time creation: start with every flag cleared. */
+    MemSet(unvolatize(PMSignalData *, PMSignalState), 0, state_size);
+    PMSignalState->num_child_flags = MaxLivePostmasterChildren();
+}
+
+/*
+ * SendPostmasterSignal
+ *		Tell the postmaster about 'reason' from a child process.
+ *
+ * The per-reason flag in shared memory is set first, and then SIGUSR1 is
+ * sent to the postmaster.  In a standalone backend (not running under a
+ * postmaster) this is a no-op.
+ */
+void
+SendPostmasterSignal(PMSignalReason reason)
+{
+    if (IsUnderPostmaster)
+    {
+        /* Publish the reason before waking the postmaster. */
+        PMSignalState->PMSignalFlags[reason] = true;
+        kill(PostmasterPid, SIGUSR1);
+    }
+}
+
+/*
+ * CheckPostmasterSignal
+ *		Test whether 'reason' has been signaled and, if so, consume it.
+ *
+ * Meant to be called by the postmaster after it receives SIGUSR1.  Returns
+ * true and clears the flag when it was set; returns false, leaving the flag
+ * untouched, when it was not.
+ */
+bool
+CheckPostmasterSignal(PMSignalReason reason)
+{
+    /* Never write to a flag we have not actually seen set. */
+    if (!PMSignalState->PMSignalFlags[reason])
+        return false;
+
+    PMSignalState->PMSignalFlags[reason] = false;
+    return true;
+}
+
+/*
+ * SetQuitSignalReason
+ *		Record in shared memory why the system is being shut down.
+ *
+ * The postmaster should call this before it sends SIGQUIT to its children,
+ * so that they can look the reason up.
+ *
+ * There is no matching "clear" step: after a crash and restart the field is
+ * reset as a side effect of shared memory being rebuilt.
+ */
+void
+SetQuitSignalReason(QuitSignalReason reason)
+{
+    volatile PMSignalData *state = PMSignalState;
+
+    state->sigquit_reason = reason;
+}
+
+/*
+ * GetQuitSignalReason
+ *		Fetch the shutdown reason recorded by the postmaster.
+ *
+ * Child processes call this upon receiving SIGQUIT.  PMQUIT_NOT_SENT is
+ * returned if the postmaster has not actually sent SIGQUIT, and also when
+ * we are not running under a postmaster or the shared state pointer has
+ * not been set up.
+ */
+QuitSignalReason
+GetQuitSignalReason(void)
+{
+    /* We can be called from a signal handler, so check everything first. */
+    if (IsUnderPostmaster && PMSignalState != NULL)
+        return PMSignalState->sigquit_reason;
+
+    return PMQUIT_NOT_SENT;
+}
+
+
+/*
+ * AssignPostmasterChildSlot
+ *		Pick an UNUSED slot for a new postmaster child and mark it ASSIGNED.
+ *
+ * The return value is the slot number, counted from one up to N.
+ *
+ * No locking is needed, since only the postmaster may run this routine.
+ */
+int
+AssignPostmasterChildSlot(void)
+{
+    int         candidate = PMSignalState->next_child_flag;
+    int         remaining = PMSignalState->num_child_flags;
+
+    /*
+     * Visit each slot at most once, walking downwards from the slot handed
+     * out last time and wrapping around at zero.  Starting there avoids
+     * rescanning the low-numbered slots again and again.
+     */
+    while (remaining-- > 0)
+    {
+        candidate--;
+        if (candidate < 0)
+            candidate = PMSignalState->num_child_flags - 1;
+
+        if (PMSignalState->PMChildFlags[candidate] != PM_CHILD_UNUSED)
+            continue;
+
+        PMSignalState->PMChildFlags[candidate] = PM_CHILD_ASSIGNED;
+        PMSignalState->next_child_flag = candidate;
+        return candidate + 1;
+    }
+
+    /* No free slot: postmaster.c must have miscounted its children */
+    elog(FATAL, "no free slots in PMChildFlags array");
+    return 0;                   /* keep compiler quiet */
+}
+
+/*
+ * ReleasePostmasterChildSlot
+ *		Give a slot back after a postmaster child has died.
+ *
+ * Must be called in the postmaster process.  'slot' is the one-based slot
+ * number.
+ *
+ * Returns true when the slot was found in ASSIGNED state, which is the
+ * expected case; false means the child did not clean up after itself.
+ */
+bool
+ReleasePostmasterChildSlot(int slot)
+{
+    bool        was_assigned;
+    int         idx;
+
+    Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+    idx = slot - 1;
+
+    /*
+     * Nothing is asserted about the current state: postmaster.c may call
+     * this twice for a crashed child, so the slot can already be unused.
+     */
+    was_assigned = (PMSignalState->PMChildFlags[idx] == PM_CHILD_ASSIGNED);
+    PMSignalState->PMChildFlags[idx] = PM_CHILD_UNUSED;
+
+    return was_assigned;
+}
+
+/*
+ * IsPostmasterChildWalSender
+ *		Report whether the given one-based slot is currently marked as
+ *		belonging to a walsender process.
+ */
+bool
+IsPostmasterChildWalSender(int slot)
+{
+    int         idx;
+
+    Assert(slot > 0 && slot <= PMSignalState->num_child_flags);
+    idx = slot - 1;
+
+    return PMSignalState->PMChildFlags[idx] == PM_CHILD_WALSENDER;
+}
+
+/*
+ * MarkPostmasterChildActive
+ *		Flag this child as about to start actively using shared memory.
+ *
+ * Runs in the child process and operates on its own slot, MyPMChildSlot,
+ * which is expected to be in ASSIGNED state.
+ */
+void
+MarkPostmasterChildActive(void)
+{
+    const int   my_slot = MyPMChildSlot;
+    const int   idx = my_slot - 1;
+
+    Assert(my_slot > 0 && my_slot <= PMSignalState->num_child_flags);
+    Assert(PMSignalState->PMChildFlags[idx] == PM_CHILD_ASSIGNED);
+
+    PMSignalState->PMChildFlags[idx] = PM_CHILD_ACTIVE;
+}
+
+/*
+ * MarkPostmasterChildWalSender
+ *		Flag this child as a WAL sender process.
+ *
+ * Runs in the child process, some time after it has been marked active; the
+ * slot is therefore expected to be in ACTIVE state.
+ */
+void
+MarkPostmasterChildWalSender(void)
+{
+    const int   my_slot = MyPMChildSlot;
+    const int   idx = my_slot - 1;
+
+    Assert(am_walsender);
+    Assert(my_slot > 0 && my_slot <= PMSignalState->num_child_flags);
+    Assert(PMSignalState->PMChildFlags[idx] == PM_CHILD_ACTIVE);
+
+    PMSignalState->PMChildFlags[idx] = PM_CHILD_WALSENDER;
+}
+
+/*
+ * MarkPostmasterChildInactive
+ *		Flag this child as finished with shared memory.
+ *
+ * Runs in the child process.  The slot is expected to be in ACTIVE or
+ * WALSENDER state and is put back into ASSIGNED state.
+ */
+void
+MarkPostmasterChildInactive(void)
+{
+    const int   my_slot = MyPMChildSlot;
+    const int   idx = my_slot - 1;
+
+    Assert(my_slot > 0 && my_slot <= PMSignalState->num_child_flags);
+    Assert(PMSignalState->PMChildFlags[idx] == PM_CHILD_ACTIVE ||
+           PMSignalState->PMChildFlags[idx] == PM_CHILD_WALSENDER);
+
+    PMSignalState->PMChildFlags[idx] = PM_CHILD_ASSIGNED;
+}
+
+
+/*
+ * PostmasterIsAliveInternal - check whether postmaster process is still alive
+ *
+ * This is the slow path of PostmasterIsAlive(), where the caller has already
+ * checked 'postmaster_possibly_dead'. (On platforms that don't support
+ * a signal for parent death, PostmasterIsAlive() is just an alias for this.)
+ *
+ * Returns true if the postmaster still appears to be running, false if not.
+ * On non-Windows builds the check is a one-byte read() from
+ * postmaster_alive_fds[POSTMASTER_FD_WATCH]: EAGAIN/EWOULDBLOCK means
+ * "alive"; any other read() error, or actual data in the pipe, is reported
+ * with elog(FATAL). On Windows the check is a zero-timeout
+ * WaitForSingleObject() on PostmasterHandle, where WAIT_TIMEOUT means
+ * "alive".
+ */
+bool
+PostmasterIsAliveInternal(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ /*
+ * Reset the flag before checking, so that we don't miss a signal if
+ * postmaster dies right after the check. If postmaster was indeed dead,
+ * we'll re-arm it before returning to caller.
+ */
+ postmaster_possibly_dead = false;
+#endif
+
+#ifndef WIN32
+ {
+ char c;
+ ssize_t rc;
+
+ rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1);
+
+ /*
+ * In the usual case, the postmaster is still alive, and there is no
+ * data in the pipe.
+ */
+ if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK))
+ return true;
+ else
+ {
+ /*
+ * Postmaster is dead, or something went wrong with the read()
+ * call.
+ */
+
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true; /* re-arm, as promised above */
+#endif
+
+ if (rc < 0)
+ elog(FATAL, "read on postmaster death monitoring pipe failed: %m");
+ else if (rc > 0)
+ elog(FATAL, "unexpected data in postmaster death monitoring pipe");
+
+ return false; /* only rc == 0 gets here, assuming elog(FATAL) does not return */
+ }
+ }
+
+#else /* WIN32 */
+ if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT)
+ return true;
+ else
+ {
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ postmaster_possibly_dead = true; /* re-arm, as promised above */
+#endif
+ return false;
+ }
+#endif /* WIN32 */
+}
+
+/*
+ * PostmasterDeathSignalInit - request signal on postmaster death if possible
+ *
+ * Does nothing unless USE_POSTMASTER_DEATH_SIGNAL is defined. Otherwise it
+ * installs postmaster_death_handler for POSTMASTER_DEATH_SIGNAL, requests
+ * delivery of that signal on parent exit through prctl(PR_SET_PDEATHSIG) or
+ * procctl(PROC_PDEATHSIG_CTL), whichever the platform defines, and finally
+ * sets postmaster_possibly_dead. A failure of the request is reported with
+ * elog(ERROR).
+ */
+void
+PostmasterDeathSignalInit(void)
+{
+#ifdef USE_POSTMASTER_DEATH_SIGNAL
+ int signum = POSTMASTER_DEATH_SIGNAL; /* a variable, since procctl() is passed its address */
+
+ /* Register our signal handler. */
+ pqsignal(signum, postmaster_death_handler);
+
+ /* Request a signal on parent exit. */
+#if defined(PR_SET_PDEATHSIG)
+ if (prctl(PR_SET_PDEATHSIG, signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#elif defined(PROC_PDEATHSIG_CTL)
+ if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0)
+ elog(ERROR, "could not request parent death signal: %m");
+#else
+#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal"
+#endif
+
+ /*
+ * Just in case the parent was gone already and we missed it, we'd better
+ * check the slow way on the first call.
+ */
+ postmaster_possibly_dead = true;
+#endif /* USE_POSTMASTER_DEATH_SIGNAL */
+}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
new file mode 100644
index 0000000..755f842
--- /dev/null
+++ b/src/backend/storage/ipc/procarray.c
@@ -0,0 +1,5220 @@
+/*-------------------------------------------------------------------------
+ *
+ * procarray.c
+ * POSTGRES process array code.
+ *
+ *
+ * This module maintains arrays of PGPROC substructures, as well as associated
+ * arrays in ProcGlobal, for all active backends. Although there are several
+ * uses for this, the principal one is as a means of determining the set of
+ * currently running transactions.
+ *
+ * Because of various subtle race conditions it is critical that a backend
+ * hold the correct locks while setting or clearing its xid (in
+ * ProcGlobal->xids[]/MyProc->xid). See notes in
+ * src/backend/access/transam/README.
+ *
+ * The process arrays now also include structures representing prepared
+ * transactions. The xid and subxids fields of these are valid, as are the
+ * myProcLocks lists. They can be distinguished from regular backend PGPROCs
+ * at need by checking for pid == 0.
+ *
+ * During hot standby, we also keep a list of XIDs representing transactions
+ * that are known to be running on the primary (or more precisely, were running
+ * as of the current point in the WAL stream). This list is kept in the
+ * KnownAssignedXids array, and is updated by watching the sequence of
+ * arriving XIDs. This is necessary because if we leave those XIDs out of
+ * snapshots taken for standby queries, then they will appear to be already
+ * complete, leading to MVCC failures. Note that in hot standby, the PGPROC
+ * array represents standby processes, which by definition are not running
+ * transactions that have XIDs.
+ *
+ * It is perhaps possible for a backend on the primary to terminate without
+ * writing an abort record for its transaction. While that shouldn't really
+ * happen, it would tie up KnownAssignedXids indefinitely, so we protect
+ * ourselves by pruning the array when a valid list of running XIDs arrives.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procarray.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "access/clog.h"
+#include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "catalog/catalog.h"
+#include "catalog/pg_authid.h"
+#include "commands/dbcommands.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/spin.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/rel.h"
+#include "utils/snapmgr.h"
+
+#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var))))
+
+/*
+ * Our shared memory area
+ *
+ * The struct ends in the flexible pgprocnos[] array; ProcArrayShmemSize()
+ * and CreateSharedProcArray() size it for PROCARRAY_MAXPROCS entries.
+ */
+typedef struct ProcArrayStruct
+{
+ int numProcs; /* number of valid procs entries */
+ int maxProcs; /* allocated size of procs array */
+
+ /*
+ * Known assigned XIDs handling
+ */
+ int maxKnownAssignedXids; /* allocated size of array */
+ int numKnownAssignedXids; /* current # of valid entries */
+ int tailKnownAssignedXids; /* index of oldest valid element */
+ int headKnownAssignedXids; /* index of newest element, + 1 */
+ slock_t known_assigned_xids_lck; /* protects head/tail pointers */
+
+ /*
+ * Highest subxid that has been removed from KnownAssignedXids array to
+ * prevent overflow; or InvalidTransactionId if none. We track this for
+ * similar reasons to tracking overflowing cached subxids in PGPROC
+ * entries. Must hold exclusive ProcArrayLock to change this, and shared
+ * lock to read it.
+ */
+ TransactionId lastOverflowedXid;
+
+ /* oldest xmin of any replication slot */
+ TransactionId replication_slot_xmin;
+ /* oldest catalog xmin of any replication slot */
+ TransactionId replication_slot_catalog_xmin;
+
+ /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */
+ int pgprocnos[FLEXIBLE_ARRAY_MEMBER];
+} ProcArrayStruct;
+
+/*
+ * State for the GlobalVisTest* family of functions. Those functions can
+ * e.g. be used to decide if a deleted row can be removed without violating
+ * MVCC semantics: If the deleted row's xmax is not considered to be running
+ * by anyone, the row can be removed.
+ *
+ * To avoid slowing down GetSnapshotData(), we don't calculate a precise
+ * cutoff XID while building a snapshot (looking at the frequently changing
+ * xmins scales badly). Instead we compute two boundaries while building the
+ * snapshot:
+ *
+ * 1) definitely_needed, indicating that rows deleted by XIDs >=
+ * definitely_needed are definitely still visible.
+ *
+ * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can
+ * definitely be removed
+ *
+ * When testing an XID that falls in between the two (i.e. XID >= maybe_needed
+ * && XID < definitely_needed), the boundaries can be recomputed (using
+ * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than
+ * maintaining an accurate value all the time.
+ *
+ * As it is not cheap to compute accurate boundaries, we limit the number of
+ * times that happens in short succession. See GlobalVisTestShouldUpdate().
+ *
+ *
+ * There are four backend lifetime instances of this struct, optimized for
+ * different types of relations. As e.g. a normal user defined table in one
+ * database is inaccessible to backends connected to another database, a test
+ * specific to a relation can be more aggressive than a test for a shared
+ * relation. Currently we track four different states:
+ *
+ * 1) GlobalVisSharedRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in any database, nor a
+ * replication slot's xmin, nor a replication slot's catalog_xmin might
+ * still consider XID as running.
+ *
+ * 2) GlobalVisCatalogRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin, nor a replication slot's
+ * catalog_xmin might still consider XID as running.
+ *
+ * I.e. the difference to GlobalVisSharedRels is that
+ * snapshots in other databases are ignored.
+ *
+ * 3) GlobalVisDataRels, which only considers an XID's
+ * effects visible-to-everyone if neither snapshots in the current
+ * database, nor a replication slot's xmin consider XID as running.
+ *
+ * I.e. the difference to GlobalVisCatalogRels is that
+ * replication slot's catalog_xmin is not taken into account.
+ *
+ * 4) GlobalVisTempRels, which only considers the current session, as temp
+ * tables are not visible to other sessions.
+ *
+ * GlobalVisTestFor(relation) returns the appropriate state
+ * for the relation.
+ *
+ * The boundaries are FullTransactionIds instead of TransactionIds to avoid
+ * wraparound dangers. There e.g. would otherwise exist no procarray state to
+ * prevent maybe_needed to become old enough after the GetSnapshotData()
+ * call.
+ *
+ * The typedef is in the header.
+ */
+struct GlobalVisState
+{
+ /*
+ * XIDs >= definitely_needed are considered running by some backend, so
+ * rows deleted by them are definitely still visible.
+ */
+ FullTransactionId definitely_needed;
+
+ /*
+ * XIDs < maybe_needed are not considered to be running by any backend,
+ * so rows deleted by them can definitely be removed.
+ */
+ FullTransactionId maybe_needed;
+};
+
+/*
+ * Result of ComputeXidHorizons().
+ *
+ * Apart from latest_completed, which is a FullTransactionId, all members
+ * are plain 32-bit TransactionIds.
+ */
+typedef struct ComputeXidHorizonsResult
+{
+ /*
+ * The value of ShmemVariableCache->latestCompletedXid when
+ * ComputeXidHorizons() held ProcArrayLock.
+ */
+ FullTransactionId latest_completed;
+
+ /*
+ * The same for procArray->replication_slot_xmin and
+ * procArray->replication_slot_catalog_xmin.
+ */
+ TransactionId slot_xmin;
+ TransactionId slot_catalog_xmin;
+
+ /*
+ * Oldest xid that any backend might still consider running. This needs to
+ * include processes running VACUUM, in contrast to the normal visibility
+ * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when
+ * determining visibility, but doesn't care about rows above its xmin to
+ * be removed.
+ *
+ * This likely should only be needed to determine whether pg_subtrans can
+ * be truncated. It currently includes the effects of replication slots,
+ * for historical reasons. But that could likely be changed.
+ */
+ TransactionId oldest_considered_running;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in shared
+ * tables.
+ *
+ * This includes the effects of replication slots. If that's not desired,
+ * look at shared_oldest_nonremovable_raw.
+ */
+ TransactionId shared_oldest_nonremovable;
+
+ /*
+ * Oldest xid that may be necessary to retain in shared tables. This is
+ * the same as shared_oldest_nonremovable, except that it is not affected
+ * by replication slot's catalog_xmin.
+ *
+ * This is mainly useful to be able to send the catalog_xmin to upstream
+ * streaming replication servers via hot_standby_feedback, so they can
+ * apply the limit only when accessing catalog tables.
+ */
+ TransactionId shared_oldest_nonremovable_raw;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in non-shared
+ * catalog tables.
+ */
+ TransactionId catalog_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in normal user
+ * defined tables.
+ */
+ TransactionId data_oldest_nonremovable;
+
+ /*
+ * Oldest xid for which deleted tuples need to be retained in this
+ * session's temporary tables.
+ */
+ TransactionId temp_oldest_nonremovable;
+
+} ComputeXidHorizonsResult;
+
+/*
+ * Return value for GlobalVisHorizonKindForRel().
+ *
+ * There is one value per kind of relation (shared, catalog, data, temp).
+ * NOTE(review): these presumably line up one-to-one with the
+ * GlobalVisSharedRels, GlobalVisCatalogRels, GlobalVisDataRels and
+ * GlobalVisTempRels states declared in this file -- confirm in
+ * GlobalVisHorizonKindForRel().
+ */
+typedef enum GlobalVisHorizonKind
+{
+ VISHORIZON_SHARED,
+ VISHORIZON_CATALOG,
+ VISHORIZON_DATA,
+ VISHORIZON_TEMP
+} GlobalVisHorizonKind;
+
+
+static ProcArrayStruct *procArray; /* shared struct; assigned in CreateSharedProcArray() */
+
+static PGPROC *allProcs; /* presumably the PGPROC array that pgprocnos[] indexes into -- TODO confirm where it is set */
+
+/*
+ * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress()
+ */
+static TransactionId cachedXidIsNotInProgress = InvalidTransactionId;
+
+/*
+ * Bookkeeping for tracking emulated transactions in recovery
+ */
+static TransactionId *KnownAssignedXids;
+static bool *KnownAssignedXidsValid;
+static TransactionId latestObservedXid = InvalidTransactionId;
+
+/*
+ * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is
+ * the highest xid that might still be running that we don't have in
+ * KnownAssignedXids.
+ */
+static TransactionId standbySnapshotPendingXmin;
+
+/*
+ * State for visibility checks on different types of relations. See struct
+ * GlobalVisState for details. As shared, catalog, normal and temporary
+ * relations can have different horizons, one such state exists for each.
+ */
+static GlobalVisState GlobalVisSharedRels;
+static GlobalVisState GlobalVisCatalogRels;
+static GlobalVisState GlobalVisDataRels;
+static GlobalVisState GlobalVisTempRels;
+
+/*
+ * This backend's RecentXmin at the last time the accurate xmin horizon was
+ * recomputed, or InvalidTransactionId if it has not. Used to limit how many
+ * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate().
+ */
+static TransactionId ComputeXidHorizonsResultLastXmin;
+
+#ifdef XIDCACHE_DEBUG
+
+/* counters for XidCache measurement */
+static long xc_by_recent_xmin = 0;
+static long xc_by_known_xact = 0;
+static long xc_by_my_xact = 0;
+static long xc_by_latest_xid = 0;
+static long xc_by_main_xid = 0;
+static long xc_by_child_xid = 0;
+static long xc_by_known_assigned = 0;
+static long xc_no_overflow = 0;
+static long xc_slow_answer = 0;
+
+#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++)
+#define xc_by_known_xact_inc() (xc_by_known_xact++)
+#define xc_by_my_xact_inc() (xc_by_my_xact++)
+#define xc_by_latest_xid_inc() (xc_by_latest_xid++)
+#define xc_by_main_xid_inc() (xc_by_main_xid++)
+#define xc_by_child_xid_inc() (xc_by_child_xid++)
+#define xc_by_known_assigned_inc() (xc_by_known_assigned++)
+#define xc_no_overflow_inc() (xc_no_overflow++)
+#define xc_slow_answer_inc() (xc_slow_answer++)
+
+static void DisplayXidCache(void);
+#else /* !XIDCACHE_DEBUG */
+
+#define xc_by_recent_xmin_inc() ((void) 0)
+#define xc_by_known_xact_inc() ((void) 0)
+#define xc_by_my_xact_inc() ((void) 0)
+#define xc_by_latest_xid_inc() ((void) 0)
+#define xc_by_main_xid_inc() ((void) 0)
+#define xc_by_child_xid_inc() ((void) 0)
+#define xc_by_known_assigned_inc() ((void) 0)
+#define xc_no_overflow_inc() ((void) 0)
+#define xc_slow_answer_inc() ((void) 0)
+#endif /* XIDCACHE_DEBUG */
+
+static VirtualTransactionId *GetVirtualXIDsDelayingChkptGuts(int *nvxids,
+ int type);
+static bool HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids,
+ int nvxids, int type);
+
+/*
+ * Primitives for KnownAssignedXids array handling for standby.
+ *
+ * The KnownAssignedXids and KnownAssignedXidsValid shared arrays themselves
+ * are created in CreateSharedProcArray(), and only when EnableHotStandby is
+ * set.
+ */
+static void KnownAssignedXidsCompress(bool force);
+static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock);
+static bool KnownAssignedXidsSearch(TransactionId xid, bool remove);
+static bool KnownAssignedXidExists(TransactionId xid);
+static void KnownAssignedXidsRemove(TransactionId xid);
+static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+ TransactionId *subxids);
+static void KnownAssignedXidsRemovePreceding(TransactionId xid);
+static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax);
+static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray,
+ TransactionId *xmin,
+ TransactionId xmax);
+static TransactionId KnownAssignedXidsGetOldestXmin(void);
+static void KnownAssignedXidsDisplay(int trace_level);
+static void KnownAssignedXidsReset(void);
+static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid);
+static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid);
+static void MaintainLatestCompletedXid(TransactionId latestXid); /* not in recovery */
+static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); /* WAL replay */
+
+static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel,
+ TransactionId xid);
+static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons);
+
+/*
+ * ProcArrayShmemSize -- shared-memory space needed by CreateSharedProcArray.
+ *
+ * The result always covers the ProcArrayStruct header plus one int of
+ * pgprocnos[] for each of PROCARRAY_MAXPROCS possible entries. When
+ * EnableHotStandby is set it additionally covers a TransactionId array and a
+ * bool array of TOTAL_MAX_CACHED_SUBXIDS elements each.
+ *
+ * The two sizing macros are defined inside this function but remain visible
+ * to the code that follows it in the file.
+ */
+Size
+ProcArrayShmemSize(void)
+{
+    Size total;
+
+    /* Upper bound on ProcArray entries: backends plus prepared xacts */
+#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts)
+
+    /*
+     * Hot Standby processing uses a shared-memory data structure called
+     * KnownAssignedXids. Backends also build local structures during
+     * GetSnapshotData(), TransactionIdIsInProgress() and
+     * GetRunningTransactionData(). Because whole structures may be copied
+     * around, all of the main ones must have the same size; that common
+     * size is TOTAL_MAX_CACHED_SUBXIDS.
+     *
+     * It would be preferable to reserve this space only when hot standby is
+     * really in use for this run, but that is not yet known at the time
+     * shared memory is laid out.
+     */
+#define TOTAL_MAX_CACHED_SUBXIDS \
+ ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS)
+
+    /* The ProcArray structure itself: header followed by pgprocnos[] */
+    total = add_size(offsetof(ProcArrayStruct, pgprocnos),
+                     mul_size(sizeof(int), PROCARRAY_MAXPROCS));
+
+    if (!EnableHotStandby)
+        return total;
+
+    /* One TransactionId slot and one bool slot per cached xid */
+    total = add_size(total,
+                     mul_size(sizeof(TransactionId), TOTAL_MAX_CACHED_SUBXIDS));
+    total = add_size(total,
+                     mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS));
+
+    return total;
+}
+
+/*
+ * CreateSharedProcArray -- set up the shared PGPROC array during postmaster
+ * startup.
+ *
+ * Creates the "Proc Array" shared structure, or attaches to it if it already
+ * exists; only a newly created structure is initialized. The file-level
+ * allProcs pointer is refreshed in either case. With EnableHotStandby set,
+ * the "KnownAssignedXids" and "KnownAssignedXidsValid" arrays are created or
+ * attached as well.
+ */
+void
+CreateSharedProcArray(void)
+{
+    bool already_there;
+
+    procArray = (ProcArrayStruct *)
+        ShmemInitStruct("Proc Array",
+                        add_size(offsetof(ProcArrayStruct, pgprocnos),
+                                 mul_size(sizeof(int), PROCARRAY_MAXPROCS)),
+                        &already_there);
+
+    if (!already_there)
+    {
+        /* We created it, so start from an empty array. */
+        procArray->maxProcs = PROCARRAY_MAXPROCS;
+        procArray->numProcs = 0;
+
+        /* KnownAssignedXids bookkeeping */
+        procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS;
+        procArray->numKnownAssignedXids = 0;
+        procArray->headKnownAssignedXids = 0;
+        procArray->tailKnownAssignedXids = 0;
+        procArray->lastOverflowedXid = InvalidTransactionId;
+        SpinLockInit(&procArray->known_assigned_xids_lck);
+
+        /* No replication slot xmin is known yet */
+        procArray->replication_slot_xmin = InvalidTransactionId;
+        procArray->replication_slot_catalog_xmin = InvalidTransactionId;
+
+        ShmemVariableCache->xactCompletionCount = 1;
+    }
+
+    allProcs = ProcGlobal->allProcs;
+
+    /* The remaining arrays are only needed for hot standby */
+    if (!EnableHotStandby)
+        return;
+
+    KnownAssignedXids = (TransactionId *)
+        ShmemInitStruct("KnownAssignedXids",
+                        mul_size(sizeof(TransactionId),
+                                 TOTAL_MAX_CACHED_SUBXIDS),
+                        &already_there);
+    KnownAssignedXidsValid = (bool *)
+        ShmemInitStruct("KnownAssignedXidsValid",
+                        mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS),
+                        &already_there);
+}
+
+/*
+ * Add the specified PGPROC to the shared array.
+ *
+ * While holding ProcArrayLock and XidGenLock exclusively, this inserts
+ * proc->pgprocno into procArray->pgprocnos[] in front of the first existing
+ * entry that is greater than it, moving later entries up by one. The
+ * ProcGlobal->xids[], ->subxidStates[] and ->statusFlags[] arrays are shifted
+ * by the same amount, and the new slot is filled from the PGPROC's own xid,
+ * subxidStatus and statusFlags. proc->pgxactoff is set to the chosen slot and
+ * the pgxactoff of every entry that moved is updated to its new index.
+ *
+ * Reports FATAL with ERRCODE_TOO_MANY_CONNECTIONS if numProcs has already
+ * reached maxProcs.
+ */
+void
+ProcArrayAdd(PGPROC *proc)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index; /* insertion slot, later reused as loop cursor */
+ int movecount; /* number of entries at or after the insertion slot */
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ if (arrayP->numProcs >= arrayP->maxProcs)
+ {
+ /*
+ * Oops, no room. (This really shouldn't happen, since there is a
+ * fixed supply of PGPROC structs too, and so we should have failed
+ * earlier.)
+ */
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+
+ /*
+ * Keep the procs array sorted by (PGPROC *) so that we can utilize
+ * locality of references much better. This is useful while traversing the
+ * ProcArray because there is an increased likelihood of finding the next
+ * PGPROC structure in the cache.
+ *
+ * Since the occurrence of adding/removing a proc is much lower than the
+ * access to the ProcArray itself, the overhead should be marginal.
+ *
+ * The comparison below is done on pgprocno values. If no existing entry
+ * is greater, the loop ends with index == numProcs and the new entry is
+ * appended.
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index);
+
+ /* If we have found our right position in the array, break */
+ if (arrayP->pgprocnos[index] > proc->pgprocno)
+ break;
+ }
+
+ movecount = arrayP->numProcs - index;
+ memmove(&arrayP->pgprocnos[index + 1],
+ &arrayP->pgprocnos[index],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[index + 1],
+ &ProcGlobal->xids[index],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[index + 1],
+ &ProcGlobal->subxidStates[index],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[index + 1],
+ &ProcGlobal->statusFlags[index],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[index] = proc->pgprocno; /* fill the freed-up slot */
+ proc->pgxactoff = index;
+ ProcGlobal->xids[index] = proc->xid;
+ ProcGlobal->subxidStates[index] = proc->subxidStatus;
+ ProcGlobal->statusFlags[index] = proc->statusFlags;
+
+ arrayP->numProcs++;
+
+ /* adjust pgxactoff for all following PGPROCs */
+ index++;
+ for (; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff == index - 1);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * Remove the specified PGPROC from the shared array.
+ *
+ * When latestXid is a valid XID, we are removing a live 2PC gxact from the
+ * array, and thus causing it to appear as "not running" anymore. In this
+ * case we must advance latestCompletedXid. (This is essentially the same
+ * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take
+ * the ProcArrayLock only once, and don't damage the content of the PGPROC;
+ * twophase.c depends on the latter.)
+ *
+ * When latestXid is invalid, the entry is expected to have no XID left in
+ * ProcGlobal->xids[] (checked by assertion only).
+ *
+ * In both cases the entry at proc->pgxactoff is squeezed out of
+ * procArray->pgprocnos[] and of the ProcGlobal->xids[], ->subxidStates[] and
+ * ->statusFlags[] arrays, numProcs is decremented, and pgxactoff of every
+ * entry that moved down is updated. ProcArrayLock and XidGenLock are held
+ * exclusively for the duration.
+ */
+void
+ProcArrayRemove(PGPROC *proc, TransactionId latestXid)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int myoff; /* proc's index into pgprocnos[] and the ProcGlobal arrays */
+ int movecount; /* number of entries located after myoff */
+
+#ifdef XIDCACHE_DEBUG
+ /* dump stats at backend shutdown, but not prepared-xact end */
+ if (proc->pid != 0)
+ DisplayXidCache();
+#endif
+
+ /* See ProcGlobal comment explaining why both locks are held */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
+
+ myoff = proc->pgxactoff;
+
+ Assert(myoff >= 0 && myoff < arrayP->numProcs);
+ Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff);
+
+ if (TransactionIdIsValid(latestXid))
+ {
+ Assert(TransactionIdIsValid(ProcGlobal->xids[myoff]));
+
+ /* Advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* Same with xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ ProcGlobal->xids[myoff] = InvalidTransactionId;
+ ProcGlobal->subxidStates[myoff].overflowed = false;
+ ProcGlobal->subxidStates[myoff].count = 0;
+ }
+ else
+ {
+ /* Shouldn't be trying to remove a live transaction here */
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ }
+
+ Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff]));
+ Assert(ProcGlobal->subxidStates[myoff].count == 0);
+ Assert(ProcGlobal->subxidStates[myoff].overflowed == false);
+
+ ProcGlobal->statusFlags[myoff] = 0;
+
+ /*
+ * Keep the PGPROC array sorted. See notes above. Every entry after
+ * myoff is moved down by one position in all four arrays.
+ */
+ movecount = arrayP->numProcs - myoff - 1;
+ memmove(&arrayP->pgprocnos[myoff],
+ &arrayP->pgprocnos[myoff + 1],
+ movecount * sizeof(*arrayP->pgprocnos));
+ memmove(&ProcGlobal->xids[myoff],
+ &ProcGlobal->xids[myoff + 1],
+ movecount * sizeof(*ProcGlobal->xids));
+ memmove(&ProcGlobal->subxidStates[myoff],
+ &ProcGlobal->subxidStates[myoff + 1],
+ movecount * sizeof(*ProcGlobal->subxidStates));
+ memmove(&ProcGlobal->statusFlags[myoff],
+ &ProcGlobal->statusFlags[myoff + 1],
+ movecount * sizeof(*ProcGlobal->statusFlags));
+
+ arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */
+ arrayP->numProcs--;
+
+ /*
+ * Adjust pgxactoff of following procs for removed PGPROC (note that
+ * numProcs already has been decremented).
+ */
+ for (int index = myoff; index < arrayP->numProcs; index++)
+ {
+ int procno = arrayP->pgprocnos[index];
+
+ Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS));
+ Assert(allProcs[procno].pgxactoff - 1 == index);
+
+ allProcs[procno].pgxactoff = index;
+ }
+
+ /*
+ * Release in reversed acquisition order, to reduce frequency of having to
+ * wait for XidGenLock while holding ProcArrayLock.
+ */
+ LWLockRelease(XidGenLock);
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * ProcArrayEndTransaction -- mark a transaction as no longer running
+ *
+ * Serves both commit and abort. By the time this is called the outcome must
+ * already have been reported to WAL and pg_xact.
+ *
+ * proc is currently always MyProc; it is passed explicitly for flexibility.
+ * latestXid is the latest Xid among the transaction's main XID and its
+ * subtransactions, or InvalidTransactionId if the transaction has no XID.
+ * The caller has to supply it because the subxid information stored in the
+ * PGPROC might be incomplete.
+ */
+void
+ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid)
+{
+    if (!TransactionIdIsValid(latestXid))
+    {
+        /*
+         * No XID was assigned, so no lock is needed for the basic cleanup:
+         * nobody else's snapshot calculation is affected. Their estimate of
+         * global xmin might change, but that's OK.
+         */
+        Assert(!TransactionIdIsValid(proc->xid));
+        Assert(proc->subxidStatus.count == 0);
+        Assert(!proc->subxidStatus.overflowed);
+
+        proc->lxid = InvalidLocalTransactionId;
+        proc->xmin = InvalidTransactionId;
+
+        /* make sure these get cleared on abort */
+        proc->delayChkpt = false;
+        proc->delayChkptEnd = false;
+
+        proc->recoveryConflictPending = false;
+
+        /*
+         * The vacuum-state flags must be cleared together with xid/xmin.
+         * Skip the locked update when none of them is set, to avoid
+         * unnecessarily dirtying shared cachelines.
+         */
+        if ((proc->statusFlags & PROC_VACUUM_STATE_MASK) == 0)
+            return;
+
+        Assert(!LWLockHeldByMe(ProcArrayLock));
+        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+        Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]);
+        proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+        ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags;
+        LWLockRelease(ProcArrayLock);
+        return;
+    }
+
+    /*
+     * We have an XID to retire. ProcArrayLock must be held while clearing
+     * our advertised XID, so that we do not exit the set of "running"
+     * transactions while someone else is taking a snapshot. See discussion
+     * in src/backend/access/transam/README.
+     */
+    Assert(TransactionIdIsValid(proc->xid));
+
+    /*
+     * If the lock is not immediately available, fall back to group XID
+     * clearing, which is more efficient under contention.
+     */
+    if (!LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE))
+    {
+        ProcArrayGroupClearXid(proc, latestXid);
+        return;
+    }
+
+    /* Got the lock right away: clear our own XID and release. */
+    ProcArrayEndTransactionInternal(proc, latestXid);
+    LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayEndTransactionInternal -- mark a write transaction as no longer
+ * running.
+ *
+ * No locking is done here: the caller must already hold ProcArrayLock in
+ * exclusive mode. Clears the XID, lxid, xmin, checkpoint-delay flags,
+ * recovery-conflict flag, vacuum-state flags and subxid cache state of proc,
+ * keeping the ProcGlobal mirror arrays in step, then advances
+ * latestCompletedXid and bumps xactCompletionCount.
+ */
+static inline void
+ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid)
+{
+    int off = proc->pgxactoff;
+
+    /*
+     * Exclusive mode is required because other processes' PGPROC entries
+     * may get changed through this routine.
+     */
+    Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
+    Assert(TransactionIdIsValid(ProcGlobal->xids[off]));
+    Assert(ProcGlobal->xids[off] == proc->xid);
+
+    ProcGlobal->xids[off] = InvalidTransactionId;
+    proc->xid = InvalidTransactionId;
+    proc->lxid = InvalidLocalTransactionId;
+    proc->xmin = InvalidTransactionId;
+
+    /* make sure these get cleared on abort */
+    proc->delayChkpt = false;
+    proc->delayChkptEnd = false;
+
+    proc->recoveryConflictPending = false;
+
+    /*
+     * The vacuum-state flags must go away together with xid/xmin; only
+     * write when something is set, to avoid dirtying shared cachelines.
+     */
+    if ((proc->statusFlags & PROC_VACUUM_STATE_MASK) != 0)
+    {
+        proc->statusFlags &= ~PROC_VACUUM_STATE_MASK;
+        ProcGlobal->statusFlags[off] = proc->statusFlags;
+    }
+
+    /* Reset the subtransaction-XID cache as well, still under the lock */
+    Assert(ProcGlobal->subxidStates[off].count == proc->subxidStatus.count &&
+           ProcGlobal->subxidStates[off].overflowed == proc->subxidStatus.overflowed);
+    if (proc->subxidStatus.overflowed || proc->subxidStatus.count > 0)
+    {
+        ProcGlobal->subxidStates[off].count = 0;
+        ProcGlobal->subxidStates[off].overflowed = false;
+        proc->subxidStatus.count = 0;
+        proc->subxidStatus.overflowed = false;
+    }
+
+    /* Global latestCompletedXid is advanced under the same lock ... */
+    MaintainLatestCompletedXid(latestXid);
+
+    /* ... and so is xactCompletionCount */
+    ShmemVariableCache->xactCompletionCount++;
+}
+
+/*
+ * ProcArrayGroupClearXid -- group XID clearing
+ *
+ * When we cannot immediately acquire ProcArrayLock in exclusive mode at
+ * commit time, add ourselves to a list of processes that need their XIDs
+ * cleared. The first process to add itself to the list will acquire
+ * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal
+ * on behalf of all group members. This avoids a great deal of contention
+ * around ProcArrayLock when many processes are trying to commit at once,
+ * since the lock need not be repeatedly handed off from one committing
+ * process to the next.
+ *
+ * proc is the process whose XID is to be cleared; latestXid is stored in
+ * proc->procArrayGroupMemberXid and later handed to
+ * ProcArrayEndTransactionInternal for it. A follower returns once its
+ * procArrayGroupMember flag has been reset by the leader. The leader
+ * returns after it has processed the whole list, released ProcArrayLock and
+ * signalled the semaphore of every list member other than MyProc.
+ */
+static void
+ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid)
+{
+ PROC_HDR *procglobal = ProcGlobal;
+ uint32 nextidx; /* list head seen when we pushed; later the walk cursor */
+ uint32 wakeidx; /* cursor for the wakeup pass */
+
+ /* We should definitely have an XID to clear. */
+ Assert(TransactionIdIsValid(proc->xid));
+
+ /* Add ourselves to the list of processes needing a group XID clear. */
+ proc->procArrayGroupMember = true;
+ proc->procArrayGroupMemberXid = latestXid;
+ nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst);
+ while (true) /* retry until we are installed as the new list head */
+ {
+ pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx);
+
+ if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst,
+ &nextidx,
+ (uint32) proc->pgprocno))
+ break;
+ }
+
+ /*
+ * If the list was not empty, the leader will clear our XID. It is
+ * impossible to have followers without a leader because the first process
+ * that has added itself to the list will always have nextidx as
+ * INVALID_PGPROCNO.
+ */
+ if (nextidx != INVALID_PGPROCNO)
+ {
+ int extraWaits = 0; /* semaphore wakeups consumed while still a member */
+
+ /* Sleep until the leader clears our XID. */
+ pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE);
+ for (;;)
+ {
+ /* acts as a read barrier */
+ PGSemaphoreLock(proc->sem);
+ if (!proc->procArrayGroupMember)
+ break;
+ extraWaits++;
+ }
+ pgstat_report_wait_end();
+
+ Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO);
+
+ /* Fix semaphore count for any absorbed wakeups */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+ return;
+ }
+
+ /* We are the leader. Acquire the lock on behalf of everyone. */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Now that we've got the lock, clear the list of processes waiting for
+ * group XID clearing, saving a pointer to the head of the list. Trying
+ * to pop elements one at a time could lead to an ABA problem.
+ */
+ nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst,
+ INVALID_PGPROCNO);
+
+ /* Remember head of list so we can perform wakeups after dropping lock. */
+ wakeidx = nextidx;
+
+ /* Walk the list and clear all XIDs. */
+ while (nextidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[nextidx];
+
+ ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid);
+
+ /* Move to next proc in list. */
+ nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ }
+
+ /* We're done with the lock now. */
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * Now that we've released the lock, go back and wake everybody up. We
+ * don't do this under the lock so as to keep lock hold times to a
+ * minimum. The system calls we need to perform to wake other processes
+ * up are probably much slower than the simple memory writes we did while
+ * holding the lock.
+ *
+ * Each member's next-link is read before it is reset, and its
+ * procArrayGroupMember flag is cleared only after the write barrier.
+ */
+ while (wakeidx != INVALID_PGPROCNO)
+ {
+ PGPROC *nextproc = &allProcs[wakeidx];
+
+ wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext);
+ pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO);
+
+ /* ensure all previous writes are visible before follower continues. */
+ pg_write_barrier();
+
+ nextproc->procArrayGroupMember = false;
+
+ if (nextproc != MyProc) /* the leader itself is not sleeping */
+ PGSemaphoreUnlock(nextproc->sem);
+ }
+}
+
+/*
+ * ProcArrayClearTransaction -- clear the transaction fields
+ *
+ * Called after a 2-phase transaction has been prepared successfully. The
+ * transaction's XID is not being reported as finished here --- it keeps
+ * showing up as running, because the 2PC's gxact is in the ProcArray too.
+ * All that is needed is to wipe our own PGPROC.
+ */
+void
+ProcArrayClearTransaction(PGPROC *proc)
+{
+    int off;
+
+    /*
+     * For now ProcArrayLock is taken exclusively, because
+     * xactCompletionCount is incremented below. At least shared mode would
+     * be needed anyway so that pgproc->pgxactoff cannot change under us.
+     *
+     * Since this action does not actually alter anyone's view of the set of
+     * running XIDs (our entry duplicates the gxact that is already in the
+     * ProcArray), shared mode would suffice if xactCompletionCount were an
+     * atomic variable. That does not seem worth it currently: a 2PC commit
+     * is heavyweight enough for this not to be the bottleneck. Should it
+     * ever become one, combining this with the subsequent ProcArrayRemove()
+     * may be worth considering too.
+     */
+    LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+    off = proc->pgxactoff;
+
+    ProcGlobal->xids[off] = InvalidTransactionId;
+    proc->xid = InvalidTransactionId;
+
+    proc->lxid = InvalidLocalTransactionId;
+    proc->xmin = InvalidTransactionId;
+    proc->recoveryConflictPending = false;
+
+    Assert((proc->statusFlags & PROC_VACUUM_STATE_MASK) == 0);
+    Assert(!proc->delayChkpt);
+
+    /*
+     * The completion count has to be bumped although the transaction has
+     * not really committed yet. GetSnapshotData() leaves out the current
+     * transaction's own xid, so without the bump a snapshot could be reused
+     * later; that would be bad, because such a snapshot might not count the
+     * prepared transaction as running.
+     */
+    ShmemVariableCache->xactCompletionCount++;
+
+    /* Reset the subtransaction-XID cache as well */
+    Assert(ProcGlobal->subxidStates[off].count == proc->subxidStatus.count &&
+           ProcGlobal->subxidStates[off].overflowed == proc->subxidStatus.overflowed);
+    if (proc->subxidStatus.overflowed || proc->subxidStatus.count > 0)
+    {
+        ProcGlobal->subxidStates[off].count = 0;
+        ProcGlobal->subxidStates[off].overflowed = false;
+        proc->subxidStatus.count = 0;
+        proc->subxidStatus.overflowed = false;
+    }
+
+    LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * MaintainLatestCompletedXid -- advance
+ * ShmemVariableCache->latestCompletedXid to latestXid, unless the stored
+ * value is already at or past it.
+ *
+ * Not for use during recovery; ProcArrayLock must be held by the caller
+ * (both are checked by assertion).
+ */
+static void
+MaintainLatestCompletedXid(TransactionId latestXid)
+{
+    FullTransactionId prev_latest = ShmemVariableCache->latestCompletedXid;
+    TransactionId prev_xid;
+
+    Assert(FullTransactionIdIsValid(prev_latest));
+    Assert(!RecoveryInProgress());
+    Assert(LWLockHeldByMe(ProcArrayLock));
+
+    prev_xid = XidFromFullTransactionId(prev_latest);
+
+    /* Only ever move forward */
+    if (TransactionIdPrecedes(prev_xid, latestXid))
+        ShmemVariableCache->latestCompletedXid =
+            FullXidRelativeTo(prev_latest, latestXid);
+
+    Assert(IsBootstrapProcessingMode() ||
+           FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
+
+/*
+ * MaintainLatestCompletedXidRecovery -- counterpart of
+ * MaintainLatestCompletedXid for use during WAL replay.
+ *
+ * Unlike the non-recovery variant, the stored latestCompletedXid may still
+ * be invalid here, in which case it is simply set from latestXid.
+ */
+static void
+MaintainLatestCompletedXidRecovery(TransactionId latestXid)
+{
+    FullTransactionId prev_latest = ShmemVariableCache->latestCompletedXid;
+    FullTransactionId reference;
+    bool must_advance;
+
+    Assert(AmStartupProcess() || !IsUnderPostmaster);
+    Assert(LWLockHeldByMe(ProcArrayLock));
+
+    /*
+     * latestXid has to be widened against some FullTransactionId. In
+     * recovery latestCompletedXid can't be relied on to be initialized, but
+     * the startup process may safely read nextXid without a lock.
+     */
+    reference = ShmemVariableCache->nextXid;
+    Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+    must_advance = !FullTransactionIdIsValid(prev_latest) ||
+        TransactionIdPrecedes(XidFromFullTransactionId(prev_latest), latestXid);
+
+    if (must_advance)
+        ShmemVariableCache->latestCompletedXid =
+            FullXidRelativeTo(reference, latestXid);
+
+    Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid));
+}
+
+/*
+ * ProcArrayInitRecovery -- initialize recovery xid mgmt environment
+ *
+ * Records how far the startup process has initialized the CLOG and subtrans,
+ * so that they can be extended gaplessly from there as far as necessary
+ * while in recovery.
+ */
+void
+ProcArrayInitRecovery(TransactionId initializedUptoXID)
+{
+    TransactionId lastInitialized = initializedUptoXID;
+
+    Assert(standbyState == STANDBY_INITIALIZED);
+    Assert(TransactionIdIsNormal(initializedUptoXID));
+
+    /*
+     * latestObservedXid becomes the xid just before the one SUBTRANS has
+     * been initialized up to. Extension then continues from that point in
+     * RecordKnownAssignedTransactionIds, and when we get consistent in
+     * ProcArrayApplyRecoveryInfo().
+     */
+    TransactionIdRetreat(lastInitialized);
+    latestObservedXid = lastInitialized;
+}
+
+/*
+ * ProcArrayApplyRecoveryInfo -- apply recovery info about xids
+ *
+ * Takes us through 3 states: Initialized, Pending and Ready.
+ * Normal case is to go all the way to Ready straight away, though there
+ * are atypical cases where we need to take it in steps.
+ *
+ * Use the data about running transactions on the primary to create the initial
+ * state of KnownAssignedXids. We also use these records to regularly prune
+ * KnownAssignedXids because we know it is possible that some transactions
+ * with FATAL errors fail to write abort records, which could cause eventual
+ * overflow.
+ *
+ * From 'running' this function reads xids[] (xcnt + subxcnt entries),
+ * nextXid, oldestRunningXid, latestCompletedXid and subxid_overflow; the
+ * xids[] array is copied, not modified. Unless it returns early, it leaves
+ * standbyState at STANDBY_SNAPSHOT_PENDING when running->subxid_overflow is
+ * set and at STANDBY_SNAPSHOT_READY otherwise.
+ *
+ * Raises ERROR if there are xids to add but KnownAssignedXids is not empty.
+ *
+ * See comments for LogStandbySnapshot().
+ */
+void
+ProcArrayApplyRecoveryInfo(RunningTransactions running)
+{
+ TransactionId *xids; /* palloc'd working copy of the still-running xids */
+ int nxids; /* number of entries used in xids[] */
+ int i;
+
+ Assert(standbyState >= STANDBY_INITIALIZED);
+ Assert(TransactionIdIsValid(running->nextXid));
+ Assert(TransactionIdIsValid(running->oldestRunningXid));
+ Assert(TransactionIdIsNormal(running->latestCompletedXid));
+
+ /*
+ * Remove stale transactions, if any.
+ */
+ ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid);
+
+ /*
+ * Remove stale locks, if any.
+ */
+ StandbyReleaseOldLocks(running->oldestRunningXid);
+
+ /*
+ * If our snapshot is already valid, nothing else to do...
+ */
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ return;
+
+ /*
+ * If our initial RunningTransactionsData had an overflowed snapshot then
+ * we knew we were missing some subxids from our snapshot. If we continue
+ * to see overflowed snapshots then we might never be able to start up, so
+ * we make another test to see if our snapshot is now valid. We know that
+ * the missing subxids are equal to or earlier than nextXid. After we
+ * initialise we continue to apply changes during recovery, so once the
+ * oldestRunningXid is later than the nextXid from the initial snapshot we
+ * know that we no longer have missing information and can mark the
+ * snapshot as valid.
+ */
+ if (standbyState == STANDBY_SNAPSHOT_PENDING)
+ {
+ /*
+ * If the snapshot isn't overflowed or if it's empty we can reset our
+ * pending state and use this snapshot instead.
+ */
+ if (!running->subxid_overflow || running->xcnt == 0)
+ {
+ /*
+ * If we have already collected known assigned xids, we need to
+ * throw them away before we apply the recovery snapshot.
+ */
+ KnownAssignedXidsReset();
+ standbyState = STANDBY_INITIALIZED;
+ }
+ else
+ {
+ if (TransactionIdPrecedes(standbySnapshotPendingXmin,
+ running->oldestRunningXid))
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshots are now enabled");
+ }
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+ return;
+ }
+ }
+
+ Assert(standbyState == STANDBY_INITIALIZED);
+
+ /*
+ * NB: this can be reached at least twice, so make sure new code can deal
+ * with that.
+ */
+
+ /*
+ * Nobody else is running yet, but take locks anyhow
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * KnownAssignedXids is sorted so we cannot just add the xids, we have to
+ * sort them first.
+ *
+ * Some of the new xids are top-level xids and some are subtransactions.
+ * We don't call SubTransSetParent because it doesn't matter yet. If we
+ * aren't overflowed then all xids will fit in snapshot and so we don't
+ * need subtrans. If we later overflow, an xid assignment record will add
+ * xids to subtrans. If RunningTransactionsData is overflowed then we
+ * don't have enough information to correctly update subtrans anyway.
+ */
+
+ /*
+ * Allocate a temporary array to avoid modifying the array passed as
+ * argument.
+ */
+ xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt));
+
+ /*
+ * Add to the temp array any xids which have not already completed.
+ */
+ nxids = 0;
+ for (i = 0; i < running->xcnt + running->subxcnt; i++)
+ {
+ TransactionId xid = running->xids[i];
+
+ /*
+ * The running-xacts snapshot can contain xids that were still visible
+ * in the procarray when the snapshot was taken, but were already
+ * WAL-logged as completed. They're not running anymore, so ignore
+ * them.
+ */
+ if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+ continue;
+
+ xids[nxids++] = xid;
+ }
+
+ if (nxids > 0)
+ {
+ if (procArray->numKnownAssignedXids != 0)
+ {
+ LWLockRelease(ProcArrayLock);
+ elog(ERROR, "KnownAssignedXids is not empty");
+ }
+
+ /*
+ * Sort the array so that we can add them safely into
+ * KnownAssignedXids.
+ *
+ * We have to sort them logically, because in KnownAssignedXidsAdd we
+ * call TransactionIdFollowsOrEquals and so on. But we know these XIDs
+ * come from RUNNING_XACTS, which means there are only normal XIDs from
+ * the same epoch, so this is safe.
+ */
+ qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator);
+
+ /*
+ * Add the sorted snapshot into KnownAssignedXids. The running-xacts
+ * snapshot may include duplicated xids because of prepared
+ * transactions, so ignore them. Since the array is sorted, any
+ * duplicates are adjacent and comparing with the previous entry
+ * suffices.
+ */
+ for (i = 0; i < nxids; i++)
+ {
+ if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i]))
+ {
+ elog(DEBUG1,
+ "found duplicated transaction %u for KnownAssignedXids insertion",
+ xids[i]);
+ continue;
+ }
+ KnownAssignedXidsAdd(xids[i], xids[i], true);
+ }
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ }
+
+ pfree(xids);
+
+ /*
+ * latestObservedXid is at least set to the point where SUBTRANS was
+ * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid
+ * RecordKnownAssignedTransactionIds() was called for. Initialize
+ * subtrans from thereon, up to nextXid - 1.
+ *
+ * We need to duplicate parts of RecordKnownAssignedTransactionId() here,
+ * because we've just added xids to the known assigned xids machinery that
+ * haven't gone through RecordKnownAssignedTransactionId().
+ */
+ Assert(TransactionIdIsNormal(latestObservedXid));
+ TransactionIdAdvance(latestObservedXid);
+ while (TransactionIdPrecedes(latestObservedXid, running->nextXid))
+ {
+ ExtendSUBTRANS(latestObservedXid);
+ TransactionIdAdvance(latestObservedXid);
+ }
+ TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */
+
+ /* ----------
+ * Now we've got the running xids we need to set the global values that
+ * are used to track snapshots as they evolve further.
+ *
+ * - latestCompletedXid which will be the xmax for snapshots
+ * - lastOverflowedXid which shows whether snapshots overflow
+ * - nextXid
+ *
+ * If the snapshot overflowed, then we still initialise with what we know,
+ * but the recovery snapshot isn't fully valid yet because we know there
+ * are some subxids missing. We don't know the specific subxids that are
+ * missing, so conservatively assume the last one is latestObservedXid.
+ * ----------
+ */
+ if (running->subxid_overflow)
+ {
+ standbyState = STANDBY_SNAPSHOT_PENDING;
+
+ standbySnapshotPendingXmin = latestObservedXid;
+ procArray->lastOverflowedXid = latestObservedXid;
+ }
+ else
+ {
+ standbyState = STANDBY_SNAPSHOT_READY;
+
+ standbySnapshotPendingXmin = InvalidTransactionId;
+ }
+
+ /*
+ * If a transaction wrote a commit record in the gap between taking and
+ * logging the snapshot then latestCompletedXid may already be higher than
+ * the value from the snapshot, so check before we use the incoming value.
+ * It also might not yet be set at all.
+ */
+ MaintainLatestCompletedXidRecovery(running->latestCompletedXid);
+
+ /*
+ * NB: No need to increment ShmemVariableCache->xactCompletionCount here,
+ * nobody can see it yet.
+ */
+
+ LWLockRelease(ProcArrayLock);
+
+ /* ShmemVariableCache->nextXid must be beyond any observed xid. */
+ AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+
+ Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid));
+
+ KnownAssignedXidsDisplay(trace_recovery(DEBUG3));
+ if (standbyState == STANDBY_SNAPSHOT_READY)
+ elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled");
+ else
+ elog(trace_recovery(DEBUG1),
+ "recovery snapshot waiting for non-overflowed snapshot or "
+ "until oldest active xid on standby is at least %u (now %u)",
+ standbySnapshotPendingXmin,
+ running->oldestRunningXid);
+}
+
+/*
+ * ProcArrayApplyXidAssignment
+ * Process an XLOG_XACT_ASSIGNMENT WAL record
+ *
+ * topxid is the top-level transaction; subxids[0..nsubxids-1] are the
+ * subtransactions assigned to it. All of them are marked as observed and
+ * every subxid gets topxid recorded as its parent in pg_subtrans. Once
+ * KnownAssignedXids is being maintained, the subxids are also removed from
+ * it and lastOverflowedXid is advanced to cover them.
+ */
+void
+ProcArrayApplyXidAssignment(TransactionId topxid,
+                            int nsubxids, TransactionId *subxids)
+{
+    TransactionId newest_xid;
+
+    Assert(standbyState >= STANDBY_INITIALIZED);
+
+    newest_xid = TransactionIdLatest(topxid, nsubxids, subxids);
+
+    /*
+     * Mark all the subtransactions as observed.
+     *
+     * NOTE: this fails if the record holds more previously unobserved xids
+     * than fit into known-assigned-xids. As the code stands that shouldn't
+     * happen, because xid-assignment records should never contain more than
+     * PGPROC_MAX_CACHED_SUBXIDS entries.
+     */
+    RecordKnownAssignedTransactionIds(newest_xid);
+
+    /*
+     * pg_subtrans is updated with the top-level xid rather than the direct
+     * parent xid. This differs from normal processing but is still correct
+     * in all cases: subtransaction commit is not marked in clog until commit
+     * processing, so all aborted subtransactions have already been clearly
+     * marked in clog. We can therefore point straight at the top-level
+     * transaction's state instead of stepping through every intermediate
+     * state in the subtransaction tree. This should be the first time
+     * SubTransSetParent() is attempted for these xids.
+     */
+    for (int k = 0; k < nsubxids; k++)
+        SubTransSetParent(subxids[k], topxid);
+
+    /* Until KnownAssignedXids is maintained there is nothing more to do */
+    if (standbyState != STANDBY_INITIALIZED)
+    {
+        /* Same locking as transaction commit */
+        LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+        /* Take the subxids out of known-assigned-xacts */
+        KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids);
+
+        /* lastOverflowedXid must be at least the last of these subxids */
+        if (TransactionIdPrecedes(procArray->lastOverflowedXid, newest_xid))
+            procArray->lastOverflowedXid = newest_xid;
+
+        LWLockRelease(ProcArrayLock);
+    }
+}
+
+/*
+ * TransactionIdIsInProgress -- is given transaction running in some backend
+ *
+ * Returns true if xid must be treated as in progress, false otherwise.
+ *
+ * Aside from some shortcuts such as checking RecentXmin and our own Xid,
+ * there are four possibilities for finding a running transaction:
+ *
+ * 1. The given Xid is a main transaction Id. We will find this out cheaply
+ * by looking at ProcGlobal->xids.
+ *
+ * 2. The given Xid is one of the cached subxact Xids in the PGPROC array.
+ * We can find this out cheaply too.
+ *
+ * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see
+ * if the Xid is running on the primary.
+ *
+ * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see
+ * if that is running according to ProcGlobal->xids[] or KnownAssignedXids.
+ * This is the slowest way, but sadly it has to be done always if the others
+ * failed, unless we see that the cached subxact sets are complete (none have
+ * overflowed).
+ *
+ * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids
+ * while doing 1 and 3, we can release the ProcArrayLock while we do 4.
+ * This buys back some concurrency (and we can't retrieve the main Xids from
+ * ProcGlobal->xids[] again anyway; see GetNewTransactionId).
+ */
+bool
+TransactionIdIsInProgress(TransactionId xid)
+{
+ /* workspace for step 4; allocated once and kept for the process lifetime */
+ static TransactionId *xids = NULL;
+ /* re-fetched from ProcGlobal on every call, so no static storage needed */
+ TransactionId *other_xids;
+ XidCacheStatus *other_subxidstates;
+ int nxids = 0;
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId topxid;
+ TransactionId latestCompletedXid;
+ int mypgxactoff;
+ int numProcs;
+ int j;
+
+ /*
+ * Don't bother checking a transaction older than RecentXmin; it could not
+ * possibly still be running. (Note: in particular, this guarantees that
+ * we reject InvalidTransactionId, FrozenTransactionId, etc as not
+ * running.)
+ */
+ if (TransactionIdPrecedes(xid, RecentXmin))
+ {
+ xc_by_recent_xmin_inc();
+ return false;
+ }
+
+ /*
+ * We may have just checked the status of this transaction, so if it is
+ * already known to be completed, we can fall out without any access to
+ * shared memory.
+ */
+ if (TransactionIdEquals(cachedXidIsNotInProgress, xid))
+ {
+ xc_by_known_xact_inc();
+ return false;
+ }
+
+ /*
+ * Also, we can handle our own transaction (and subtransactions) without
+ * any access to shared memory.
+ */
+ if (TransactionIdIsCurrentTransactionId(xid))
+ {
+ xc_by_my_xact_inc();
+ return true;
+ }
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead.
+ */
+ if (xids == NULL)
+ {
+ /*
+ * In hot standby mode, reserve enough space to hold all xids in the
+ * known-assigned list. If we later finish recovery, we no longer need
+ * the bigger array, but we don't bother to shrink it.
+ */
+ int maxxids = RecoveryInProgress() ? TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs;
+
+ xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId));
+ if (xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ other_xids = ProcGlobal->xids;
+ other_subxidstates = ProcGlobal->subxidStates;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ /*
+ * Now that we have the lock, we can check latestCompletedXid; if the
+ * target Xid is after that, it's surely still running.
+ */
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ if (TransactionIdPrecedes(latestCompletedXid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_latest_xid_inc();
+ return true;
+ }
+
+ /* No shortcuts, gotta grovel through the array */
+ mypgxactoff = MyProc->pgxactoff;
+ numProcs = arrayP->numProcs;
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ int pgprocno;
+ PGPROC *proc;
+ TransactionId pxid;
+ int pxids;
+
+ /* Ignore ourselves --- dealt with it above */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+
+ if (!TransactionIdIsValid(pxid))
+ continue;
+
+ /*
+ * Step 1: check the main Xid
+ */
+ if (TransactionIdEquals(pxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_main_xid_inc();
+ return true;
+ }
+
+ /*
+ * We can ignore main Xids that are younger than the target Xid, since
+ * the target could not possibly be their child.
+ */
+ if (TransactionIdPrecedes(xid, pxid))
+ continue;
+
+ /*
+ * Step 2: check the cached child-Xids arrays
+ */
+ pxids = other_subxidstates[pgxactoff].count;
+ pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */
+ pgprocno = arrayP->pgprocnos[pgxactoff];
+ proc = &allProcs[pgprocno];
+ for (j = pxids - 1; j >= 0; j--)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]);
+
+ if (TransactionIdEquals(cxid, xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_child_xid_inc();
+ return true;
+ }
+ }
+
+ /*
+ * Save the main Xid for step 4. We only need to remember main Xids
+ * that have uncached children. (Note: there is no race condition
+ * here because the overflowed flag cannot be cleared, only set, while
+ * we hold ProcArrayLock. So we can't miss an Xid that we need to
+ * worry about.)
+ */
+ if (other_subxidstates[pgxactoff].overflowed)
+ xids[nxids++] = pxid;
+ }
+
+ /*
+ * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs
+ * in the list must be treated as running.
+ */
+ if (RecoveryInProgress())
+ {
+ /* none of the PGPROC entries should have XIDs in hot standby mode */
+ Assert(nxids == 0);
+
+ if (KnownAssignedXidExists(xid))
+ {
+ LWLockRelease(ProcArrayLock);
+ xc_by_known_assigned_inc();
+ return true;
+ }
+
+ /*
+ * If the KnownAssignedXids overflowed, we have to check pg_subtrans
+ * too. Fetch all xids from KnownAssignedXids that are lower than
+ * xid, since if xid is a subtransaction its parent will always have a
+ * lower value. Note we will collect both main and subXIDs here, but
+ * there's no help for it.
+ */
+ if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid))
+ nxids = KnownAssignedXidsGet(xids, xid);
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * If none of the relevant caches overflowed, we know the Xid is not
+ * running without even looking at pg_subtrans.
+ */
+ if (nxids == 0)
+ {
+ xc_no_overflow_inc();
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * Step 4: have to check pg_subtrans.
+ *
+ * At this point, we know it's either a subtransaction of one of the Xids
+ * in xids[], or it's not running. If it's an already-failed
+ * subtransaction, we want to say "not running" even though its parent may
+ * still be running. So first, check pg_xact to see if it's been aborted.
+ */
+ xc_slow_answer_inc();
+
+ if (TransactionIdDidAbort(xid))
+ {
+ cachedXidIsNotInProgress = xid;
+ return false;
+ }
+
+ /*
+ * It isn't aborted, so check whether the transaction tree it belongs to
+ * is still running (or, more precisely, whether it was running when we
+ * held ProcArrayLock).
+ */
+ topxid = SubTransGetTopmostTransaction(xid);
+ Assert(TransactionIdIsValid(topxid));
+ if (!TransactionIdEquals(topxid, xid))
+ {
+ for (int i = 0; i < nxids; i++)
+ {
+ if (TransactionIdEquals(xids[i], topxid))
+ return true;
+ }
+ }
+
+ cachedXidIsNotInProgress = xid;
+ return false;
+}
+
+/*
+ * TransactionIdIsActive -- is xid the top-level XID of an active backend?
+ *
+ * Unlike TransactionIdIsInProgress, prepared transactions are ignored, and
+ * so are transactions running on the primary when we're in hot standby.
+ * Subtransactions are ignored as well, since no current caller needs them.
+ */
+bool
+TransactionIdIsActive(TransactionId xid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	TransactionId *xid_array = ProcGlobal->xids;
+	bool		found = false;
+
+	/*
+	 * Anything older than RecentXmin could not possibly still be running,
+	 * so don't bother looking.
+	 */
+	if (TransactionIdPrecedes(xid, RecentXmin))
+		return false;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int slot = 0; slot < arrayP->numProcs && !found; slot++)
+	{
+		/* Fetch xid just once - see GetNewTransactionId */
+		TransactionId slot_xid = UINT32_ACCESS_ONCE(xid_array[slot]);
+
+		/*
+		 * A match needs a valid XID, an owning PGPROC with a nonzero pid
+		 * (pid == 0 means a prepared transaction), and the XID we were
+		 * asked about.
+		 */
+		if (TransactionIdIsValid(slot_xid) &&
+			allProcs[arrayP->pgprocnos[slot]].pid != 0 &&
+			TransactionIdEquals(slot_xid, xid))
+			found = true;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return found;
+}
+
+
+/*
+ * Determine XID horizons.
+ *
+ * This is used by wrapper functions like GetOldestNonRemovableTransactionId()
+ * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as
+ * well as "internally" by GlobalVisUpdate() (see comment above struct
+ * GlobalVisState).
+ *
+ * See the definition of ComputeXidHorizonsResult for the various computed
+ * horizons.
+ *
+ * For VACUUM separate horizons (used to decide which deleted tuples must
+ * be preserved), for shared and non-shared tables are computed. For shared
+ * relations backends in all databases must be considered, but for non-shared
+ * relations that's not required, since only backends in my own database could
+ * ever see the tuples in them. Also, we can ignore concurrently running lazy
+ * VACUUMs because (a) they must be working on other tables, and (b) they
+ * don't need to do snapshot-based lookups.
+ *
+ * This also computes a horizon used to truncate pg_subtrans. For that
+ * backends in all databases have to be considered, and concurrently running
+ * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans
+ * accesses.
+ *
+ * Note: we include all currently running xids in the set of considered xids.
+ * This ensures that if a just-started xact has not yet set its snapshot,
+ * when it does set the snapshot it cannot set xmin less than what we compute.
+ * See notes in src/backend/access/transam/README.
+ *
+ * Note: despite the above, it's possible for the calculated values to move
+ * backwards on repeated calls. The calculated values are conservative, so
+ * that anything older is definitely not considered as running by anyone
+ * anymore, but the exact values calculated depend on a number of things. For
+ * example, if there are no transactions running in the current database, the
+ * horizon for normal tables will be latestCompletedXid. If a transaction
+ * begins after that, its xmin will include in-progress transactions in other
+ * databases that started earlier, so another call will return a lower value.
+ * Nonetheless it is safe to vacuum a table in the current database with the
+ * first result. There are also replication-related effects: a walsender
+ * process can set its xmin based on transactions that are no longer running
+ * on the primary but are still being replayed on the standby, thus possibly
+ * making the values go backwards. In this case there is a possibility that
+ * we lose data that the standby would like to have, but unless the standby
+ * uses a replication slot to make its xmin persistent there is little we can
+ * do about that --- data is only protected if the walsender runs continuously
+ * while queries are executed on the standby. (The Hot Standby code deals
+ * with such cases by failing standby queries that needed to access
+ * already-removed data, so there's no integrity bug.) The computed values
+ * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting
+ * on the fly is another easy way to make horizons move backwards, with no
+ * consequences for data integrity.
+ *
+ * Note: the approximate horizons (see definition of GlobalVisState) are
+ * updated by the computations done here. That's currently required for
+ * correctness and a small optimization. Without doing so it's possible that
+ * heap vacuum's call to heap_page_prune() uses a more conservative horizon
+ * than later when deciding which tuples can be removed - which the code
+ * doesn't expect (breaking HOT).
+ */
+static void
+ComputeXidHorizons(ComputeXidHorizonsResult *h)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId kaxmin;
+ bool in_recovery = RecoveryInProgress();
+ TransactionId *other_xids = ProcGlobal->xids;
+
+ /* inferred after ProcArrayLock is released */
+ h->catalog_oldest_nonremovable = InvalidTransactionId;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ h->latest_completed = ShmemVariableCache->latestCompletedXid;
+
+ /*
+ * We initialize the MIN() calculation with latestCompletedXid + 1. This
+ * is a lower bound for the XIDs that might appear in the ProcArray later,
+ * and so protects us against overestimating the result due to future
+ * additions.
+ */
+ {
+ TransactionId initial;
+
+ initial = XidFromFullTransactionId(h->latest_completed);
+ Assert(TransactionIdIsValid(initial));
+ TransactionIdAdvance(initial);
+
+ h->oldest_considered_running = initial;
+ h->shared_oldest_nonremovable = initial;
+ h->data_oldest_nonremovable = initial;
+
+ /*
+ * Only modifications made by this backend affect the horizon for
+ * temporary relations. Instead of a check in each iteration of the
+ * loop over all PGPROCs it is cheaper to just initialize to the
+ * current top-level xid anyway.
+ *
+ * Without an assigned xid we could use a horizon as aggressive as
+ * ReadNewTransactionid(), but we can get away with the much cheaper
+ * latestCompletedXid + 1: If this backend has no xid there can't, by
+ * definition, be any newer changes in the temp table than
+ * latestCompletedXid.
+ */
+ if (TransactionIdIsValid(MyProc->xid))
+ h->temp_oldest_nonremovable = MyProc->xid;
+ else
+ h->temp_oldest_nonremovable = initial;
+ }
+
+ /*
+ * Fetch slot horizons while ProcArrayLock is held - the
+ * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside
+ * the lock.
+ */
+ h->slot_xmin = procArray->replication_slot_xmin;
+ h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ for (int index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+ TransactionId xid;
+ TransactionId xmin;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+ xmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * Consider both the transaction's Xmin, and its Xid.
+ *
+ * We must check both because a transaction might have an Xmin but not
+ * (yet) an Xid; conversely, if it has an Xid, that could determine
+ * some not-yet-set Xmin.
+ */
+ xmin = TransactionIdOlder(xmin, xid);
+
+ /* if neither is set, this proc doesn't influence the horizon */
+ if (!TransactionIdIsValid(xmin))
+ continue;
+
+ /*
+ * Don't ignore any procs when determining which transactions might be
+ * considered running. While slots should ensure logical decoding
+ * backends are protected even without this check, it can't hurt to
+ * include them here as well.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, xmin);
+
+ /*
+ * Skip over backends either vacuuming (which is ok with rows being
+ * removed, as long as pg_subtrans is not truncated) or doing logical
+ * decoding (which manages xmin separately, check below).
+ */
+ if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING))
+ continue;
+
+ /* shared tables need to take backends in all databases into account */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, xmin);
+
+ /*
+ * Normally queries in other databases are ignored for anything but
+ * the shared horizon. But in recovery we cannot compute an accurate
+ * per-database horizon as all xids are managed via the
+ * KnownAssignedXids machinery.
+ *
+ * Be careful to compute a pessimistic value when MyDatabaseId is not
+ * set. If this is a backend in the process of starting up, we may not
+ * use a "too aggressive" horizon (otherwise we could end up using it
+ * to prune still needed data away). If the current backend never
+ * connects to a database that is harmless, because
+ * data_oldest_nonremovable will never be utilized.
+ */
+ if (in_recovery ||
+ MyDatabaseId == InvalidOid || proc->databaseId == MyDatabaseId ||
+ proc->databaseId == 0) /* always include WalSender */
+ {
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, xmin);
+ }
+ }
+
+ /*
+ * If in recovery fetch oldest xid in KnownAssignedXids, will be applied
+ * after lock is released.
+ */
+ if (in_recovery)
+ kaxmin = KnownAssignedXidsGetOldestXmin();
+
+ /*
+ * No other information from shared state is needed, release the lock
+ * immediately. The rest of the computations can be done without a lock.
+ */
+ LWLockRelease(ProcArrayLock);
+
+ if (in_recovery)
+ {
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running, kaxmin);
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, kaxmin);
+ /* temp relations cannot be accessed in recovery */
+ }
+ else
+ {
+ /*
+ * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age.
+ *
+ * vacuum_defer_cleanup_age provides some additional "slop" for the
+ * benefit of hot standby queries on standby servers. This is quick
+ * and dirty, and perhaps not all that useful unless the primary has a
+ * predictable transaction rate, but it offers some protection when
+ * there's no walsender connection. Note that we are assuming
+ * vacuum_defer_cleanup_age isn't large enough to cause wraparound ---
+ * so guc.c should limit it to no more than the xidStopLimit threshold
+ * in varsup.c. Also note that we intentionally don't apply
+ * vacuum_defer_cleanup_age on standby servers.
+ */
+ h->oldest_considered_running =
+ TransactionIdRetreatedBy(h->oldest_considered_running,
+ vacuum_defer_cleanup_age);
+ h->shared_oldest_nonremovable =
+ TransactionIdRetreatedBy(h->shared_oldest_nonremovable,
+ vacuum_defer_cleanup_age);
+ h->data_oldest_nonremovable =
+ TransactionIdRetreatedBy(h->data_oldest_nonremovable,
+ vacuum_defer_cleanup_age);
+ /* defer doesn't apply to temp relations */
+ }
+
+ /*
+ * Check whether there are replication slots requiring an older xmin.
+ */
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin);
+ h->data_oldest_nonremovable =
+ TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin);
+
+ /*
+ * The only difference between catalog / data horizons is that the slot's
+ * catalog xmin is applied to the catalog one (so catalogs can be accessed
+ * for logical decoding). Initialize with data horizon, and then back up
+ * further if necessary. Have to back up the shared horizon as well, since
+ * that also can contain catalogs.
+ */
+ h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable;
+ h->shared_oldest_nonremovable =
+ TransactionIdOlder(h->shared_oldest_nonremovable,
+ h->slot_catalog_xmin);
+ h->catalog_oldest_nonremovable = h->data_oldest_nonremovable;
+ h->catalog_oldest_nonremovable =
+ TransactionIdOlder(h->catalog_oldest_nonremovable,
+ h->slot_catalog_xmin);
+
+ /*
+ * It's possible that slots / vacuum_defer_cleanup_age backed up the
+ * horizons further than oldest_considered_running. Fix.
+ */
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->shared_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable);
+ h->oldest_considered_running =
+ TransactionIdOlder(h->oldest_considered_running,
+ h->data_oldest_nonremovable);
+
+ /*
+ * shared horizons have to be at least as old as the oldest visible in
+ * current db
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable,
+ h->catalog_oldest_nonremovable));
+
+ /*
+ * Horizons need to ensure that pg_subtrans access is still possible for
+ * the relevant backends.
+ */
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->shared_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->catalog_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->data_oldest_nonremovable));
+ Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->temp_oldest_nonremovable));
+ Assert(!TransactionIdIsValid(h->slot_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_xmin));
+ Assert(!TransactionIdIsValid(h->slot_catalog_xmin) ||
+ TransactionIdPrecedesOrEquals(h->oldest_considered_running,
+ h->slot_catalog_xmin));
+
+ /* update approximate horizons with the computed horizons */
+ GlobalVisUpdateApply(h);
+}
+
+/*
+ * Pick the kind of visibility horizon that applies to a relation.  A NULL
+ * rel selects the most conservative horizon.
+ */
+static inline GlobalVisHorizonKind
+GlobalVisHorizonKindForRel(Relation rel)
+{
+	/*
+	 * Other relkinds currently don't contain xids, nor always the necessary
+	 * logical decoding markers.
+	 */
+	Assert(rel == NULL ||
+		   rel->rd_rel->relkind == RELKIND_RELATION ||
+		   rel->rd_rel->relkind == RELKIND_MATVIEW ||
+		   rel->rd_rel->relkind == RELKIND_TOASTVALUE);
+
+	/* no relation, a shared relation, or recovery: shared horizon */
+	if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress())
+		return VISHORIZON_SHARED;
+
+	/* catalogs and relations accessible in logical decoding */
+	if (IsCatalogRelation(rel) || RelationIsAccessibleInLogicalDecoding(rel))
+		return VISHORIZON_CATALOG;
+
+	/* what remains is either a backend-local relation or ordinary data */
+	return RELATION_IS_LOCAL(rel) ? VISHORIZON_TEMP : VISHORIZON_DATA;
+}
+
+/*
+ * Return the oldest XID for which deleted tuples must be preserved in the
+ * passed table.
+ *
+ * If rel is not NULL the horizon may be considerably more recent than
+ * otherwise (i.e. more tuples will be removable).  In the NULL case a
+ * horizon that is correct (but not optimal) for all relations is returned.
+ *
+ * VACUUM uses this to decide which deleted tuples it has to keep in the
+ * passed in table.
+ */
+TransactionId
+GetOldestNonRemovableTransactionId(Relation rel)
+{
+	ComputeXidHorizonsResult computed;
+	TransactionId result = InvalidTransactionId;
+
+	ComputeXidHorizons(&computed);
+
+	/* select the horizon that matches the relation's kind */
+	switch (GlobalVisHorizonKindForRel(rel))
+	{
+		case VISHORIZON_SHARED:
+			result = computed.shared_oldest_nonremovable;
+			break;
+		case VISHORIZON_CATALOG:
+			result = computed.catalog_oldest_nonremovable;
+			break;
+		case VISHORIZON_DATA:
+			result = computed.data_oldest_nonremovable;
+			break;
+		case VISHORIZON_TEMP:
+			result = computed.temp_oldest_nonremovable;
+			break;
+	}
+
+	/* stays InvalidTransactionId if no case above matched */
+	return result;
+}
+
+/*
+ * Return the oldest transaction id that any currently running backend might
+ * still consider running.  Don't use this for visibility / pruning
+ * determinations (see GetOldestNonRemovableTransactionId()); it is meant for
+ * decisions such as how far pg_subtrans can be truncated.
+ */
+TransactionId
+GetOldestTransactionIdConsideredRunning(void)
+{
+	ComputeXidHorizonsResult computed;
+	TransactionId oldest_running;
+
+	ComputeXidHorizons(&computed);
+	oldest_running = computed.oldest_considered_running;
+
+	return oldest_running;
+}
+
+/*
+ * Return the visibility horizons for a hot standby feedback message.
+ *
+ * *xmin receives the shared horizon as it was before the replication slots'
+ * catalog_xmin was applied; *catalog_xmin receives the slot catalog xmin.
+ */
+void
+GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin)
+{
+	ComputeXidHorizonsResult computed;
+
+	ComputeXidHorizons(&computed);
+
+	/*
+	 * shared_oldest_nonremovable is deliberately not used here, because it
+	 * includes the effect of the replication slots' catalog_xmin.  The
+	 * catalog horizon is sent as separate feedback, so the primary can
+	 * remove data table contents more aggressively.
+	 */
+	*xmin = computed.shared_oldest_nonremovable_raw;
+	*catalog_xmin = computed.slot_catalog_xmin;
+}
+
+/*
+ * GetMaxSnapshotXidCount -- largest size a snapshot's XID array may need
+ *
+ * Exported because snapmgr.c uses it.
+ */
+int
+GetMaxSnapshotXidCount(void)
+{
+	int			max_xids = procArray->maxProcs;
+
+	return max_xids;
+}
+
+/*
+ * GetMaxSnapshotSubxidCount -- largest size a snapshot's sub-XID array may
+ * need
+ *
+ * Exported because snapmgr.c uses it.
+ */
+int
+GetMaxSnapshotSubxidCount(void)
+{
+	int			max_subxids = TOTAL_MAX_CACHED_SUBXIDS;
+
+	return max_subxids;
+}
+
+/*
+ * Initialize old_snapshot_threshold specific parts of a newly built snapshot.
+ */
+static void
+GetSnapshotDataInitOldSnapshot(Snapshot snapshot)
+{
+	if (OldSnapshotThresholdActive())
+	{
+		/*
+		 * Capture the current time and WAL stream location in case this
+		 * snapshot becomes old enough to need to fall back on the special
+		 * "old snapshot" logic.
+		 */
+		snapshot->lsn = GetXLogInsertRecPtr();
+		snapshot->whenTaken = GetSnapshotCurrentTimestamp();
+		MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin);
+		return;
+	}
+
+	/*
+	 * The "snapshot too old" feature is not in use: fill the related fields
+	 * with dummy values that don't require any locking.
+	 */
+	snapshot->lsn = InvalidXLogRecPtr;
+	snapshot->whenTaken = 0;
+}
+
+/*
+ * Helper for GetSnapshotData(): check whether the bulk of the visibility
+ * information in the snapshot is still valid.  If it is, refresh the fields
+ * that need to change and return true; otherwise return false.
+ *
+ * The caller must hold ProcArrayLock.  This very likely can be evolved to
+ * not need the lock (at very least in the case we already hold a snapshot),
+ * but that's for another day.
+ */
+static bool
+GetSnapshotDataReuse(Snapshot snapshot)
+{
+	Assert(LWLockHeldByMe(ProcArrayLock));
+
+	/* a stored completion count of zero is never reusable */
+	if (unlikely(snapshot->snapXactCompletionCount == 0))
+		return false;
+
+	/* the completion count has moved on since the snapshot was built */
+	if (snapshot->snapXactCompletionCount !=
+		ShmemVariableCache->xactCompletionCount)
+		return false;
+
+	/*
+	 * The current xactCompletionCount is still what it was when the snapshot
+	 * was built, so rebuilding the snapshot the hard way would produce the
+	 * same contents:
+	 *
+	 * As explained in transam/README, the set of xids considered running by
+	 * GetSnapshotData() cannot change while ProcArrayLock is held.  Snapshot
+	 * contents only depend on transactions with xids, and xactCompletionCount
+	 * is incremented whenever a transaction with an xid finishes (while
+	 * holding ProcArrayLock exclusively).  Thus the xactCompletionCount check
+	 * ensures we would detect if the snapshot would have changed.
+	 *
+	 * Because the contents are unchanged, it is safe to re-enter the
+	 * snapshot's xmin into the PGPROC array.  None of the rows visible under
+	 * the snapshot could already have been removed (that'd require the set
+	 * of running transactions to change), and it fulfills the requirement
+	 * that concurrent GetSnapshotData() calls yield the same xmin.
+	 */
+	if (!TransactionIdIsValid(MyProc->xmin))
+		MyProc->xmin = TransactionXmin = snapshot->xmin;
+
+	RecentXmin = snapshot->xmin;
+	Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+	/* reset the per-use fields of the snapshot */
+	snapshot->curcid = GetCurrentCommandId(false);
+	snapshot->active_count = 0;
+	snapshot->regd_count = 0;
+	snapshot->copied = false;
+
+	GetSnapshotDataInitOldSnapshot(snapshot);
+
+	return true;
+}
+
+/*
+ * GetSnapshotData -- returns information about running transactions.
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (highest completed xact ID + 1), and a list of running xact IDs
+ * in the range xmin <= xid < xmax. It is used as follows:
+ * All xact IDs < xmin are considered finished.
+ * All xact IDs >= xmax are considered still running.
+ * For an xact ID xmin <= xid < xmax, consult list to see whether
+ * it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
+ *
+ * All running top-level XIDs are included in the snapshot, except for lazy
+ * VACUUM processes. We also try to include running subtransaction XIDs,
+ * but since PGPROC has only a limited cache area for subxact XIDs, full
+ * information may not be available. If we find any overflowed subxid arrays,
+ * we have to mark the snapshot's subxid data as overflowed, and extra work
+ * *may* need to be done to determine what's running (see XidInMVCCSnapshot()
+ * in heapam_visibility.c).
+ *
+ * We also update the following backend-global variables:
+ * TransactionXmin: the oldest xmin of any snapshot in use in the
+ * current transaction (this is the same as MyProc->xmin).
+ * RecentXmin: the xmin computed for the most recent snapshot. XIDs
+ * older than this are known not running any more.
+ *
+ * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels
+ * for the benefit of the GlobalVisTest* family of functions.
+ *
+ * Note: this function should probably not be called with an argument that's
+ * not statically allocated (see xip allocation below).
+ */
+Snapshot
+GetSnapshotData(Snapshot snapshot)
+{
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ TransactionId xmin;
+ TransactionId xmax;
+ int count = 0;
+ int subcount = 0;
+ bool suboverflowed = false;
+ FullTransactionId latest_completed;
+ TransactionId oldestxid;
+ int mypgxactoff;
+ TransactionId myxid;
+ uint64 curXactCompletionCount;
+
+ TransactionId replication_slot_xmin = InvalidTransactionId;
+ TransactionId replication_slot_catalog_xmin = InvalidTransactionId;
+
+ Assert(snapshot != NULL);
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * This does open a possibility for avoiding repeated malloc/free: since
+ * maxProcs does not change at runtime, we can simply reuse the previous
+ * xip arrays if any. (This relies on the fact that all callers pass
+ * static SnapshotData structs.)
+ */
+ if (snapshot->xip == NULL)
+ {
+ /*
+ * First call for this snapshot. Snapshot is same size whether or not
+ * we are in recovery, see later comments.
+ */
+ snapshot->xip = (TransactionId *)
+ malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId));
+ if (snapshot->xip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ Assert(snapshot->subxip == NULL);
+ snapshot->subxip = (TransactionId *)
+ malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId));
+ if (snapshot->subxip == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ /*
+ * It is sufficient to get shared lock on ProcArrayLock, even if we are
+ * going to set MyProc->xmin.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ if (GetSnapshotDataReuse(snapshot))
+ {
+ LWLockRelease(ProcArrayLock);
+ return snapshot;
+ }
+
+ latest_completed = ShmemVariableCache->latestCompletedXid;
+ mypgxactoff = MyProc->pgxactoff;
+ myxid = other_xids[mypgxactoff];
+ Assert(myxid == MyProc->xid);
+
+ oldestxid = ShmemVariableCache->oldestXid;
+ curXactCompletionCount = ShmemVariableCache->xactCompletionCount;
+
+ /* xmax is always latestCompletedXid + 1 */
+ xmax = XidFromFullTransactionId(latest_completed);
+ TransactionIdAdvance(xmax);
+ Assert(TransactionIdIsNormal(xmax));
+
+ /* initialize xmin calculation with xmax */
+ xmin = xmax;
+
+ /* take own xid into account, saves a check inside the loop */
+ if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin))
+ xmin = myxid;
+
+ snapshot->takenDuringRecovery = RecoveryInProgress();
+
+ if (!snapshot->takenDuringRecovery)
+ {
+ int numProcs = arrayP->numProcs;
+ TransactionId *xip = snapshot->xip;
+ int *pgprocnos = arrayP->pgprocnos;
+ XidCacheStatus *subxidStates = ProcGlobal->subxidStates;
+ uint8 *allStatusFlags = ProcGlobal->statusFlags;
+
+ /*
+ * First collect set of pgxactoff/xids that need to be included in the
+ * snapshot.
+ */
+ for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++)
+ {
+ /* Fetch xid just once - see GetNewTransactionId */
+ TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]);
+ uint8 statusFlags;
+
+ Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff);
+
+ /*
+ * If the transaction has no XID assigned, we can skip it; it
+ * won't have sub-XIDs either.
+ */
+ if (likely(xid == InvalidTransactionId))
+ continue;
+
+ /*
+ * We don't include our own XIDs (if any) in the snapshot. It
+ * needs to be included in the xmin computation, but we did so
+ * outside the loop.
+ */
+ if (pgxactoff == mypgxactoff)
+ continue;
+
+ /*
+ * The only way we are able to get here with a non-normal xid is
+ * during bootstrap - with this backend using
+ * BootstrapTransactionId. But the above test should filter that
+ * out.
+ */
+ Assert(TransactionIdIsNormal(xid));
+
+ /*
+ * If the XID is >= xmax, we can skip it; such transactions will
+ * be treated as running anyway (and any sub-XIDs will also be >=
+ * xmax).
+ */
+ if (!NormalTransactionIdPrecedes(xid, xmax))
+ continue;
+
+ /*
+ * Skip over backends doing logical decoding which manages xmin
+ * separately (check below) and ones running LAZY VACUUM.
+ */
+ statusFlags = allStatusFlags[pgxactoff];
+ if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM))
+ continue;
+
+ if (NormalTransactionIdPrecedes(xid, xmin))
+ xmin = xid;
+
+ /* Add XID to snapshot. */
+ xip[count++] = xid;
+
+ /*
+ * Save subtransaction XIDs if possible (if we've already
+ * overflowed, there's no point). Note that the subxact XIDs must
+ * be later than their parent, so no need to check them against
+ * xmin. We could filter against xmax, but it seems better not to
+ * do that much work while holding the ProcArrayLock.
+ *
+ * The other backend can add more subxids concurrently, but cannot
+ * remove any. Hence it's important to fetch nxids just once.
+ * Should be safe to use memcpy, though. (We needn't worry about
+ * missing any xids added concurrently, because they must postdate
+ * xmax.)
+ *
+ * Again, our own XIDs are not included in the snapshot.
+ */
+ if (!suboverflowed)
+ {
+
+ if (subxidStates[pgxactoff].overflowed)
+ suboverflowed = true;
+ else
+ {
+ int nsubxids = subxidStates[pgxactoff].count;
+
+ if (nsubxids > 0)
+ {
+ int pgprocno = pgprocnos[pgxactoff];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(snapshot->subxip + subcount,
+ (void *) proc->subxids.xids,
+ nsubxids * sizeof(TransactionId));
+ subcount += nsubxids;
+ }
+ }
+ }
+ }
+ }
+ else
+ {
+ /*
+ * We're in hot standby, so get XIDs from KnownAssignedXids.
+ *
+ * We store all xids directly into subxip[]. Here's why:
+ *
+ * In recovery we don't know which xids are top-level and which are
+ * subxacts, a design choice that greatly simplifies xid processing.
+ *
+ * It seems like we would want to try to put xids into xip[] only, but
+ * that is fairly small. We would either need to make that bigger or
+ * to increase the rate at which we WAL-log xid assignment; neither is
+ * an appealing choice.
+ *
+ * We could try to store xids into xip[] first and then into subxip[]
+ * if there are too many xids. That only works if the snapshot doesn't
+ * overflow because we do not search subxip[] in that case. A simpler
+ * way is to just store all xids in the subxact array because this is
+ * by far the bigger array. We just leave the xip array empty.
+ *
+ * Either way we need to change the way XidInMVCCSnapshot() works
+ * depending upon when the snapshot was taken, or change normal
+ * snapshot processing so it matches.
+ *
+ * Note: It is possible for recovery to end before we finish taking
+ * the snapshot, and for newly assigned transaction ids to be added to
+ * the ProcArray. xmax cannot change while we hold ProcArrayLock, so
+ * those newly added transaction ids would be filtered away, so we
+ * need not be concerned about them.
+ */
+ subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin,
+ xmax);
+
+ if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid))
+ suboverflowed = true;
+ }
+
+
+ /*
+ * Fetch into local variable while ProcArrayLock is held - the
+ * LWLockRelease below is a barrier, ensuring this happens inside the
+ * lock.
+ */
+ replication_slot_xmin = procArray->replication_slot_xmin;
+ replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+
+ if (!TransactionIdIsValid(MyProc->xmin))
+ MyProc->xmin = TransactionXmin = xmin;
+
+ LWLockRelease(ProcArrayLock);
+
+ /* maintain state for GlobalVis* */
+ {
+ TransactionId def_vis_xid;
+ TransactionId def_vis_xid_data;
+ FullTransactionId def_vis_fxid;
+ FullTransactionId def_vis_fxid_data;
+ FullTransactionId oldestfxid;
+
+ /*
+ * Converting oldestXid is only safe when xid horizon cannot advance,
+ * i.e. holding locks. While we don't hold the lock anymore, all the
+ * necessary data has been gathered with lock held.
+ */
+ oldestfxid = FullXidRelativeTo(latest_completed, oldestxid);
+
+ /* apply vacuum_defer_cleanup_age */
+ def_vis_xid_data =
+ TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age);
+
+ /* Check whether there's a replication slot requiring an older xmin. */
+ def_vis_xid_data =
+ TransactionIdOlder(def_vis_xid_data, replication_slot_xmin);
+
+ /*
+ * Rows in non-shared, non-catalog tables possibly could be vacuumed
+ * if older than this xid.
+ */
+ def_vis_xid = def_vis_xid_data;
+
+ /*
+ * Check whether there's a replication slot requiring an older catalog
+ * xmin.
+ */
+ def_vis_xid =
+ TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid);
+
+ def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid);
+ def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data);
+
+ /*
+ * Check if we can increase upper bound. As a previous
+ * GlobalVisUpdate() might have computed more aggressive values, don't
+ * overwrite them if so.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(def_vis_fxid_data,
+ GlobalVisDataRels.definitely_needed);
+ /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */
+ if (TransactionIdIsNormal(myxid))
+ GlobalVisTempRels.definitely_needed =
+ FullXidRelativeTo(latest_completed, myxid);
+ else
+ {
+ GlobalVisTempRels.definitely_needed = latest_completed;
+ FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed);
+ }
+
+ /*
+ * Check if we know that we can initialize or increase the lower
+ * bound. Currently the only cheap way to do so is to use
+ * ShmemVariableCache->oldestXid as input.
+ *
+ * We should definitely be able to do better. We could e.g. put a
+ * global lower bound value into ShmemVariableCache.
+ */
+ GlobalVisSharedRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ oldestfxid);
+ GlobalVisCatalogRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ oldestfxid);
+ GlobalVisDataRels.maybe_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ oldestfxid);
+ /* accurate value known */
+ GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed;
+ }
+
+ RecentXmin = xmin;
+ Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin));
+
+ snapshot->xmin = xmin;
+ snapshot->xmax = xmax;
+ snapshot->xcnt = count;
+ snapshot->subxcnt = subcount;
+ snapshot->suboverflowed = suboverflowed;
+ snapshot->snapXactCompletionCount = curXactCompletionCount;
+
+ snapshot->curcid = GetCurrentCommandId(false);
+
+ /*
+ * This is a new snapshot, so set both refcounts to zero, and mark it as
+ * not copied in persistent memory.
+ */
+ snapshot->active_count = 0;
+ snapshot->regd_count = 0;
+ snapshot->copied = false;
+
+ GetSnapshotDataInitOldSnapshot(snapshot);
+
+ return snapshot;
+}
+
+/*
+ * ProcArrayInstallImportedXmin -- adopt an imported snapshot's xmin as ours
+ *
+ * Used while installing a snapshot that was imported from another
+ * transaction.  OldestXmin must never move backwards, so the source
+ * transaction has to be verified as still running, and that verification
+ * needs to be atomic with the store into MyProc->xmin; both therefore happen
+ * while ProcArrayLock is held.
+ *
+ * Returns true if xmin was installed; false if no matching, still-running
+ * source transaction was found (or if sourcevxid is NULL).
+ */
+bool
+ProcArrayInstallImportedXmin(TransactionId xmin,
+							 VirtualTransactionId *sourcevxid)
+{
+	ProcArrayStruct *procs = procArray;
+	bool		installed = false;
+
+	Assert(TransactionIdIsNormal(xmin));
+	if (sourcevxid == NULL)
+		return false;
+
+	/* Holding the lock keeps the source xact from ending underneath us */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		PGPROC	   *candidate = &allProcs[procs->pgprocnos[slot]];
+		int			flags = ProcGlobal->statusFlags[slot];
+		TransactionId candidate_xmin;
+
+		/*
+		 * Reject, in order: procs running LAZY VACUUM; procs that are not
+		 * the requested virtual transaction; and procs attached to another
+		 * database.  The database test is pure paranoia -- an xmin from
+		 * another DB does not cover us, and the caller should already have
+		 * caught that -- so any such oddity is simply reported as
+		 * "transaction not found".
+		 */
+		if ((flags & PROC_IN_VACUUM) != 0 ||
+			candidate->backendId != sourcevxid->backendId ||
+			candidate->lxid != sourcevxid->localTransactionId ||
+			candidate->databaseId != MyDatabaseId)
+			continue;
+
+		/* Equally paranoid: insist that its xmin really does cover ours */
+		candidate_xmin = UINT32_ACCESS_ONCE(candidate->xmin);
+		if (TransactionIdIsNormal(candidate_xmin) &&
+			TransactionIdPrecedesOrEquals(candidate_xmin, xmin))
+		{
+			/*
+			 * All checks passed.  Publish the xmin and, as GetSnapshotData
+			 * does, keep TransactionXmin in step.  snapmgr.c has already
+			 * called GetSnapshotData, so a valid xmin is being overwritten
+			 * here and there is no point testing for one.
+			 */
+			MyProc->xmin = TransactionXmin = xmin;
+			installed = true;
+			break;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return installed;
+}
+
+/*
+ * ProcArrayInstallRestoredXmin -- adopt a restored snapshot's xmin as ours
+ *
+ * Variant of ProcArrayInstallImportedXmin for the case where the caller
+ * already has a pointer to the PGPROC of the transaction the snapshot came
+ * from, rather than an XID.
+ *
+ * Besides xmin, the PROC_XMIN_FLAGS bits of the source proc's statusFlags
+ * are copied into MyProc, so as to avoid the case where MyProc's xmin needs
+ * to be skipped for computing the xid horizon.
+ *
+ * Returns true if xmin was installed, false if the source transaction is no
+ * longer running.
+ */
+bool
+ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc)
+{
+	TransactionId source_xmin;
+	bool		installable;
+
+	Assert(TransactionIdIsNormal(xmin));
+	Assert(proc != NULL);
+
+	/* Exclusive mode, because statusFlags get copied from the source proc */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	/*
+	 * The source PGPROC must advertise an xmin no later than the one being
+	 * installed, otherwise the system-wide xmin could go backwards.  It must
+	 * also be in our database, otherwise the per-database xmin could.
+	 */
+	source_xmin = UINT32_ACCESS_ONCE(proc->xmin);
+	installable = (proc->databaseId == MyDatabaseId &&
+				   TransactionIdIsNormal(source_xmin) &&
+				   TransactionIdPrecedesOrEquals(source_xmin, xmin));
+
+	if (installable)
+	{
+		/*
+		 * Publish xmin, then take over the flags that change how vacuum
+		 * interprets that value, mirroring them into the dense array.
+		 */
+		MyProc->xmin = TransactionXmin = xmin;
+		MyProc->statusFlags = (proc->statusFlags & PROC_XMIN_FLAGS) |
+			(MyProc->statusFlags & ~PROC_XMIN_FLAGS);
+		ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return installable;
+}
+
+/*
+ * GetRunningTransactionData -- returns information about running transactions.
+ *
+ * Similar to GetSnapshotData but returns more information. We include
+ * all PGPROCs with an assigned TransactionId, even VACUUM processes and
+ * prepared transactions.
+ *
+ * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for
+ * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc
+ * array until the caller has WAL-logged this snapshot, and releases the
+ * lock. Acquiring ProcArrayLock ensures that no transactions commit until the
+ * lock is released.
+ *
+ * The returned data structure is statically allocated; caller should not
+ * modify it, and must not assume it is valid past the next call.
+ *
+ * This is never executed during recovery so there is no need to look at
+ * KnownAssignedXids.
+ *
+ * Dummy PGPROCs from prepared transaction are included, meaning that this
+ * may return entries with duplicated TransactionId values coming from
+ * transaction finishing to prepare. Nothing is done about duplicated
+ * entries here to not hold on ProcArrayLock more than necessary.
+ *
+ * We don't worry about updating other counters, we want to keep this as
+ * simple as possible and leave GetSnapshotData() as the primary code for
+ * that bookkeeping.
+ *
+ * Note that if any transaction has overflowed its cached subtransactions
+ * then there is no real need to include any subtransactions.
+ *
+ * Layout of the result: the xids array holds the top-level XIDs first,
+ * followed by any subxids that were collected; xcnt counts the former and
+ * subxcnt the latter. Both locks are taken in shared mode, ProcArrayLock
+ * first and then XidGenLock, and both are still held on return.
+ *
+ * Raises ERROR (out of memory) if the first-call malloc of the xids array
+ * fails; that happens before either lock is acquired.
+ */
+RunningTransactions
+GetRunningTransactionData(void)
+{
+ /* result workspace */
+ static RunningTransactionsData CurrentRunningXactsData;
+
+ ProcArrayStruct *arrayP = procArray;
+ TransactionId *other_xids = ProcGlobal->xids;
+ RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData;
+ TransactionId latestCompletedXid;
+ TransactionId oldestRunningXid;
+ TransactionId *xids;
+ int index;
+ int count;
+ int subcount;
+ bool suboverflowed;
+
+ Assert(!RecoveryInProgress());
+
+ /*
+ * Allocating space for maxProcs xids is usually overkill; numProcs would
+ * be sufficient. But it seems better to do the malloc while not holding
+ * the lock, so we can't look at numProcs. Likewise, we allocate much
+ * more subxip storage than is probably needed.
+ *
+ * Should only be allocated in bgwriter, since only ever executed during
+ * checkpoints.
+ */
+ if (CurrentRunningXacts->xids == NULL)
+ {
+ /*
+ * First call: the array is allocated once and then reused by every
+ * later call, since it hangs off the function-static result struct.
+ */
+ CurrentRunningXacts->xids = (TransactionId *)
+ malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId));
+ if (CurrentRunningXacts->xids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ xids = CurrentRunningXacts->xids;
+
+ count = subcount = 0;
+ suboverflowed = false;
+
+ /*
+ * Ensure that no xids enter or leave the procarray while we obtain
+ * snapshot.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ LWLockAcquire(XidGenLock, LW_SHARED);
+
+ latestCompletedXid =
+ XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid);
+ oldestRunningXid =
+ XidFromFullTransactionId(ShmemVariableCache->nextXid); /* starting value; lowered in the loop below */
+
+ /*
+ * Spin over procArray collecting all xids
+ */
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ TransactionId xid;
+
+ /* Fetch xid just once - see GetNewTransactionId */
+ xid = UINT32_ACCESS_ONCE(other_xids[index]);
+
+ /*
+ * We don't need to store transactions that don't have a TransactionId
+ * yet because they will not show as running on a standby server.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
+
+ /*
+ * Be careful not to exclude any xids before calculating the values of
+ * oldestRunningXid and suboverflowed, since these are used to clean
+ * up transaction information held on standbys.
+ */
+ if (TransactionIdPrecedes(xid, oldestRunningXid))
+ oldestRunningXid = xid;
+
+ if (ProcGlobal->subxidStates[index].overflowed)
+ suboverflowed = true;
+
+ /*
+ * If we wished to exclude xids this would be the right place for it.
+ * Procs with the PROC_IN_VACUUM flag set don't usually assign xids,
+ * but they do during truncation at the end when they get the lock and
+ * truncate, so it is not much of a problem to include them if they
+ * are seen and it is cleaner to include them.
+ */
+
+ xids[count++] = xid;
+ }
+
+ /*
+ * Spin over procArray collecting all subxids, but only if there hasn't
+ * been a suboverflow.
+ */
+ if (!suboverflowed)
+ {
+ XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates;
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ int nsubxids;
+
+ /*
+ * Save subtransaction XIDs. Other backends can't add or remove
+ * entries while we're holding XidGenLock.
+ */
+ nsubxids = other_subxidstates[index].count;
+ if (nsubxids > 0)
+ {
+ /* barrier not really required, as XidGenLock is held, but ... */
+ pg_read_barrier(); /* pairs with GetNewTransactionId */
+
+ memcpy(&xids[count], (void *) proc->subxids.xids,
+ nsubxids * sizeof(TransactionId)); /* appended after the XIDs stored so far */
+ count += nsubxids;
+ subcount += nsubxids;
+
+ /*
+ * Top-level XID of a transaction is always less than any of
+ * its subxids, so we don't need to check if any of the
+ * subxids are smaller than oldestRunningXid
+ */
+ }
+ }
+ }
+
+ /*
+ * It's important *not* to include the limits set by slots here because
+ * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those
+ * were to be included here the initial value could never increase because
+ * of a circular dependency where slots only increase their limits when
+ * running xacts increases oldestRunningXid and running xacts only
+ * increases if slots do.
+ */
+
+ CurrentRunningXacts->xcnt = count - subcount; /* count covers both kinds; keep top-level only */
+ CurrentRunningXacts->subxcnt = subcount;
+ CurrentRunningXacts->subxid_overflow = suboverflowed;
+ CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+ CurrentRunningXacts->oldestRunningXid = oldestRunningXid;
+ CurrentRunningXacts->latestCompletedXid = latestCompletedXid;
+
+ Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid));
+ Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid));
+ Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid));
+
+ /* We don't release the locks here, the caller is responsible for that */
+
+ return CurrentRunningXacts;
+}
+
+/*
+ * GetOldestActiveTransactionId()
+ *
+ * A cut-down relative of GetSnapshotData that yields only the oldest active
+ * XID.  Every PGPROC with an assigned TransactionId counts, VACUUM processes
+ * included, and all databases are examined; there is no need to include
+ * WALSender, since that has no effect on hot standby conflicts.
+ *
+ * Never executed during recovery, so KnownAssignedXids need not be
+ * consulted.
+ *
+ * No other counters are updated here: this is kept as simple as possible,
+ * with GetSnapshotData() remaining the primary code for that bookkeeping.
+ */
+TransactionId
+GetOldestActiveTransactionId(void)
+{
+	ProcArrayStruct *procs = procArray;
+	TransactionId *proc_xids = ProcGlobal->xids;
+	TransactionId oldest;
+
+	Assert(!RecoveryInProgress());
+
+	/*
+	 * Start from nextXid, the upper bound of what can still be active.
+	 *
+	 * The read itself is atomic; the lock is taken so that every XID below
+	 * nextXid is already in the proc array (or has already completed) by the
+	 * time the array is scanned.
+	 */
+	LWLockAcquire(XidGenLock, LW_SHARED);
+	oldest = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+	LWLockRelease(XidGenLock);
+
+	/*
+	 * Walk the proc array looking for anything older.  Subxids need no
+	 * inspection: a top-level XID is always less than any of its subxids.
+	 */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		/* Fetch xid just once - see GetNewTransactionId */
+		TransactionId candidate = UINT32_ACCESS_ONCE(proc_xids[slot]);
+
+		if (TransactionIdIsNormal(candidate) &&
+			TransactionIdPrecedes(candidate, oldest))
+			oldest = candidate;
+	}
+	LWLockRelease(ProcArrayLock);
+
+	return oldest;
+}
+
+/*
+ * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum
+ *
+ * Yields the oldest xid for which we can guarantee that vacuum has had no
+ * effect, i.e. no rows >= that xid have been vacuumed away unless the
+ * transaction aborted.  The answer can be (and most of the time will be)
+ * far more conservative than what vacuum has really touched, but no better
+ * data is currently available.
+ *
+ * The value is useful as the initial cutoff xid from which a new changeset
+ * extraction replication slot can start decoding changes.
+ *
+ * ProcArrayLock must already be held, shared or exclusive.  Most callers
+ * will want exclusive mode, since they are expected to use the xid
+ * immediately to peg the xmin horizon.
+ */
+TransactionId
+GetOldestSafeDecodingTransactionId(bool catalogOnly)
+{
+	ProcArrayStruct *procs = procArray;
+	bool		in_recovery = RecoveryInProgress();
+	TransactionId safe_xid;
+
+	Assert(LWLockHeldByMe(ProcArrayLock));
+
+	/*
+	 * Take XidGenLock so that no transaction can acquire an xid while we
+	 * run; if no transaction with an xid were running concurrently, a new
+	 * xid could influence RecentXmin et al.
+	 *
+	 * nextXid is the starting point because it is guaranteed to be a safe,
+	 * albeit pessimal, value.
+	 */
+	LWLockAcquire(XidGenLock, LW_SHARED);
+	safe_xid = XidFromFullTransactionId(ShmemVariableCache->nextXid);
+
+	/*
+	 * A slot that already pegs the xmin horizon gives a usable starting
+	 * value: it was computed by this routine in the first place and has been
+	 * enforced ever since.  The general slot horizon is always usable ...
+	 */
+	if (TransactionIdIsValid(procs->replication_slot_xmin) &&
+		TransactionIdPrecedes(procs->replication_slot_xmin, safe_xid))
+		safe_xid = procs->replication_slot_xmin;
+
+	/* ... the catalog horizon only when just catalog data will be read. */
+	if (catalogOnly &&
+		TransactionIdIsValid(procs->replication_slot_catalog_xmin) &&
+		TransactionIdPrecedes(procs->replication_slot_catalog_xmin, safe_xid))
+		safe_xid = procs->replication_slot_catalog_xmin;
+
+	/*
+	 * Outside recovery, scan the procarray for the lowest xid.  With
+	 * ProcArrayLock held by the caller and XidGenLock held by us, entries
+	 * cannot vanish concurrently: ProcGlobal->xids[i] is only set with
+	 * XidGenLock held and only cleared with ProcArrayLock held.
+	 *
+	 * During recovery the value computed above cannot be lowered any
+	 * further, so we have to wait a bit longer there.
+	 * KnownAssignedXidsGetOldestXmin() is unfortunately *not* usable, since
+	 * the KnownAssignedXids machinery can miss values and return an older
+	 * value than is safe.
+	 */
+	if (!in_recovery)
+	{
+		TransactionId *proc_xids = ProcGlobal->xids;
+
+		for (int slot = 0; slot < procs->numProcs; slot++)
+		{
+			/* Fetch xid just once - see GetNewTransactionId */
+			TransactionId candidate = UINT32_ACCESS_ONCE(proc_xids[slot]);
+
+			if (TransactionIdIsNormal(candidate) &&
+				TransactionIdPrecedes(candidate, safe_xid))
+				safe_xid = candidate;
+		}
+	}
+
+	LWLockRelease(XidGenLock);
+
+	return safe_xid;
+}
+
+/*
+ * GetVirtualXIDsDelayingChkptGuts -- Get the VXIDs of transactions that are
+ * delaying the start or end of a checkpoint because they have critical
+ * actions in progress.
+ *
+ * Constructs an array of VXIDs of transactions that are currently in commit
+ * critical sections, as shown by having delayChkpt or delayChkptEnd set in
+ * their PGPROC.
+ *
+ * Returns a palloc'd array that should be freed by the caller.
+ * *nvxids is the number of valid entries.
+ *
+ * Note that because backends set or clear delayChkpt and delayChkptEnd
+ * without holding any lock, the result is somewhat indeterminate, but we
+ * don't really care. Even in a multiprocessor with delayed writes to
+ * shared memory, it should be certain that setting of delayChkpt will
+ * propagate to shared memory when the backend takes a lock, so we cannot
+ * fail to see a virtual xact as delayChkpt if it's already inserted its
+ * commit record. Whether it takes a little while for clearing of
+ * delayChkpt to propagate is unimportant for correctness.
+ */
+static VirtualTransactionId *
+GetVirtualXIDsDelayingChkptGuts(int *nvxids, int type)
+{
+	ProcArrayStruct *procs = procArray;
+	VirtualTransactionId *delaying;
+	int			ndelaying = 0;
+
+	Assert(type != 0);
+
+	/* maxProcs entries is certainly enough room for the result */
+	delaying = (VirtualTransactionId *)
+		palloc(sizeof(VirtualTransactionId) * procs->maxProcs);
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		PGPROC	   *proc = &allProcs[procs->pgprocnos[slot]];
+		VirtualTransactionId vxid;
+
+		/* Skip procs that have none of the requested delay flags set */
+		if (!(((type & DELAY_CHKPT_START) && proc->delayChkpt) ||
+			  ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)))
+			continue;
+
+		/* Record the proc's VXID, provided it is a valid one */
+		GET_VXID_FROM_PGPROC(vxid, *proc);
+		if (VirtualTransactionIdIsValid(vxid))
+			delaying[ndelaying++] = vxid;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	*nvxids = ndelaying;
+	return delaying;
+}
+
+/*
+ * GetVirtualXIDsDelayingChkpt - collect the VXIDs of transactions that are
+ * holding up the start of a checkpoint.
+ *
+ * Delegates to GetVirtualXIDsDelayingChkptGuts with DELAY_CHKPT_START;
+ * *nvxids receives the number of entries in the returned array.
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkpt(int *nvxids)
+{
+	VirtualTransactionId *delaying;
+
+	delaying = GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_START);
+
+	return delaying;
+}
+
+/*
+ * GetVirtualXIDsDelayingChkptEnd - collect the VXIDs of transactions that
+ * are holding up the end of a checkpoint.
+ *
+ * Delegates to GetVirtualXIDsDelayingChkptGuts with DELAY_CHKPT_COMPLETE;
+ * *nvxids receives the number of entries in the returned array.
+ */
+VirtualTransactionId *
+GetVirtualXIDsDelayingChkptEnd(int *nvxids)
+{
+	VirtualTransactionId *delaying;
+
+	delaying = GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_COMPLETE);
+
+	return delaying;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkptGuts -- Are any of the specified VXIDs delaying?
+ *
+ * Meant to be fed the output of GetVirtualXIDsDelayingChkpt, to find out
+ * whether any of those VXIDs are still inside a critical section of code.
+ * "type" selects which delay flag(s) are of interest.
+ *
+ * Note: the cost is O(N^2) in the number of vxacts that are/were delaying,
+ * but those numbers should be small enough for that not to be a problem.
+ */
+static bool
+HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, int nvxids,
+								 int type)
+{
+	ProcArrayStruct *procs = procArray;
+	bool		found = false;
+
+	Assert(type != 0);
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	/* Both loops stop as soon as a match has been seen */
+	for (int slot = 0; !found && slot < procs->numProcs; slot++)
+	{
+		PGPROC	   *proc = &allProcs[procs->pgprocnos[slot]];
+		VirtualTransactionId vxid;
+
+		GET_VXID_FROM_PGPROC(vxid, *proc);
+
+		/* Only procs with a requested delay flag set are of interest ... */
+		if (!(((type & DELAY_CHKPT_START) && proc->delayChkpt) ||
+			  ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)))
+			continue;
+
+		/* ... and only if they carry a valid VXID */
+		if (!VirtualTransactionIdIsValid(vxid))
+			continue;
+
+		/* Is this proc one of the caller's VXIDs? */
+		for (int i = 0; !found && i < nvxids; i++)
+		{
+			if (VirtualTransactionIdEquals(vxid, vxids[i]))
+				found = true;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return found;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkpt -- Is any one of the given VXIDs still
+ * delaying the start of a checkpoint?
+ *
+ * Delegates to HaveVirtualXIDsDelayingChkptGuts with DELAY_CHKPT_START.
+ */
+bool
+HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids)
+{
+	bool		delaying;
+
+	delaying = HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids,
+												DELAY_CHKPT_START);
+
+	return delaying;
+}
+
+/*
+ * HaveVirtualXIDsDelayingChkptEnd -- Is any one of the given VXIDs still
+ * delaying the end of a checkpoint?
+ *
+ * Delegates to HaveVirtualXIDsDelayingChkptGuts with DELAY_CHKPT_COMPLETE.
+ */
+bool
+HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids, int nvxids)
+{
+	bool		delaying;
+
+	delaying = HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids,
+												DELAY_CHKPT_COMPLETE);
+
+	return delaying;
+}
+
+/*
+ * BackendPidGetProc -- look up a backend's PGPROC by its PID
+ *
+ * The result is NULL when no such backend is found.  The lock is released
+ * before returning, so it is the caller's business to make sure the question
+ * stays meaningful for long enough for the answer to be used ...
+ */
+PGPROC *
+BackendPidGetProc(int pid)
+{
+	PGPROC	   *match = NULL;
+
+	/* pid 0 must never match: that is what dummy PGPROCs carry */
+	if (pid != 0)
+	{
+		LWLockAcquire(ProcArrayLock, LW_SHARED);
+		match = BackendPidGetProcWithLock(pid);
+		LWLockRelease(ProcArrayLock);
+	}
+
+	return match;
+}
+
+/*
+ * BackendPidGetProcWithLock -- look up a backend's PGPROC by its PID
+ *
+ * Identical to BackendPidGetProc, except that the caller must already be
+ * holding ProcArrayLock.  An entry that is found can be assumed to stay
+ * valid for as long as that lock remains held.  Returns NULL if not found.
+ */
+PGPROC *
+BackendPidGetProcWithLock(int pid)
+{
+	ProcArrayStruct *procs = procArray;
+
+	/* pid 0 must never match: that is what dummy PGPROCs carry */
+	if (pid == 0)
+		return NULL;
+
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		PGPROC	   *candidate = &allProcs[procs->pgprocnos[slot]];
+
+		if (candidate->pid == pid)
+			return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * BackendXidGetPid -- look up a backend's pid by its XID
+ *
+ * The result is 0 when the XID is not found, or when it belongs to a
+ * prepared transaction.  The lock is released before returning, so it is
+ * the caller's business to make sure the question stays meaningful for long
+ * enough for the answer to be used ...
+ *
+ * Only main transaction Ids are considered.  The main use of this function
+ * is to work out which backend owns a lock.
+ *
+ * Beware that not every xact has an XID assigned.  Calling this only with
+ * an XID that was found on disk is safe, however.
+ */
+int
+BackendXidGetPid(TransactionId xid)
+{
+	ProcArrayStruct *procs = procArray;
+	TransactionId *proc_xids = ProcGlobal->xids;
+	int			owner_pid = 0;
+
+	/* An invalid xid must never match anything */
+	if (!TransactionIdIsValid(xid))
+		return 0;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		if (proc_xids[slot] == xid)
+		{
+			/* Same slot in the dense xid array and in pgprocnos[] */
+			owner_pid = allProcs[procs->pgprocnos[slot]].pid;
+			break;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return owner_pid;
+}
+
+/*
+ * IsBackendPid -- does the given pid belong to a running backend?
+ *
+ * The backend itself does not call this; it exists for external modules.
+ */
+bool
+IsBackendPid(int pid)
+{
+	PGPROC	   *proc = BackendPidGetProc(pid);
+
+	return proc != NULL;
+}
+
+
+/*
+ * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * The array is palloc'd. The number of valid entries is returned into *nvxids.
+ *
+ * The arguments allow filtering the set of VXIDs returned. Our own process
+ * is always skipped. In addition:
+ * If limitXmin is not InvalidTransactionId, skip processes with
+ * xmin > limitXmin.
+ * If excludeXmin0 is true, skip processes with xmin = 0.
+ * If allDbs is false, skip processes attached to other databases.
+ * If excludeVacuum isn't zero, skip processes for which
+ * (statusFlags & excludeVacuum) is not zero.
+ *
+ * Note: the purpose of the limitXmin and excludeXmin0 parameters is to
+ * allow skipping backends whose oldest live snapshot is no older than
+ * some snapshot we have. Since we examine the procarray with only shared
+ * lock, there are race conditions: a backend could set its xmin just after
+ * we look. Indeed, on multiprocessors with weak memory ordering, the
+ * other backend could have set its xmin *before* we look. We know however
+ * that such a backend must have held shared ProcArrayLock overlapping our
+ * own hold of ProcArrayLock, else we would see its xmin update. Therefore,
+ * any snapshot the other backend is taking concurrently with our scan cannot
+ * consider any transactions as still running that we think are committed
+ * (since backends must hold ProcArrayLock exclusive to commit).
+ */
+VirtualTransactionId *
+GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0,
+					  bool allDbs, int excludeVacuum,
+					  int *nvxids)
+{
+	ProcArrayStruct *procs = procArray;
+	VirtualTransactionId *active;
+	int			nactive = 0;
+
+	/* maxProcs entries is certainly enough room for the result */
+	active = (VirtualTransactionId *)
+		palloc(sizeof(VirtualTransactionId) * procs->maxProcs);
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (int slot = 0; slot < procs->numProcs; slot++)
+	{
+		PGPROC	   *proc = &allProcs[procs->pgprocnos[slot]];
+		uint8		flags = ProcGlobal->statusFlags[slot];
+		TransactionId proc_xmin;
+		VirtualTransactionId vxid;
+
+		/* Our own process is never reported */
+		if (proc == MyProc)
+			continue;
+
+		/* Caller asked for procs with any of these flags to be left out */
+		if ((excludeVacuum & flags) != 0)
+			continue;
+
+		/* Unless all databases were requested, stick to our own */
+		if (!allDbs && proc->databaseId != MyDatabaseId)
+			continue;
+
+		/* Fetch xmin just once - might change on us */
+		proc_xmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+		if (excludeXmin0 && !TransactionIdIsValid(proc_xmin))
+			continue;
+
+		/*
+		 * Apply the xmin limit, if one was given.  InvalidTransactionId
+		 * precedes all other XIDs, so a proc that hasn't set xmin yet will
+		 * not be rejected by this test.
+		 */
+		if (TransactionIdIsValid(limitXmin) &&
+			!TransactionIdPrecedesOrEquals(proc_xmin, limitXmin))
+			continue;
+
+		/* Passed every filter: record its VXID, provided it is valid */
+		GET_VXID_FROM_PGPROC(vxid, *proc);
+		if (VirtualTransactionIdIsValid(vxid))
+			active[nactive++] = vxid;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	*nvxids = nactive;
+	return active;
+}
+
+/*
+ * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs.
+ *
+ * Usage is limited to conflict resolution during recovery on standby servers.
+ * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId
+ * in cases where we cannot accurately determine a value for latestRemovedXid.
+ *
+ * If limitXmin is InvalidTransactionId then we want to kill everybody,
+ * so we're not worried if they have a snapshot or not, nor does it really
+ * matter what type of lock we hold.
+ *
+ * All callers that are checking xmins always now supply a valid and useful
+ * value for limitXmin. The limitXmin is always lower than the lowest
+ * numbered KnownAssignedXid that is not already a FATAL error. This is
+ * because we only care about cleanup records that are cleaning up tuple
+ * versions from committed transactions. In that case they will only occur
+ * at the point where the record is less than the lowest running xid. That
+ * allows us to say that if any backend takes a snapshot concurrently with
+ * us then the conflict assessment made here would never include the snapshot
+ * that is being derived. So we take LW_SHARED on the ProcArray and allow
+ * concurrent snapshots when limitXmin is valid. We might think about adding
+ * Assert(limitXmin < lowest(KnownAssignedXids))
+ * but that would not be true in the case of FATAL errors lagging in array,
+ * but we already know those are bogus anyway, so we skip that test.
+ *
+ * If dbOid is valid we skip backends attached to other databases.
+ * Prepared transactions (procs with pid 0) are always skipped.
+ *
+ * The result is terminated by an entry whose backendId is InvalidBackendId
+ * and whose localTransactionId is InvalidLocalTransactionId.
+ *
+ * Be careful to *not* pfree the result from this function. We reuse
+ * this array sufficiently often that we use malloc for the result. The
+ * array is a function-local static allocated on first use, so each call
+ * overwrites what the previous call returned.
+ */
+VirtualTransactionId *
+GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid)
+{
+ static VirtualTransactionId *vxids;
+ ProcArrayStruct *arrayP = procArray;
+ int count = 0;
+ int index;
+
+ /*
+ * If first time through, get workspace to remember main XIDs in. We
+ * malloc it permanently to avoid repeated palloc/pfree overhead. Allow
+ * result space, remembering room for a terminator.
+ */
+ if (vxids == NULL)
+ {
+ vxids = (VirtualTransactionId *)
+ malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+ if (vxids == NULL)
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ /* Exclude prepared transactions */
+ if (proc->pid == 0)
+ continue;
+
+ if (!OidIsValid(dbOid) ||
+ proc->databaseId == dbOid)
+ {
+ /* Fetch xmin just once - can't change on us, but good coding */
+ TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin);
+
+ /*
+ * We ignore an invalid pxmin because this means that backend has
+ * no snapshot currently. We hold a Share lock to avoid contention
+ * with users taking snapshots. That is not a problem because the
+ * current xmin is always at least one higher than the latest
+ * removed xid, so any new snapshot would never conflict with the
+ * test here.
+ */
+ if (!TransactionIdIsValid(limitXmin) ||
+ (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin)))
+ {
+ VirtualTransactionId vxid;
+
+ GET_VXID_FROM_PGPROC(vxid, *proc);
+ if (VirtualTransactionIdIsValid(vxid))
+ vxids[count++] = vxid;
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ /* add the terminator */
+ vxids[count].backendId = InvalidBackendId;
+ vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+ return vxids;
+}
+
+/*
+ * CancelVirtualTransaction - used in recovery conflict processing
+ *
+ * Thin front end for SignalVirtualTransaction() that always passes
+ * conflictPending = true.
+ *
+ * Returns pid of the process signaled, or 0 if not found.
+ */
+pid_t
+CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode)
+{
+	pid_t		signaled_pid;
+
+	signaled_pid = SignalVirtualTransaction(vxid, sigmode, true);
+
+	return signaled_pid;
+}
+
+/*
+ * SignalVirtualTransaction -- signal the backend running the given VXID
+ *
+ * Scans the proc array under shared ProcArrayLock for an entry whose
+ * backendId and localTransactionId both match vxid. On the first match,
+ * the entry's recoveryConflictPending flag is set to conflictPending and, if
+ * the entry has a nonzero pid, SendProcSignal() is called with sigmode.
+ * The scan stops at the first match.
+ *
+ * Returns the pid of the matching entry, or 0 if no entry matched (or the
+ * matching entry had pid 0).
+ */
+pid_t
+SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode,
+ bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+ pid_t pid = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ VirtualTransactionId procvxid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ if (procvxid.backendId == vxid.backendId &&
+ procvxid.localTransactionId == vxid.localTransactionId)
+ {
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, vxid.backendId);
+ }
+ break;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ return pid;
+}
+
+/*
+ * MinimumActiveBackends --- count backends (other than myself) that are
+ * in active transactions.  Return true if the count reaches the
+ * minimum threshold passed (i.e. count >= min).  This is used as a heuristic
+ * to decide if a pre-XLOG-flush delay is worthwhile during commit.
+ *
+ * Do not count backends that are blocked waiting for locks, since they are
+ * not going to get to run until someone else commits.
+ *
+ * A min of zero returns true immediately without looking at the array.
+ * Counting stops as soon as the threshold is reached.
+ */
+bool
+MinimumActiveBackends(int min)
+{
+	ProcArrayStruct *arrayP = procArray;
+	int			count = 0;
+	int			index;
+
+	/* Quick short-circuit if no minimum is specified */
+	if (min == 0)
+		return true;
+
+	/*
+	 * Note: for speed, we don't acquire ProcArrayLock.  This is a little bit
+	 * bogus, but since we are only testing fields for zero or nonzero, it
+	 * should be OK.  The result is only used for heuristic purposes anyway...
+	 */
+	for (index = 0; index < arrayP->numProcs; index++)
+	{
+		int			pgprocno = arrayP->pgprocnos[index];
+		PGPROC	   *proc;
+
+		/*
+		 * Since we're not holding a lock, need to be prepared to deal with
+		 * garbage, as someone could have incremented numProcs but not yet
+		 * filled the structure.
+		 *
+		 * If someone just decremented numProcs, 'proc' could also point to a
+		 * PGPROC entry that's no longer in the array. It still points to a
+		 * PGPROC struct, though, because freed PGPROC entries just go to the
+		 * free list and are recycled. Its contents are nonsense in that case,
+		 * but that's acceptable for this function.
+		 */
+		if (pgprocno == -1)
+			continue;			/* do not count deleted entries */
+
+		/*
+		 * Form the pointer only after ruling out -1: computing
+		 * &allProcs[-1] would be out-of-bounds pointer arithmetic even if
+		 * the result were never dereferenced.
+		 */
+		proc = &allProcs[pgprocno];
+
+		if (proc == MyProc)
+			continue;			/* do not count myself */
+		if (proc->xid == InvalidTransactionId)
+			continue;			/* do not count if no XID assigned */
+		if (proc->pid == 0)
+			continue;			/* do not count prepared xacts */
+		if (proc->waitLock != NULL)
+			continue;			/* do not count if blocked on a lock */
+		count++;
+		if (count >= min)
+			break;
+	}
+
+	return count >= min;
+}
+
+/*
+ * CountDBBackends --- number of backends attached to the specified database
+ *
+ * Prepared transactions (entries with pid 0) are not counted.  A databaseid
+ * for which OidIsValid() is false matches every database.
+ */
+int
+CountDBBackends(Oid databaseid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	bool		anyDb = !OidIsValid(databaseid);
+	int			nbackends = 0;
+	int			i;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[i]];
+
+		/* skip prepared xacts, then apply the database filter */
+		if (proc->pid != 0 &&
+			(anyDb || proc->databaseId == databaseid))
+			nbackends++;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return nbackends;
+}
+
+/*
+ * CountDBConnections --- number of regular connections to a database
+ *
+ * Background worker processes are left out, as are prepared transactions
+ * (entries with pid 0).  A databaseid for which OidIsValid() is false
+ * matches every database.
+ */
+int
+CountDBConnections(Oid databaseid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	bool		anyDb = !OidIsValid(databaseid);
+	int			nconns = 0;
+	int			i;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[i]];
+
+		/* neither prepared xacts nor background workers are connections */
+		if (proc->pid == 0 || proc->isBackgroundWorker)
+			continue;
+
+		if (anyDb || proc->databaseId == databaseid)
+			nconns++;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return nconns;
+}
+
+/*
+ * CancelDBBackends --- cancel backends that are using specified database
+ *
+ * For every proc array entry attached to databaseid (or every entry at all,
+ * if databaseid is InvalidOid), set recoveryConflictPending to
+ * conflictPending and, if the entry has a nonzero pid, send it sigmode via
+ * SendProcSignal(). ProcArrayLock is held exclusively for the whole scan.
+ */
+void
+CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending)
+{
+ ProcArrayStruct *arrayP = procArray;
+ int index;
+
+ /* signal every matching backend; what happens next depends on sigmode */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+
+ if (databaseid == InvalidOid || proc->databaseId == databaseid)
+ {
+ VirtualTransactionId procvxid;
+ pid_t pid;
+
+ GET_VXID_FROM_PGPROC(procvxid, *proc);
+
+ proc->recoveryConflictPending = conflictPending;
+ pid = proc->pid;
+ if (pid != 0)
+ {
+ /*
+ * Kill the pid if it's still here. If not, that's what we
+ * wanted so ignore any errors.
+ */
+ (void) SendProcSignal(pid, sigmode, procvxid.backendId);
+ }
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * CountUserBackends --- number of backends running under the specified role
+ *
+ * Prepared transactions (entries with pid 0) and background workers are
+ * not counted.
+ */
+int
+CountUserBackends(Oid roleid)
+{
+	ProcArrayStruct *arrayP = procArray;
+	int			nbackends = 0;
+	int			i;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		PGPROC	   *proc = &allProcs[arrayP->pgprocnos[i]];
+
+		/* neither prepared xacts nor background workers are counted */
+		if (proc->pid == 0 || proc->isBackgroundWorker)
+			continue;
+
+		if (proc->roleId == roleid)
+			nbackends++;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	return nbackends;
+}
+
+/*
+ * CountOtherDBBackends -- check for other backends running in the given DB
+ *
+ * If there are other backends in the DB, we will wait a maximum of 5 seconds
+ * for them to exit. Autovacuum backends are encouraged to exit early by
+ * sending them SIGTERM, but normal user backends are just waited for.
+ * At most MAXAUTOVACPIDS autovacuum backends are signaled per scan; any
+ * beyond that are only counted on that scan.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * Returns true if there are (still) other backends in the DB, false if not.
+ * Also, *nbackends and *nprepared are set to the number of other backends
+ * and prepared transactions in the DB, respectively. Both are reset at the
+ * start of every scan, so on return they reflect the most recent scan only.
+ *
+ * This function is used to interlock DROP DATABASE and related commands
+ * against there being any active backends in the target DB --- dropping the
+ * DB while active backends remain would be a Bad Thing. Note that we cannot
+ * detect here the possibility of a newly-started backend that is trying to
+ * connect to the doomed database, so additional interlocking is needed during
+ * backend startup. The caller should normally hold an exclusive lock on the
+ * target DB before calling this, which is one reason we mustn't wait
+ * indefinitely.
+ */
+bool
+CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared)
+{
+ ProcArrayStruct *arrayP = procArray;
+
+#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */
+ int autovac_pids[MAXAUTOVACPIDS];
+ int tries;
+
+ /* 50 tries with 100ms sleep between tries makes 5 sec total wait */
+ for (tries = 0; tries < 50; tries++)
+ {
+ int nautovacs = 0;
+ bool found = false;
+ int index;
+
+ CHECK_FOR_INTERRUPTS();
+
+ *nbackends = *nprepared = 0;
+
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+ for (index = 0; index < arrayP->numProcs; index++)
+ {
+ int pgprocno = arrayP->pgprocnos[index];
+ PGPROC *proc = &allProcs[pgprocno];
+ uint8 statusFlags = ProcGlobal->statusFlags[index];
+
+ if (proc->databaseId != databaseId)
+ continue;
+ if (proc == MyProc)
+ continue;
+
+ found = true;
+
+ if (proc->pid == 0)
+ (*nprepared)++;
+ else
+ {
+ (*nbackends)++;
+ if ((statusFlags & PROC_IS_AUTOVACUUM) &&
+ nautovacs < MAXAUTOVACPIDS)
+ autovac_pids[nautovacs++] = proc->pid;
+ }
+ }
+
+ LWLockRelease(ProcArrayLock);
+
+ if (!found)
+ return false; /* no conflicting backends, so done */
+
+ /*
+ * Send SIGTERM to any conflicting autovacuums before sleeping. We
+ * postpone this step until after the loop because we don't want to
+ * hold ProcArrayLock while issuing kill(). We have no idea what might
+ * block kill() inside the kernel...
+ */
+ for (index = 0; index < nautovacs; index++)
+ (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */
+
+ /* sleep, then try again */
+ pg_usleep(100 * 1000L); /* 100ms */
+ }
+
+ return true; /* timed out, still conflicts */
+}
+
+/*
+ * TerminateOtherDBBackends -- terminate other connections to a database
+ *
+ * This routine is used by the DROP DATABASE command when the user has asked
+ * to forcefully drop the database.
+ *
+ * The current backend is always ignored; it is caller's responsibility to
+ * check whether the current backend uses the given DB, if it's important.
+ *
+ * No connection is terminated if even one prepared transaction exists in
+ * the target database; an ERROR (ERRCODE_OBJECT_IN_USE) is raised instead.
+ * Likewise, nothing is signaled unless the caller is allowed to terminate
+ * every session found: the permission checks for all sessions run first and
+ * raise an ERROR (ERRCODE_INSUFFICIENT_PRIVILEGE) before any SIGTERM is sent.
+ */
+void
+TerminateOtherDBBackends(Oid databaseId)
+{
+	ProcArrayStruct *arrayP = procArray;
+	List	   *pids = NIL;
+	int			nprepared = 0;
+	int			i;
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	/* Collect pids of other backends in the DB, and count prepared xacts */
+	for (i = 0; i < arrayP->numProcs; i++)
+	{
+		int			pgprocno = arrayP->pgprocnos[i];
+		PGPROC	   *proc = &allProcs[pgprocno];
+
+		if (proc->databaseId != databaseId)
+			continue;
+		if (proc == MyProc)
+			continue;
+
+		if (proc->pid != 0)
+			pids = lappend_int(pids, proc->pid);
+		else
+			nprepared++;
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	if (nprepared > 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_IN_USE),
+				 errmsg("database \"%s\" is being used by prepared transactions",
+						get_database_name(databaseId)),
+				 errdetail_plural("There is %d prepared transaction using the database.",
+								  "There are %d prepared transactions using the database.",
+								  nprepared,
+								  nprepared)));
+
+	if (pids)
+	{
+		ListCell   *lc;
+
+		/*
+		 * Check whether we have the necessary rights to terminate other
+		 * sessions.  We don't terminate any session until we ensure that we
+		 * have rights on all the sessions to be terminated.  These checks are
+		 * the same as we do in pg_terminate_backend.
+		 *
+		 * In this case we don't raise some warnings - like "PID %d is not a
+		 * PostgreSQL server process", because for us already finished session
+		 * is not a problem.
+		 */
+		foreach(lc, pids)
+		{
+			int			pid = lfirst_int(lc);
+			PGPROC	   *proc = BackendPidGetProc(pid);
+
+			if (proc != NULL)
+			{
+				/* Only allow superusers to signal superuser-owned backends. */
+				if (superuser_arg(proc->roleId) && !superuser())
+					ereport(ERROR,
+							(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+							 errmsg("must be a superuser to terminate superuser process")));
+
+				/* Users can signal backends they have role membership in. */
+				if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+					!has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+					ereport(ERROR,
+							(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+							 errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend")));
+			}
+		}
+
+		/*
+		 * There's a race condition here: once we release the ProcArrayLock,
+		 * it's possible for the session to exit before we issue kill.  That
+		 * race condition possibility seems too unlikely to worry about.  See
+		 * pg_signal_backend.
+		 */
+		foreach(lc, pids)
+		{
+			int			pid = lfirst_int(lc);
+			PGPROC	   *proc = BackendPidGetProc(pid);
+
+			/* sessions that have already gone away are silently skipped */
+			if (proc != NULL)
+			{
+				/*
+				 * If we have setsid(), signal the backend's whole process
+				 * group
+				 */
+#ifdef HAVE_SETSID
+				(void) kill(-pid, SIGTERM);
+#else
+				(void) kill(pid, SIGTERM);
+#endif
+			}
+		}
+	}
+}
+
+/*
+ * ProcArraySetReplicationSlotXmin
+ *
+ * Store the xmin and catalog_xmin limits required by replication slots into
+ * the proc array, so that future xmin horizon computations keep vacuum and
+ * HOT pruning from removing rows those slots' clients still need.
+ *
+ * If already_locked is true the caller must hold ProcArrayLock (asserted);
+ * otherwise the lock is taken exclusively here and released before return.
+ */
+void
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
+								bool already_locked)
+{
+	bool		take_lock = !already_locked;
+
+	/* a caller claiming to hold the lock must really hold it */
+	Assert(take_lock || LWLockHeldByMe(ProcArrayLock));
+
+	if (take_lock)
+		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+	procArray->replication_slot_xmin = xmin;
+	procArray->replication_slot_catalog_xmin = catalog_xmin;
+
+	if (take_lock)
+		LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ProcArrayGetReplicationSlotXmin
+ *
+ * Fetch the slot xmin limits currently stored in the proc array, which is
+ * useful for removing data older than those limits.  Either output pointer
+ * may be NULL, in which case that value is not returned.
+ */
+void
+ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
+								TransactionId *catalog_xmin)
+{
+	TransactionId slot_xmin;
+	TransactionId slot_catalog_xmin;
+
+	/* read both limits under one shared-lock hold ... */
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+	slot_xmin = procArray->replication_slot_xmin;
+	slot_catalog_xmin = procArray->replication_slot_catalog_xmin;
+	LWLockRelease(ProcArrayLock);
+
+	/* ... and hand out whichever ones the caller asked for */
+	if (xmin != NULL)
+		*xmin = slot_xmin;
+	if (catalog_xmin != NULL)
+		*catalog_xmin = slot_catalog_xmin;
+}
+
+/*
+ * XidCacheRemoveRunningXids
+ *
+ * Remove a bunch of TransactionIds from the list of known-running
+ * subtransactions for my backend. Both the specified xid and those in
+ * the xids[] array (of length nxids) are removed from the subxids cache.
+ * latestXid must be the latest XID among the group.
+ *
+ * All of this happens under exclusive ProcArrayLock. Before the lock is
+ * released, latestXid is passed to MaintainLatestCompletedXid() and
+ * ShmemVariableCache->xactCompletionCount is incremented. An XID that is
+ * not found in the cache produces a WARNING unless the cache is marked
+ * overflowed.
+ */
+void
+XidCacheRemoveRunningXids(TransactionId xid,
+ int nxids, const TransactionId *xids,
+ TransactionId latestXid)
+{
+ int i,
+ j;
+ XidCacheStatus *mysubxidstat;
+
+ Assert(TransactionIdIsValid(xid));
+
+ /*
+ * We must hold ProcArrayLock exclusively in order to remove transactions
+ * from the PGPROC array. (See src/backend/access/transam/README.) It's
+ * possible this could be relaxed since we know this routine is only used
+ * to abort subtransactions, but pending closer analysis we'd best be
+ * conservative.
+ *
+ * Note that we do not have to be careful about memory ordering of our own
+ * reads wrt. GetNewTransactionId() here - only this process can modify
+ * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be
+ * careful about our own writes being well ordered.
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff];
+
+ /*
+ * Under normal circumstances xid and xids[] will be in increasing order,
+ * as will be the entries in subxids. Scan backwards to avoid O(N^2)
+ * behavior when removing a lot of xids.
+ *
+ * A matching slot is removed by overwriting it with the last cached
+ * entry and then, after a write barrier, decrementing both copies of the
+ * count (the one in ProcGlobal->subxidStates[] and the one in
+ * MyProc->subxidStatus).
+ */
+ for (i = nxids - 1; i >= 0; i--)
+ {
+ TransactionId anxid = xids[i];
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], anxid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+
+ /*
+ * Ordinarily we should have found it, unless the cache has
+ * overflowed. However it's also possible for this routine to be
+ * invoked multiple times for the same subtransaction, in case of an
+ * error during AbortSubTransaction. So instead of Assert, emit a
+ * debug warning.
+ */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", anxid);
+ }
+
+ for (j = MyProc->subxidStatus.count - 1; j >= 0; j--)
+ {
+ if (TransactionIdEquals(MyProc->subxids.xids[j], xid))
+ {
+ MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1];
+ pg_write_barrier();
+ mysubxidstat->count--;
+ MyProc->subxidStatus.count--;
+ break;
+ }
+ }
+ /* Ordinarily we should have found it, unless the cache has overflowed */
+ if (j < 0 && !MyProc->subxidStatus.overflowed)
+ elog(WARNING, "did not find subXID %u in MyProc", xid);
+
+ /* Also advance global latestCompletedXid while holding the lock */
+ MaintainLatestCompletedXid(latestXid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+#ifdef XIDCACHE_DEBUG
+
+/*
+ * Print stats about effectiveness of XID cache
+ *
+ * Writes a single line to stderr containing the current values of the
+ * xc_* counters. Only compiled when XIDCACHE_DEBUG is defined.
+ */
+static void
+DisplayXidCache(void)
+{
+ fprintf(stderr,
+ "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n",
+ xc_by_recent_xmin,
+ xc_by_known_xact,
+ xc_by_my_xact,
+ xc_by_latest_xid,
+ xc_by_main_xid,
+ xc_by_child_xid,
+ xc_by_known_assigned,
+ xc_no_overflow,
+ xc_slow_answer);
+}
+#endif /* XIDCACHE_DEBUG */
+
+/*
+ * If rel != NULL, return test state appropriate for relation, otherwise
+ * return state usable for all relations. The latter may consider XIDs as
+ * not-yet-visible-to-everyone that a state for a specific relation would
+ * already consider visible-to-everyone.
+ *
+ * The returned pointer refers to one of the GlobalVis{Shared,Catalog,Data,
+ * Temp}Rels states, selected by GlobalVisHorizonKindForRel(rel). Both
+ * boundaries of the selected state are asserted to be valid.
+ *
+ * This needs to be called while a snapshot is active or registered, otherwise
+ * there are wraparound and other dangers.
+ *
+ * See comment for GlobalVisState for details.
+ */
+GlobalVisState *
+GlobalVisTestFor(Relation rel)
+{
+ GlobalVisState *state = NULL;
+
+ /* XXX: we should assert that a snapshot is pushed or registered */
+ Assert(RecentXmin);
+
+ switch (GlobalVisHorizonKindForRel(rel))
+ {
+ case VISHORIZON_SHARED:
+ state = &GlobalVisSharedRels;
+ break;
+ case VISHORIZON_CATALOG:
+ state = &GlobalVisCatalogRels;
+ break;
+ case VISHORIZON_DATA:
+ state = &GlobalVisDataRels;
+ break;
+ case VISHORIZON_TEMP:
+ state = &GlobalVisTempRels;
+ break;
+ }
+
+ Assert(FullTransactionIdIsValid(state->definitely_needed) &&
+ FullTransactionIdIsValid(state->maybe_needed));
+
+ return state;
+}
+
+/*
+ * Decide whether recomputing the accurate maybe_needed boundary is likely
+ * to pay off.
+ *
+ * Determining xmin horizons is somewhat expensive, so we avoid doing it
+ * repeatedly when it is unlikely to help.  The heuristic is:
+ *	- always update if the horizons have never been computed;
+ *	- never update if maybe_needed has already caught up with
+ *	  definitely_needed for this state;
+ *	- otherwise update only if RecentXmin has changed since the last update,
+ *	  since while the oldest running transaction is unchanged a recomputation
+ *	  is unlikely to be useful.
+ */
+static bool
+GlobalVisTestShouldUpdate(GlobalVisState *state)
+{
+	bool		never_computed;
+	bool		bounds_collapsed;
+
+	/* hasn't been updated yet */
+	never_computed = !TransactionIdIsValid(ComputeXidHorizonsResultLastXmin);
+	if (never_computed)
+		return true;
+
+	/* identical boundaries leave nothing to gain from a refresh */
+	bounds_collapsed = FullTransactionIdFollowsOrEquals(state->maybe_needed,
+														state->definitely_needed);
+	if (bounds_collapsed)
+		return false;
+
+	/* does the last snapshot built have a different xmin? */
+	return RecentXmin != ComputeXidHorizonsResultLastXmin;
+}
+
+/*
+ * Install freshly computed horizons into the GlobalVis*Rels states.
+ *
+ * Each state's maybe_needed is recomputed from the matching
+ * *_oldest_nonremovable field of *horizons, converted to a FullTransactionId
+ * relative to horizons->latest_completed. definitely_needed is then adjusted
+ * so that it is not earlier than maybe_needed; for GlobalVisTempRels it is
+ * simply set equal to maybe_needed. Finally, RecentXmin is remembered in
+ * ComputeXidHorizonsResultLastXmin.
+ */
+static void
+GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons)
+{
+ GlobalVisSharedRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->shared_oldest_nonremovable);
+ GlobalVisCatalogRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->catalog_oldest_nonremovable);
+ GlobalVisDataRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->data_oldest_nonremovable);
+ GlobalVisTempRels.maybe_needed =
+ FullXidRelativeTo(horizons->latest_completed,
+ horizons->temp_oldest_nonremovable);
+
+ /*
+ * In longer running transactions it's possible that transactions we
+ * previously needed to treat as running aren't around anymore. So update
+ * definitely_needed to not be earlier than maybe_needed.
+ */
+ GlobalVisSharedRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed,
+ GlobalVisSharedRels.definitely_needed);
+ GlobalVisCatalogRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed,
+ GlobalVisCatalogRels.definitely_needed);
+ GlobalVisDataRels.definitely_needed =
+ FullTransactionIdNewer(GlobalVisDataRels.maybe_needed,
+ GlobalVisDataRels.definitely_needed);
+ GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed;
+
+ ComputeXidHorizonsResultLastXmin = RecentXmin;
+}
+
+/*
+ * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels
+ * using ComputeXidHorizons().
+ *
+ * The local horizons result is not otherwise used here; the GlobalVis*
+ * states are refreshed as a side effect of the call.
+ * NOTE(review): presumably that happens via GlobalVisUpdateApply(), which
+ * also refreshes GlobalVisTempRels -- confirm in ComputeXidHorizons().
+ */
+static void
+GlobalVisUpdate(void)
+{
+ ComputeXidHorizonsResult horizons;
+
+ /* updates the horizons as a side-effect */
+ ComputeXidHorizons(&horizons);
+}
+
+/*
+ * Return true if no snapshot still considers fxid to be running.
+ *
+ * The state passed needs to have been initialized for the relation fxid is
+ * from (NULL is also OK), otherwise the result may not be correct.
+ *
+ * An fxid below state->maybe_needed yields true, and an fxid at or above
+ * state->definitely_needed yields false, without further work. Only for an
+ * fxid between the two bounds may the boundaries be recomputed (when
+ * GlobalVisTestShouldUpdate() says so) before deciding; if no update is
+ * made, the answer is false.
+ *
+ * See comment for GlobalVisState for details.
+ */
+bool
+GlobalVisTestIsRemovableFullXid(GlobalVisState *state,
+ FullTransactionId fxid)
+{
+ /*
+ * If fxid is older than maybe_needed bound, it definitely is visible to
+ * everyone.
+ */
+ if (FullTransactionIdPrecedes(fxid, state->maybe_needed))
+ return true;
+
+ /*
+ * If fxid is >= definitely_needed bound, it is very likely to still be
+ * considered running.
+ */
+ if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed))
+ return false;
+
+ /*
+ * fxid is between maybe_needed and definitely_needed, i.e. there might or
+ * might not exist a snapshot considering fxid running. If it makes sense,
+ * update boundaries and recheck.
+ */
+ if (GlobalVisTestShouldUpdate(state))
+ {
+ GlobalVisUpdate();
+
+ Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed));
+
+ return FullTransactionIdPrecedes(fxid, state->maybe_needed);
+ }
+ else
+ return false;
+}
+
+/*
+ * 32-bit xid front end for GlobalVisTestIsRemovableFullXid().
+ *
+ * It is crucial that this only gets called for xids from a source that
+ * protects against xid wraparounds (e.g. from a table and thus protected by
+ * relfrozenxid).
+ */
+bool
+GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid)
+{
+	/*
+	 * Widen the 32 bit argument to a FullTransactionId.  That is safe
+	 * because the xid has to, at the very least, lie within
+	 * [oldestXid, nextFullXid), i.e. within 2 billion of xid.  To avoid
+	 * taking a lock to determine either, we compare with
+	 * state->definitely_needed, which was based on those values at the time
+	 * the current snapshot was built.
+	 */
+	return GlobalVisTestIsRemovableFullXid(state,
+										   FullXidRelativeTo(state->definitely_needed,
+															 xid));
+}
+
+/*
+ * Return the FullTransactionId below which no transaction is considered
+ * running anymore, i.e. the state's maybe_needed boundary after an optional
+ * refresh.
+ *
+ * Note: This is less efficient than testing with
+ * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate
+ * cutoff, even in the case all the XIDs compared with the cutoff are outside
+ * [maybe_needed, definitely_needed).
+ */
+FullTransactionId
+GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state)
+{
+	bool		refresh = GlobalVisTestShouldUpdate(state);
+
+	/* acquire accurate horizon if not already done */
+	if (refresh)
+		GlobalVisUpdate();
+
+	return state->maybe_needed;
+}
+
+/*
+ * Convenience wrapper around GlobalVisTestNonRemovableFullHorizon that
+ * returns only the 32-bit xid part of the cutoff.
+ */
+TransactionId
+GlobalVisTestNonRemovableHorizon(GlobalVisState *state)
+{
+	return XidFromFullTransactionId(GlobalVisTestNonRemovableFullHorizon(state));
+}
+
+/*
+ * Look up the test state for rel with GlobalVisTestFor() and feed it
+ * straight to GlobalVisTestIsRemovableFullXid(); see their comments.
+ */
+bool
+GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid)
+{
+	return GlobalVisTestIsRemovableFullXid(GlobalVisTestFor(rel), fxid);
+}
+
+/*
+ * Look up the test state for rel with GlobalVisTestFor() and feed it
+ * straight to GlobalVisTestIsRemovableXid(); see their comments.
+ */
+bool
+GlobalVisCheckRemovableXid(Relation rel, TransactionId xid)
+{
+	return GlobalVisTestIsRemovableXid(GlobalVisTestFor(rel), xid);
+}
+
+/*
+ * Widen a 32 bit transaction id to a 64 bit one, on the assumption that it
+ * lies within MaxTransactionId / 2 of XidFromFullTransactionId(rel).
+ *
+ * Be very careful about when to use this function. It can only safely be used
+ * when there is a guarantee that xid is within MaxTransactionId / 2 xids of
+ * rel. That e.g. can be guaranteed if the caller assures a snapshot is
+ * held by the backend and xid is from a table (where vacuum/freezing ensures
+ * the xid has to be within that range), or if xid is from the procarray and
+ * prevents xid wraparound that way.
+ */
+static inline FullTransactionId
+FullXidRelativeTo(FullTransactionId rel, TransactionId xid)
+{
+	TransactionId rel_xid = XidFromFullTransactionId(rel);
+	int32		delta;
+
+	Assert(TransactionIdIsValid(xid));
+	Assert(TransactionIdIsValid(rel_xid));
+
+	/* not guaranteed to find issues, but likely to catch mistakes */
+	AssertTransactionIdInAllowableRange(xid);
+
+	/* signed 32-bit distance from rel_xid to xid; negative if xid is older */
+	delta = (int32) (xid - rel_xid);
+
+	return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + delta);
+}
+
+
+/* ----------------------------------------------
+ * KnownAssignedTransactionIds sub-module
+ * ----------------------------------------------
+ */
+
+/*
+ * In Hot Standby mode, we maintain a list of transactions that are (or were)
+ * running on the primary at the current point in WAL. These XIDs must be
+ * treated as running by standby transactions, even though they are not in
+ * the standby server's PGPROC array.
+ *
+ * We record all XIDs that we know have been assigned. That includes all the
+ * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have
+ * been assigned. We can deduce the existence of unobserved XIDs because we
+ * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids
+ * list expands as new XIDs are observed or inferred, and contracts when
+ * transaction completion records arrive.
+ *
+ * During hot standby we do not fret too much about the distinction between
+ * top-level XIDs and subtransaction XIDs. We store both together in the
+ * KnownAssignedXids list. In backends, this is copied into snapshots in
+ * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot()
+ * doesn't care about the distinction either. Subtransaction XIDs are
+ * effectively treated as top-level XIDs and in the typical case pg_subtrans
+ * links are *not* maintained (which does not affect visibility).
+ *
+ * We have room in KnownAssignedXids and in snapshots to hold maxProcs *
+ * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must
+ * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at
+ * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these
+ * records, we mark the subXIDs as children of the top XID in pg_subtrans,
+ * and then remove them from KnownAssignedXids. This prevents overflow of
+ * KnownAssignedXids and snapshots, at the cost that status checks for these
+ * subXIDs will take a slower path through TransactionIdIsInProgress().
+ * This means that KnownAssignedXids is not necessarily complete for subXIDs,
+ * though it should be complete for top-level XIDs; this is the same situation
+ * that holds with respect to the PGPROC entries in normal running.
+ *
+ * When we throw away subXIDs from KnownAssignedXids, we need to keep track of
+ * that, similarly to tracking overflow of a PGPROC's subxids array. We do
+ * that by remembering the lastOverflowedXid, ie the last thrown-away subXID.
+ * As long as that is within the range of interesting XIDs, we have to assume
+ * that subXIDs are missing from snapshots. (Note that subXID overflow occurs
+ * on primary when 65th subXID arrives, whereas on standby it occurs when 64th
+ * subXID arrives - that is not an error.)
+ *
+ * Should a backend on primary somehow disappear before it can write an abort
+ * record, then we just leave those XIDs in KnownAssignedXids. They actually
+ * aborted but we think they were running; the distinction is irrelevant
+ * because either way any changes done by the transaction are not visible to
+ * backends in the standby. We prune KnownAssignedXids when
+ * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the
+ * array due to such dead XIDs.
+ */
+
+/*
+ * RecordKnownAssignedTransactionIds
+ *		Record the given XID in KnownAssignedXids, as well as any preceding
+ *		unobserved XIDs.
+ *
+ * RecordKnownAssignedTransactionIds() should be run for *every* WAL record
+ * associated with a transaction. Must be called for each record after we
+ * have executed StartupCLOG() et al, since we must ExtendCLOG() etc.
+ *
+ * Called during recovery in analogy with and in place of GetNewTransactionId()
+ *
+ * xid must be a valid XID.  If it does not follow latestObservedXid the
+ * function does nothing beyond the debug trace; otherwise subtrans is
+ * extended for every XID in (latestObservedXid, xid], and, once standbyState
+ * is past STANDBY_INITIALIZED, that same range is appended to
+ * KnownAssignedXids before latestObservedXid is advanced to xid.
+ */
+void
+RecordKnownAssignedTransactionIds(TransactionId xid)
+{
+	Assert(standbyState >= STANDBY_INITIALIZED);
+	Assert(TransactionIdIsValid(xid));
+	Assert(TransactionIdIsValid(latestObservedXid));
+
+	elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u",
+		 xid, latestObservedXid);
+
+	/*
+	 * When a newly observed xid arrives, it is frequently the case that it is
+	 * *not* the next xid in sequence. When this occurs, we must treat the
+	 * intervening xids as running also.
+	 */
+	if (TransactionIdFollows(xid, latestObservedXid))
+	{
+		TransactionId next_expected_xid;
+
+		/*
+		 * Extend subtrans like we do in GetNewTransactionId() during normal
+		 * operation using individual extend steps. Note that we do not need
+		 * to extend clog since its extensions are WAL logged.
+		 *
+		 * This part has to be done regardless of standbyState since we
+		 * immediately start assigning subtransactions to their toplevel
+		 * transactions.
+		 */
+		next_expected_xid = latestObservedXid;
+		while (TransactionIdPrecedes(next_expected_xid, xid))
+		{
+			TransactionIdAdvance(next_expected_xid);
+			ExtendSUBTRANS(next_expected_xid);
+		}
+		Assert(next_expected_xid == xid);
+
+		/*
+		 * If the KnownAssignedXids machinery isn't up yet, there's nothing
+		 * more to do since we don't track assigned xids yet.
+		 */
+		if (standbyState <= STANDBY_INITIALIZED)
+		{
+			latestObservedXid = xid;
+			return;
+		}
+
+		/*
+		 * Add (latestObservedXid, xid] onto the KnownAssignedXids array.
+		 */
+		next_expected_xid = latestObservedXid;
+		TransactionIdAdvance(next_expected_xid);
+		KnownAssignedXidsAdd(next_expected_xid, xid, false);
+
+		/*
+		 * Now we can advance latestObservedXid
+		 */
+		latestObservedXid = xid;
+
+		/*
+		 * ShmemVariableCache->nextXid must be beyond any observed xid.
+		 *
+		 * (A recomputation of next_expected_xid used to follow this call; the
+		 * value was never read, so it has been dropped.)
+		 */
+		AdvanceNextFullTransactionIdPastXid(latestObservedXid);
+	}
+}
+
+/*
+ * ExpireTreeKnownAssignedTransactionIds
+ * Remove the given XIDs from KnownAssignedXids.
+ *
+ * Called during recovery in analogy with and in place of ProcArrayEndTransaction()
+ *
+ * xid is the top-level XID, and subxids[0 .. nsubxids-1] are its
+ * subtransaction XIDs; all of them are handed to KnownAssignedXidsRemoveTree().
+ * max_xid is passed on to MaintainLatestCompletedXidRecovery().
+ *
+ * ProcArrayLock is acquired exclusively here and released before returning,
+ * so the caller must not already hold it.
+ */
+void
+ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids,
+ TransactionId *subxids, TransactionId max_xid)
+{
+ Assert(standbyState >= STANDBY_INITIALIZED);
+
+ /*
+ * Uses same locking as transaction commit
+ */
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ KnownAssignedXidsRemoveTree(xid, nsubxids, subxids);
+
+ /* As in ProcArrayEndTransaction, advance latestCompletedXid */
+ MaintainLatestCompletedXidRecovery(max_xid);
+
+ /* ... and xactCompletionCount */
+ ShmemVariableCache->xactCompletionCount++;
+
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireAllKnownAssignedTransactionIds
+ * Remove all entries in KnownAssignedXids and reset lastOverflowedXid.
+ *
+ * ProcArrayLock is acquired exclusively here and released before returning.
+ */
+void
+ExpireAllKnownAssignedTransactionIds(void)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ /* Passing an invalid XID asks the callee to clear the whole array */
+ KnownAssignedXidsRemovePreceding(InvalidTransactionId);
+
+ /*
+ * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after
+ * the call of this function. But do this for consistency with what
+ * ExpireOldKnownAssignedTransactionIds() does.
+ */
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ LWLockRelease(ProcArrayLock);
+}
+
+/*
+ * ExpireOldKnownAssignedTransactionIds
+ * Remove KnownAssignedXids entries preceding the given XID and
+ * potentially reset lastOverflowedXid.
+ *
+ * xid is the pruning boundary: it is compared against lastOverflowedXid and
+ * then handed to KnownAssignedXidsRemovePreceding().
+ *
+ * ProcArrayLock is acquired exclusively here and released before returning.
+ */
+void
+ExpireOldKnownAssignedTransactionIds(TransactionId xid)
+{
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ /*
+ * Reset lastOverflowedXid if we know that all transactions that could
+ * possibly have been running are gone. Not doing so could leave a stale
+ * lastOverflowedXid value, which causes extra snapshots to be marked as
+ * suboverflowed.
+ */
+ if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid))
+ procArray->lastOverflowedXid = InvalidTransactionId;
+ KnownAssignedXidsRemovePreceding(xid);
+ LWLockRelease(ProcArrayLock);
+}
+
+
+/*
+ * Private module functions to manipulate KnownAssignedXids
+ *
+ * There are 5 main uses of the KnownAssignedXids data structure:
+ *
+ * * backends taking snapshots - all valid XIDs need to be copied out
+ * * backends seeking to determine presence of a specific XID
+ * * startup process adding new known-assigned XIDs
+ * * startup process removing specific XIDs as transactions end
+ * * startup process pruning array when special WAL records arrive
+ *
+ * This data structure is known to be a hot spot during Hot Standby, so we
+ * go to some lengths to make these operations as efficient and as concurrent
+ * as possible.
+ *
+ * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes
+ * order, to be exact --- to allow binary search for specific XIDs. Note:
+ * in general TransactionIdPrecedes would not provide a total order, but
+ * we know that the entries present at any instant should not extend across
+ * a large enough fraction of XID space to wrap around (the primary would
+ * shut down for fear of XID wrap long before that happens). So it's OK to
+ * use TransactionIdPrecedes as a binary-search comparator.
+ *
+ * It's cheap to maintain the sortedness during insertions, since new known
+ * XIDs are always reported in XID order; we just append them at the right.
+ *
+ * To keep individual deletions cheap, we need to allow gaps in the array.
+ * This is implemented by marking array elements as valid or invalid using
+ * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done
+ * by setting KnownAssignedXidsValid[i] to false, *without* clearing the
+ * XID entry itself. This preserves the property that the XID entries are
+ * sorted, so we can do binary searches easily. Periodically we compress
+ * out the unused entries; that's much cheaper than having to compress the
+ * array immediately on every deletion.
+ *
+ * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[]
+ * are those with indexes tail <= i < head; items outside this subscript range
+ * have unspecified contents. When head reaches the end of the array, we
+ * force compression of unused entries rather than wrapping around, since
+ * allowing wraparound would greatly complicate the search logic. We maintain
+ * an explicit tail pointer so that pruning of old XIDs can be done without
+ * immediately moving the array contents. In most cases only a small fraction
+ * of the array contains valid entries at any instant.
+ *
+ * Although only the startup process can ever change the KnownAssignedXids
+ * data structure, we still need interlocking so that standby backends will
+ * not observe invalid intermediate states. The convention is that backends
+ * must hold shared ProcArrayLock to examine the array. To remove XIDs from
+ * the array, the startup process must hold ProcArrayLock exclusively, for
+ * the usual transactional reasons (compare commit/abort of a transaction
+ * during normal running). Compressing unused entries out of the array
+ * likewise requires exclusive lock. To add XIDs to the array, we just insert
+ * them into slots to the right of the head pointer and then advance the head
+ * pointer. This wouldn't require any lock at all, except that on machines
+ * with weak memory ordering we need to be careful that other processors
+ * see the array element changes before they see the head pointer change.
+ * We handle this by using a spinlock to protect reads and writes of the
+ * head/tail pointers. (We could dispense with the spinlock if we were to
+ * create suitable memory access barrier primitives and use those instead.)
+ * The spinlock must be taken to read or write the head/tail pointers unless
+ * the caller holds ProcArrayLock exclusively.
+ *
+ * Algorithmic analysis:
+ *
+ * If we have a maximum of M slots, with N XIDs currently spread across
+ * S elements then we have N <= S <= M always.
+ *
+ * * Adding a new XID is O(1) and needs little locking (unless compression
+ * must happen)
+ * * Compressing the array is O(S) and requires exclusive lock
+ * * Removing an XID is O(logS) and requires exclusive lock
+ * * Taking a snapshot is O(S) and requires shared lock
+ * * Checking for an XID is O(logS) and requires shared lock
+ *
+ * In comparison, using a hash table for KnownAssignedXids would mean that
+ * taking snapshots would be O(M). If we can maintain S << M then the
+ * sorted array technique will deliver significantly faster snapshots.
+ * If we try to keep S too small then we will spend too much time compressing,
+ * so there is an optimal point for any workload mix. We use a heuristic to
+ * decide when to compress the array, though trimming also helps reduce
+ * frequency of compressing. The heuristic requires us to track the number of
+ * currently valid XIDs in the array.
+ */
+
+
+/*
+ * KnownAssignedXidsCompress
+ *		Squeeze the gaps out of KnownAssignedXids by sliding every still-valid
+ *		entry down so that the live data starts at subscript 0.
+ *
+ * With force = true the compaction always happens.  With force = false it
+ * happens only when the heuristic below says it is worthwhile.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsCompress(bool force)
+{
+	ProcArrayStruct *pArray = procArray;
+	int			limit;
+	int			src;
+	int			dst = 0;
+
+	/* exclusive ProcArrayLock is held, so the spinlock is unnecessary */
+	limit = pArray->headKnownAssignedXids;
+	src = pArray->tailKnownAssignedXids;
+
+	if (!force)
+	{
+		/*
+		 * Optional compaction: skip it unless the occupied span is both
+		 * reasonably large and at most half full.  Compacting rarely keeps
+		 * the cost down; never compacting would let the span grow and make
+		 * snapshots more expensive.
+		 */
+		int			spread = limit - src;
+
+		if (spread < 4 * PROCARRAY_MAXPROCS)
+			return;
+		if (spread < 2 * pArray->numKnownAssignedXids)
+			return;
+	}
+
+	/*
+	 * Walk tail..head, copying each valid entry to the next free slot
+	 * counting up from element zero.
+	 */
+	while (src < limit)
+	{
+		if (KnownAssignedXidsValid[src])
+		{
+			KnownAssignedXids[dst] = KnownAssignedXids[src];
+			KnownAssignedXidsValid[dst] = true;
+			dst++;
+		}
+		src++;
+	}
+
+	pArray->tailKnownAssignedXids = 0;
+	pArray->headKnownAssignedXids = dst;
+}
+
+/*
+ * Add xids into KnownAssignedXids at the head of the array.
+ *
+ * xids from from_xid to to_xid, inclusive, are added to the array.
+ *
+ * If exclusive_lock is true then caller already holds ProcArrayLock in
+ * exclusive mode, so we need no extra locking here. Else caller holds no
+ * lock, so we need to be sure we maintain sufficient interlocks against
+ * concurrent readers. (Only the startup process ever calls this, so no need
+ * to worry about concurrent writers.)
+ *
+ * Raises ERROR if from_xid does not follow the last XID already stored, or
+ * if the new XIDs do not fit even after a forced compression.
+ */
+static void
+KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid,
+ bool exclusive_lock)
+{
+ ProcArrayStruct *pArray = procArray;
+ TransactionId next_xid;
+ int head,
+ tail;
+ int nxids;
+ int i;
+
+ Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid));
+
+ /*
+ * Calculate how many array slots we'll need. Normally this is cheap; in
+ * the unusual case where the XIDs cross the wrap point, we do it the hard
+ * way.
+ */
+ if (to_xid >= from_xid)
+ nxids = to_xid - from_xid + 1;
+ else
+ {
+ nxids = 1;
+ next_xid = from_xid;
+ while (TransactionIdPrecedes(next_xid, to_xid))
+ {
+ nxids++;
+ TransactionIdAdvance(next_xid);
+ }
+ }
+
+ /*
+ * Since only the startup process modifies the head/tail pointers, we
+ * don't need a lock to read them here.
+ */
+ head = pArray->headKnownAssignedXids;
+ tail = pArray->tailKnownAssignedXids;
+
+ Assert(head >= 0 && head <= pArray->maxKnownAssignedXids);
+ Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids);
+
+ /*
+ * Verify that insertions occur in TransactionId sequence. Note that even
+ * if the last existing element is marked invalid, it must still have a
+ * correctly sequenced XID value.
+ */
+ if (head > tail &&
+ TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid))
+ {
+ /* dump the current array contents to the log before erroring out */
+ KnownAssignedXidsDisplay(LOG);
+ elog(ERROR, "out-of-order XID insertion in KnownAssignedXids");
+ }
+
+ /*
+ * If our xids won't fit in the remaining space, compress out free space
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ {
+ /* must hold lock to compress */
+ if (!exclusive_lock)
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ KnownAssignedXidsCompress(true);
+
+ /* compression moved the data, so re-read the head position */
+ head = pArray->headKnownAssignedXids;
+ /* note: we no longer care about the tail pointer */
+
+ if (!exclusive_lock)
+ LWLockRelease(ProcArrayLock);
+
+ /*
+ * If it still won't fit then we're out of memory
+ */
+ if (head + nxids > pArray->maxKnownAssignedXids)
+ elog(ERROR, "too many KnownAssignedXids");
+ }
+
+ /* Now we can insert the xids into the space starting at head */
+ next_xid = from_xid;
+ for (i = 0; i < nxids; i++)
+ {
+ KnownAssignedXids[head] = next_xid;
+ KnownAssignedXidsValid[head] = true;
+ TransactionIdAdvance(next_xid);
+ head++;
+ }
+
+ /* Adjust count of number of valid entries */
+ pArray->numKnownAssignedXids += nxids;
+
+ /*
+ * Now update the head pointer. We use a spinlock to protect this
+ * pointer, not because the update is likely to be non-atomic, but to
+ * ensure that other processors see the above array updates before they
+ * see the head pointer change.
+ *
+ * If we're holding ProcArrayLock exclusively, there's no need to take the
+ * spinlock.
+ */
+ if (exclusive_lock)
+ pArray->headKnownAssignedXids = head;
+ else
+ {
+ SpinLockAcquire(&pArray->known_assigned_xids_lck);
+ pArray->headKnownAssignedXids = head;
+ SpinLockRelease(&pArray->known_assigned_xids_lck);
+ }
+}
+
+/*
+ * KnownAssignedXidsSearch
+ *
+ * Look up xid in KnownAssignedXids; when "remove" is set, also delete it.
+ * The result is true only if xid is present *and* still marked valid.
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ * Exclusive lock must be held for remove = true.
+ */
+static bool
+KnownAssignedXidsSearch(TransactionId xid, bool remove)
+{
+	ProcArrayStruct *pArray = procArray;
+	int			head;
+	int			tail;
+	int			lo;
+	int			hi;
+	int			found_at = -1;
+
+	if (!remove)
+	{
+		/* read-only caller: fetch the pointers under the spinlock */
+		SpinLockAcquire(&pArray->known_assigned_xids_lck);
+		tail = pArray->tailKnownAssignedXids;
+		head = pArray->headKnownAssignedXids;
+		SpinLockRelease(&pArray->known_assigned_xids_lck);
+	}
+	else
+	{
+		/* exclusive ProcArrayLock is held, so plain reads suffice */
+		tail = pArray->tailKnownAssignedXids;
+		head = pArray->headKnownAssignedXids;
+	}
+
+	/*
+	 * Plain binary search over [tail, head).  Entries marked invalid keep
+	 * their XID, so the range is sorted no matter what the validity flags
+	 * say.
+	 */
+	lo = tail;
+	hi = head - 1;
+	while (lo <= hi && found_at < 0)
+	{
+		int			probe = (lo + hi) / 2;
+		TransactionId probe_xid = KnownAssignedXids[probe];
+
+		if (probe_xid == xid)
+			found_at = probe;
+		else if (TransactionIdPrecedes(xid, probe_xid))
+			hi = probe - 1;
+		else
+			lo = probe + 1;
+	}
+
+	/* absent, or present but already deleted */
+	if (found_at < 0 || !KnownAssignedXidsValid[found_at])
+		return false;
+
+	if (!remove)
+		return true;
+
+	KnownAssignedXidsValid[found_at] = false;
+
+	pArray->numKnownAssignedXids--;
+	Assert(pArray->numKnownAssignedXids >= 0);
+
+	/*
+	 * When the entry just deleted was the tail, slide the tail forward past
+	 * every invalid entry so later searches have less to look at.
+	 */
+	if (found_at == tail)
+	{
+		tail++;
+		while (tail < head && !KnownAssignedXidsValid[tail])
+			tail++;
+
+		if (tail < head)
+			pArray->tailKnownAssignedXids = tail;
+		else
+		{
+			/* nothing valid remains: rewind both pointers */
+			pArray->headKnownAssignedXids = 0;
+			pArray->tailKnownAssignedXids = 0;
+		}
+	}
+
+	return true;
+}
+
+/*
+ * Is the specified XID present in KnownAssignedXids[]?
+ *
+ * Thin wrapper around KnownAssignedXidsSearch() with remove = false, so the
+ * array is not modified. xid must be a valid XID.
+ *
+ * Caller must hold ProcArrayLock in shared or exclusive mode.
+ */
+static bool
+KnownAssignedXidExists(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ return KnownAssignedXidsSearch(xid, false);
+}
+
+/*
+ * Remove the specified XID from KnownAssignedXids[].
+ *
+ * xid must be a valid XID. Removing an XID that is not present is silently
+ * accepted; see the note in the function body.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemove(TransactionId xid)
+{
+ Assert(TransactionIdIsValid(xid));
+
+ elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid);
+
+ /*
+ * Note: we cannot consider it an error to remove an XID that's not
+ * present. We intentionally remove subxact IDs while processing
+ * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be
+ * removed again when the top-level xact commits or aborts.
+ *
+ * It might be possible to track such XIDs to distinguish this case from
+ * actual errors, but it would be complicated and probably not worth it.
+ * So, just ignore the search result.
+ */
+ (void) KnownAssignedXidsSearch(xid, true);
+}
+
+/*
+ * KnownAssignedXidsRemoveTree
+ *		Delete a top-level XID (skipped when it is InvalidTransactionId)
+ *		together with each of the nsubxids entries of subxids[].
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids,
+							TransactionId *subxids)
+{
+	int			idx = 0;
+
+	if (TransactionIdIsValid(xid))
+		KnownAssignedXidsRemove(xid);
+
+	while (idx < nsubxids)
+	{
+		KnownAssignedXidsRemove(subxids[idx]);
+		idx++;
+	}
+
+	/* Give the compaction heuristic a chance to run */
+	KnownAssignedXidsCompress(false);
+}
+
+/*
+ * KnownAssignedXidsRemovePreceding
+ *		Drop every KnownAssignedXids entry that precedes removeXid (removeXid
+ *		itself is kept).  An invalid removeXid empties the table outright.
+ *
+ * Entries for which StandbyTransactionIdIsPrepared() reports true are left
+ * in place even if they precede removeXid.
+ *
+ * Caller must hold ProcArrayLock in exclusive mode.
+ */
+static void
+KnownAssignedXidsRemovePreceding(TransactionId removeXid)
+{
+	ProcArrayStruct *pArray = procArray;
+	int			removed = 0;
+	int			head;
+	int			tail;
+	int			pos;
+
+	if (!TransactionIdIsValid(removeXid))
+	{
+		elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids");
+		pArray->numKnownAssignedXids = 0;
+		pArray->tailKnownAssignedXids = 0;
+		pArray->headKnownAssignedXids = 0;
+		return;
+	}
+
+	elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid);
+
+	tail = pArray->tailKnownAssignedXids;
+	head = pArray->headKnownAssignedXids;
+
+	/*
+	 * Invalidate entries from the tail upward.  The array is sorted, so the
+	 * first valid entry that is >= removeXid ends the scan.
+	 */
+	for (pos = tail; pos < head; pos++)
+	{
+		TransactionId candidate;
+
+		if (!KnownAssignedXidsValid[pos])
+			continue;
+
+		candidate = KnownAssignedXids[pos];
+
+		if (TransactionIdFollowsOrEquals(candidate, removeXid))
+			break;
+
+		if (StandbyTransactionIdIsPrepared(candidate))
+			continue;
+
+		KnownAssignedXidsValid[pos] = false;
+		removed++;
+	}
+
+	pArray->numKnownAssignedXids -= removed;
+	Assert(pArray->numKnownAssignedXids >= 0);
+
+	/*
+	 * Find the first entry that is still valid; that is the new tail.
+	 */
+	pos = tail;
+	while (pos < head && !KnownAssignedXidsValid[pos])
+		pos++;
+
+	if (pos < head)
+		pArray->tailKnownAssignedXids = pos;
+	else
+	{
+		/* nothing valid remains: rewind both pointers */
+		pArray->headKnownAssignedXids = 0;
+		pArray->tailKnownAssignedXids = 0;
+	}
+
+	/* Give the compaction heuristic a chance to run */
+	KnownAssignedXidsCompress(false);
+}
+
+/*
+ * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids.
+ * We filter out anything >= xmax.
+ *
+ * Returns the number of XIDs stored into xarray[]. Caller is responsible
+ * that array is large enough.
+ *
+ * Implemented by calling KnownAssignedXidsGetAndSetXmin() with a throwaway
+ * xmin variable whose final value is discarded.
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax)
+{
+ /* scratch xmin, needed only to satisfy the callee's signature */
+ TransactionId xtmp = InvalidTransactionId;
+
+ return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax);
+}
+
+/*
+ * KnownAssignedXidsGetAndSetXmin
+ *		Copy the valid XIDs below xmax into xarray[] and, if the oldest valid
+ *		XID precedes *xmin, lower *xmin to it.
+ *
+ * An invalid xmax disables the upper-bound filter.  Returns the number of
+ * XIDs written; the caller must supply a large enough xarray[].
+ *
+ * Caller must hold ProcArrayLock in (at least) shared mode.
+ */
+static int
+KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin,
+							   TransactionId xmax)
+{
+	int			nfound = 0;
+	int			head;
+	int			tail;
+	int			pos;
+
+	/*
+	 * Sample the pointers exactly once, under the spinlock so the array
+	 * contents we then read are up to date.  The head may advance while we
+	 * scan, but an xid cannot both enter and leave the array while we hold
+	 * ProcArrayLock, and anything added after our sample should be >= xmax
+	 * and therefore of no interest.
+	 */
+	SpinLockAcquire(&procArray->known_assigned_xids_lck);
+	tail = procArray->tailKnownAssignedXids;
+	head = procArray->headKnownAssignedXids;
+	SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+	for (pos = tail; pos < head; pos++)
+	{
+		TransactionId cur;
+
+		/* gaps left by deletions are skipped */
+		if (!KnownAssignedXidsValid[pos])
+			continue;
+
+		cur = KnownAssignedXids[pos];
+
+		/*
+		 * The array is sorted, so only the first valid entry can possibly
+		 * lower xmin.
+		 */
+		if (nfound == 0 &&
+			TransactionIdPrecedes(cur, *xmin))
+			*xmin = cur;
+
+		/* sortedness also lets us stop at the first entry >= xmax */
+		if (TransactionIdIsValid(xmax) &&
+			TransactionIdFollowsOrEquals(cur, xmax))
+			break;
+
+		xarray[nfound] = cur;
+		nfound++;
+	}
+
+	return nfound;
+}
+
+/*
+ * KnownAssignedXidsGetOldestXmin
+ *		Return the first valid XID found scanning KnownAssignedXids from the
+ *		tail, or InvalidTransactionId when no valid entry exists.
+ */
+static TransactionId
+KnownAssignedXidsGetOldestXmin(void)
+{
+	TransactionId oldest = InvalidTransactionId;
+	int			head;
+	int			tail;
+	int			pos;
+
+	/*
+	 * Sample the pointers once; the head may move while we scan.
+	 */
+	SpinLockAcquire(&procArray->known_assigned_xids_lck);
+	tail = procArray->tailKnownAssignedXids;
+	head = procArray->headKnownAssignedXids;
+	SpinLockRelease(&procArray->known_assigned_xids_lck);
+
+	pos = tail;
+	while (pos < head)
+	{
+		/* the first entry that is not a gap is the answer */
+		if (KnownAssignedXidsValid[pos])
+		{
+			oldest = KnownAssignedXids[pos];
+			break;
+		}
+		pos++;
+	}
+
+	return oldest;
+}
+
+/*
+ * KnownAssignedXidsDisplay
+ *		Emit the valid contents of KnownAssignedXids at the given elog level,
+ *		as a debugging aid.
+ *
+ * So far only the startup process calls this, hence no locking is done.
+ *
+ * The message text is built before elog() decides whether to discard it, so
+ * this is costly even when nothing is logged.  No performance-critical path
+ * calls it at present.
+ */
+static void
+KnownAssignedXidsDisplay(int trace_level)
+{
+	ProcArrayStruct *pArray = procArray;
+	StringInfoData listing;
+	int			shown = 0;
+	int			first = pArray->tailKnownAssignedXids;
+	int			limit = pArray->headKnownAssignedXids;
+	int			pos;
+
+	initStringInfo(&listing);
+
+	for (pos = first; pos < limit; pos++)
+	{
+		if (!KnownAssignedXidsValid[pos])
+			continue;
+
+		shown++;
+		appendStringInfo(&listing, "[%d]=%u ", pos, KnownAssignedXids[pos]);
+	}
+
+	elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",
+		 shown,
+		 pArray->numKnownAssignedXids,
+		 pArray->tailKnownAssignedXids,
+		 pArray->headKnownAssignedXids,
+		 listing.data);
+
+	pfree(listing.data);
+}
+
+/*
+ * KnownAssignedXidsReset
+ * Resets KnownAssignedXids to be empty
+ *
+ * Zeroes the valid-entry count and both the tail and head pointers. The
+ * array contents themselves are not touched.
+ *
+ * ProcArrayLock is acquired exclusively here and released before returning.
+ */
+static void
+KnownAssignedXidsReset(void)
+{
+ ProcArrayStruct *pArray = procArray;
+
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
+ pArray->numKnownAssignedXids = 0;
+ pArray->tailKnownAssignedXids = 0;
+ pArray->headKnownAssignedXids = 0;
+
+ LWLockRelease(ProcArrayLock);
+}
diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c
new file mode 100644
index 0000000..defb75a
--- /dev/null
+++ b/src/backend/storage/ipc/procsignal.c
@@ -0,0 +1,685 @@
+/*-------------------------------------------------------------------------
+ *
+ * procsignal.c
+ * Routines for interprocess signaling
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/procsignal.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/parallel.h"
+#include "port/pg_bitutils.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "replication/walsender.h"
+#include "storage/condition_variable.h"
+#include "storage/ipc.h"
+#include "storage/latch.h"
+#include "storage/proc.h"
+#include "storage/shmem.h"
+#include "storage/sinval.h"
+#include "tcop/tcopprot.h"
+#include "utils/memutils.h"
+
+/*
+ * The SIGUSR1 signal is multiplexed to support signaling multiple event
+ * types. The specific reason is communicated via flags in shared memory.
+ * We keep a boolean flag for each possible "reason", so that different
+ * reasons can be signaled to a process concurrently. (However, if the same
+ * reason is signaled more than once nearly simultaneously, the process may
+ * observe it only once.)
+ *
+ * Each process that wants to receive signals registers its process ID
+ * in the ProcSignalSlots array. The array is indexed by backend ID to make
+ * slot allocation simple, and to avoid having to search the array when you
+ * know the backend ID of the process you're signaling. (We do support
+ * signaling without backend ID, but it's a bit less efficient.)
+ *
+ * The flags are actually declared as "volatile sig_atomic_t" for maximum
+ * portability. This should ensure that loads and stores of the flag
+ * values are atomic, allowing us to dispense with any explicit locking.
+ *
+ * pss_signalFlags are intended to be set in cases where we don't need to
+ * keep track of whether or not the target process has handled the signal,
+ * but sometimes we need confirmation, as when making a global state change
+ * that cannot be considered complete until all backends have taken notice
+ * of it. For such use cases, we set a bit in pss_barrierCheckMask and then
+ * increment the current "barrier generation"; when the new barrier generation
+ * (or greater) appears in the pss_barrierGeneration flag of every process,
+ * we know that the message has been received everywhere.
+ */
+typedef struct
+{
+ /* PID of the process owning this slot; 0 while the slot is unused */
+ volatile pid_t pss_pid;
+ /* one pending-signal flag per ProcSignalReason value */
+ volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS];
+ /* barrier generation this slot reports; PG_UINT64_MAX while unused */
+ pg_atomic_uint64 pss_barrierGeneration;
+ /* barrier-type bit mask; cleared to 0 when a process claims the slot */
+ pg_atomic_uint32 pss_barrierCheckMask;
+ /* broadcast when the slot's barrier generation is set on release */
+ ConditionVariable pss_barrierCV;
+} ProcSignalSlot;
+
+/*
+ * Information that is global to the entire ProcSignal system can be stored
+ * here.
+ *
+ * psh_barrierGeneration is the highest barrier generation in existence.
+ *
+ * psh_slot is a flexible array; ProcSignalShmemSize() sizes the allocation
+ * for NumProcSignalSlots elements.
+ */
+typedef struct
+{
+ pg_atomic_uint64 psh_barrierGeneration;
+ ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER];
+} ProcSignalHeader;
+
+/*
+ * We reserve a slot for each possible BackendId, plus one for each
+ * possible auxiliary process type. (This scheme assumes there is not
+ * more than one of any auxiliary process type at a time.)
+ *
+ * Note that this expands to an expression involving MaxBackends, so it is
+ * evaluated at each point of use rather than being a compile-time constant.
+ */
+#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES)
+
+/*
+ * Check whether the relevant type bit is set in the flags.
+ *
+ * "type" is used as a bit number in a uint32 mask, so it must be less than
+ * 32 for the shift to be well defined. "flags" is not modified.
+ */
+#define BARRIER_SHOULD_CHECK(flags, type) \
+ (((flags) & (((uint32) 1) << (uint32) (type))) != 0)
+
+/*
+ * Clear the relevant type bit from the flags.
+ *
+ * "flags" is updated in place with &=, so it must be a modifiable lvalue.
+ * "type" is used as a bit number in a uint32 mask, so it must be less than
+ * 32 for the shift to be well defined.
+ */
+#define BARRIER_CLEAR_BIT(flags, type) \
+ ((flags) &= ~(((uint32) 1) << (uint32) (type)))
+
+/* Shared-memory header; assigned in ProcSignalShmemInit() */
+static ProcSignalHeader *ProcSignal = NULL;
+/* This process's slot; set by ProcSignalInit(), cleared again at exit */
+static ProcSignalSlot *MyProcSignalSlot = NULL;
+
+/* Prototypes for functions private to this file */
+static bool CheckProcSignal(ProcSignalReason reason);
+static void CleanupProcSignalState(int status, Datum arg);
+static void ResetProcSignalBarrierBits(uint32 flags);
+static bool ProcessBarrierPlaceholder(void);
+
+/*
+ * ProcSignalShmemSize
+ *		Report how many bytes of shared memory procsignal needs: the fixed
+ *		header plus NumProcSignalSlots slot entries.
+ */
+Size
+ProcSignalShmemSize(void)
+{
+	/* space for the slot array itself */
+	Size		slot_bytes = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot));
+
+	/* ... plus whatever precedes the flexible array in the header */
+	return add_size(slot_bytes, offsetof(ProcSignalHeader, psh_slot));
+}
+
+/*
+ * ProcSignalShmemInit
+ *		Attach to (or create) procsignal's shared memory area, and set up
+ *		every slot as unused if we are the one creating it.
+ */
+void
+ProcSignalShmemInit(void)
+{
+	bool		already_exists;
+	Size		nbytes = ProcSignalShmemSize();
+	int			idx;
+
+	ProcSignal = (ProcSignalHeader *)
+		ShmemInitStruct("ProcSignal", nbytes, &already_exists);
+
+	/* Somebody else already initialized the area; nothing more to do. */
+	if (already_exists)
+		return;
+
+	pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0);
+
+	for (idx = 0; idx < NumProcSignalSlots; ++idx)
+	{
+		ProcSignalSlot *cur = &ProcSignal->psh_slot[idx];
+
+		/* an unused slot has no owner and no pending signals ... */
+		cur->pss_pid = 0;
+		MemSet(cur->pss_signalFlags, 0, sizeof(cur->pss_signalFlags));
+		/* ... and starts at the maximum possible barrier generation */
+		pg_atomic_init_u64(&cur->pss_barrierGeneration, PG_UINT64_MAX);
+		pg_atomic_init_u32(&cur->pss_barrierCheckMask, 0);
+		ConditionVariableInit(&cur->pss_barrierCV);
+	}
+}
+
+/*
+ * ProcSignalInit
+ *		Register the current process in the procsignal array
+ *
+ * The passed index should be my BackendId if the process has one,
+ * or MaxBackends + aux process type if not.
+ *
+ * pss_idx is 1-based: the slot claimed is psh_slot[pss_idx - 1].  Finding the
+ * slot already occupied is only logged, and the slot is taken over anyway.
+ * A shmem-exit callback is registered to release the slot again.
+ */
+void
+ProcSignalInit(int pss_idx)
+{
+	ProcSignalSlot *slot;
+	uint64		barrier_generation;
+
+	Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots);
+
+	slot = &ProcSignal->psh_slot[pss_idx - 1];
+
+	/* sanity check */
+	if (slot->pss_pid != 0)
+		elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty",
+			 MyProcPid, pss_idx);
+
+	/*
+	 * Clear out any leftover signal reasons.  The length is taken from the
+	 * array itself, as ProcSignalShmemInit() does, so it stays correct if
+	 * the element type or count ever changes.
+	 */
+	MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags));
+
+	/*
+	 * Initialize barrier state. Since we're a brand-new process, there
+	 * shouldn't be any leftover backend-private state that needs to be
+	 * updated. Therefore, we can broadcast the latest barrier generation and
+	 * disregard any previously-set check bits.
+	 *
+	 * NB: This only works if this initialization happens early enough in the
+	 * startup sequence that we haven't yet cached any state that might need
+	 * to be invalidated. That's also why we have a memory barrier here, to be
+	 * sure that any later reads of memory happen strictly after this.
+	 */
+	pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0);
+	barrier_generation =
+		pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+	pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation);
+	pg_memory_barrier();
+
+	/* Mark slot with my PID */
+	slot->pss_pid = MyProcPid;
+
+	/* Remember slot location for CheckProcSignal */
+	MyProcSignalSlot = slot;
+
+	/* Set up to release the slot on process exit */
+	on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx));
+}
+
+/*
+ * CleanupProcSignalState
+ * Remove current process from ProcSignal mechanism
+ *
+ * This function is called via on_shmem_exit() during backend shutdown.
+ *
+ * "status" is unused. "arg" carries the 1-based slot index that was passed
+ * to ProcSignalInit().
+ */
+static void
+CleanupProcSignalState(int status, Datum arg)
+{
+ int pss_idx = DatumGetInt32(arg);
+ ProcSignalSlot *slot;
+
+ slot = &ProcSignal->psh_slot[pss_idx - 1];
+ Assert(slot == MyProcSignalSlot);
+
+ /*
+ * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point
+ * won't try to access it after it's no longer ours (and perhaps even
+ * after we've unmapped the shared memory segment).
+ */
+ MyProcSignalSlot = NULL;
+
+ /* sanity check */
+ if (slot->pss_pid != MyProcPid)
+ {
+ /*
+ * don't ERROR here. We're exiting anyway, and don't want to get into
+ * infinite loop trying to exit
+ */
+ elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d",
+ MyProcPid, pss_idx, (int) slot->pss_pid);
+ /* the slot is left exactly as found: it does not appear to be ours */
+ return; /* XXX better to zero the slot anyway? */
+ }
+
+ /*
+ * Make this slot look like it's absorbed all possible barriers, so that
+ * no barrier waits block on it.
+ */
+ pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX);
+ ConditionVariableBroadcast(&slot->pss_barrierCV);
+
+ /* finally mark the slot as free */
+ slot->pss_pid = 0;
+}
+
+/*
+ * SendProcSignal
+ *		Deliver a ProcSignal reason to another Postgres process
+ *
+ * backendId may be InvalidBackendId when the caller does not know it; in
+ * that case the slot array is searched by pid, which is slower.
+ *
+ * Returns zero when a signal was sent.  Returns -1 with errno set
+ * (typically ESRCH or EPERM) otherwise.
+ *
+ * Not to be confused with ProcSendSignal
+ */
+int
+SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId)
+{
+	volatile ProcSignalSlot *target = NULL;
+
+	if (backendId == InvalidBackendId)
+	{
+		/*
+		 * No BackendId given, so look the process up by pid.  Walk the
+		 * array from the end: a caller without a BackendId is most likely
+		 * targeting an auxiliary process, and those have slots near the
+		 * end of the array.
+		 */
+		for (int idx = NumProcSignalSlots - 1; idx >= 0; idx--)
+		{
+			volatile ProcSignalSlot *candidate = &ProcSignal->psh_slot[idx];
+
+			if (candidate->pss_pid == pid)
+			{
+				target = candidate;
+				break;
+			}
+		}
+	}
+	else
+	{
+		volatile ProcSignalSlot *candidate = &ProcSignal->psh_slot[backendId - 1];
+
+		if (candidate->pss_pid == pid)
+			target = candidate;
+	}
+
+	/* No slot currently advertises this pid. */
+	if (target == NULL)
+	{
+		errno = ESRCH;
+		return -1;
+	}
+
+	/*
+	 * There is no locking, so the target may detach from shared memory and
+	 * exit between the pid test above and the flag store below, and its
+	 * slot may even have been handed to a new process by then.  We might
+	 * therefore flag the wrong process.  That is acceptable: every signal
+	 * reason is harmless when fired by mistake.
+	 */
+	target->pss_signalFlags[reason] = true;
+
+	return kill(pid, SIGUSR1);
+}
+
+/*
+ * EmitProcSignalBarrier
+ *		Send a signal to every Postgres process
+ *
+ * type is the kind of barrier to emit; it selects one bit in each slot's
+ * pss_barrierCheckMask.
+ *
+ * The return value of this function is the barrier "generation" created
+ * by this operation. This value can be passed to WaitForProcSignalBarrier
+ * to wait until it is known that every participant in the ProcSignal
+ * mechanism has absorbed the signal (or started afterwards).
+ *
+ * Note that it would be a bad idea to use this for anything that happens
+ * frequently, as interrupting every backend could cause a noticeable
+ * performance hit.
+ *
+ * Callers are entitled to assume that this function will not throw ERROR
+ * or FATAL.
+ */
+uint64
+EmitProcSignalBarrier(ProcSignalBarrierType type)
+{
+	/*
+	 * Build the mask bit in unsigned arithmetic.  Shifting the plain int
+	 * constant 1 would be undefined for bit 31, and the unsigned form
+	 * matches how ProcessProcSignalBarrier() rebuilds the same bit.
+	 */
+	uint32		flagbit = ((uint32) 1) << (uint32) type;
+	uint64		generation;
+
+	/*
+	 * Set all the flags.
+	 *
+	 * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is
+	 * totally ordered with respect to anything the caller did before, and
+	 * anything that we do afterwards. (This is also true of the later call to
+	 * pg_atomic_add_fetch_u64.)
+	 */
+	for (int i = 0; i < NumProcSignalSlots; i++)
+	{
+		volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+
+		pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit);
+	}
+
+	/*
+	 * Increment the generation counter.
+	 */
+	generation =
+		pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1);
+
+	/*
+	 * Signal all the processes, so that they update their advertised barrier
+	 * generation.
+	 *
+	 * Concurrency is not a problem here. Backends that have exited don't
+	 * matter, and new backends that have joined since we entered this
+	 * function must already have current state, since the caller is
+	 * responsible for making sure that the relevant state is entirely visible
+	 * before calling this function in the first place. We still have to wake
+	 * them up - because we can't distinguish between such backends and older
+	 * backends that need to update state - but they won't actually need to
+	 * change any state.
+	 */
+	for (int i = NumProcSignalSlots - 1; i >= 0; i--)
+	{
+		volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i];
+		pid_t		pid = slot->pss_pid;
+
+		if (pid != 0)
+		{
+			/* see SendProcSignal for details */
+			slot->pss_signalFlags[PROCSIG_BARRIER] = true;
+
+			/*
+			 * The result of kill() is deliberately ignored: a process that
+			 * has already exited doesn't matter, and this function must
+			 * not throw.
+			 */
+			kill(pid, SIGUSR1);
+		}
+	}
+
+	return generation;
+}
+
+/*
+ * WaitForProcSignalBarrier - block until every ProcSignal slot advertises a
+ * barrier generation at least as new as the one returned by a particular
+ * EmitProcSignalBarrier() call, i.e. until the changes requested by that
+ * call are guaranteed to have taken effect everywhere.
+ */
+void
+WaitForProcSignalBarrier(uint64 generation)
+{
+	int			slotno = NumProcSignalSlots;
+
+	Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration));
+
+	/* Visit the slots from last to first. */
+	while (--slotno >= 0)
+	{
+		ProcSignalSlot *slot = &ProcSignal->psh_slot[slotno];
+
+		/*
+		 * Only pss_barrierGeneration may be consulted here, never
+		 * pss_barrierCheckMask: mask bits are cleared before the barrier
+		 * has actually been absorbed, whereas the generation is advanced
+		 * only afterwards.
+		 */
+		while (pg_atomic_read_u64(&slot->pss_barrierGeneration) < generation)
+			ConditionVariableSleep(&slot->pss_barrierCV,
+								   WAIT_EVENT_PROC_SIGNAL_BARRIER);
+
+		ConditionVariableCancelSleep();
+	}
+
+	/*
+	 * Callers typically go on to read or modify shared state once all
+	 * backends have absorbed the barrier.  The generation reads above were
+	 * done without a lock, so fence them off from whatever comes next.
+	 */
+	pg_memory_barrier();
+}
+
+/*
+ * Handle receipt of an interrupt indicating a global barrier event.
+ *
+ * Called from procsignal_sigusr1_handler() when PROCSIG_BARRIER was flagged
+ * for this process, so it runs in signal-handler context and only sets
+ * flags.
+ *
+ * All the actual work is deferred to ProcessProcSignalBarrier(), because we
+ * cannot safely access the barrier generation inside the signal handler as
+ * 64bit atomics might use spinlock based emulation, even for reads. As this
+ * routine only gets called when PROCSIG_BARRIER is sent that won't cause a
+ * lot of unnecessary work.
+ */
+static void
+HandleProcSignalBarrierInterrupt(void)
+{
+ InterruptPending = true;
+ /* tested (and reset) at the top of ProcessProcSignalBarrier() */
+ ProcSignalBarrierPending = true;
+ /* latch will be set by procsignal_sigusr1_handler */
+}
+
+/*
+ * Perform global barrier related interrupt checking.
+ *
+ * Any backend that participates in ProcSignal signaling must arrange to
+ * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(),
+ * which is enough for normal backends, but not necessarily for all types of
+ * background processes.
+ *
+ * Outline: return at once unless ProcSignalBarrierPending is set; return if
+ * our advertised generation already equals the shared one; otherwise
+ * atomically fetch-and-clear our check mask and run the handler for each set
+ * bit. Only if every handler reported success do we advertise the shared
+ * generation read at the start and wake waiters on our slot's condition
+ * variable. Bits that could not be absorbed are put back for a later retry.
+ */
+void
+ProcessProcSignalBarrier(void)
+{
+ uint64 local_gen;
+ uint64 shared_gen;
+ /*
+ * NOTE(review): presumably volatile because it is modified inside the
+ * PG_TRY block and read in PG_CATCH -- confirm.
+ */
+ volatile uint32 flags;
+
+ Assert(MyProcSignalSlot);
+
+ /* Exit quickly if there's no work to do. */
+ if (!ProcSignalBarrierPending)
+ return;
+ ProcSignalBarrierPending = false;
+
+ /*
+ * It's not unlikely to process multiple barriers at once, before the
+ * signals for all the barriers have arrived. To avoid unnecessary work in
+ * response to subsequent signals, exit early if we already have processed
+ * all of them.
+ */
+ local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration);
+ shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration);
+
+ Assert(local_gen <= shared_gen);
+
+ if (local_gen == shared_gen)
+ return;
+
+ /*
+ * Get and clear the flags that are set for this backend. Note that
+ * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the
+ * read of the barrier generation above happens before we atomically
+ * extract the flags, and that any subsequent state changes happen
+ * afterward.
+ *
+ * NB: In order to avoid race conditions, we must zero
+ * pss_barrierCheckMask first and only afterwards try to do barrier
+ * processing. If we did it in the other order, someone could send us
+ * another barrier of some type right after we called the
+ * barrier-processing function but before we cleared the bit. We would
+ * have no way of knowing that the bit needs to stay set in that case, so
+ * the need to call the barrier-processing function again would just get
+ * forgotten. So instead, we tentatively clear all the bits and then put
+ * back any for which we don't manage to successfully absorb the barrier.
+ */
+ flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0);
+
+ /*
+ * If there are no flags set, then we can skip doing any real work.
+ * Otherwise, establish a PG_TRY block, so that we don't lose track of
+ * which types of barrier processing are needed if an ERROR occurs.
+ */
+ if (flags != 0)
+ {
+ bool success = true;
+
+ PG_TRY();
+ {
+ /*
+ * Process each type of barrier. The barrier-processing functions
+ * should normally return true, but may return false if the
+ * barrier can't be absorbed at the current time. This should be
+ * rare, because it's pretty expensive. Every single
+ * CHECK_FOR_INTERRUPTS() will return here until we manage to
+ * absorb the barrier, and that cost will add up in a hurry.
+ *
+ * NB: It ought to be OK to call the barrier-processing functions
+ * unconditionally, but it's more efficient to call only the ones
+ * that might need us to do something based on the flags.
+ */
+ while (flags != 0)
+ {
+ ProcSignalBarrierType type;
+ bool processed = true;
+
+ /*
+ * Use the bit position reported by pg_rightmost_one_pos32()
+ * as the barrier type to handle next. The switch has no
+ * default case, so a bit without a matching case is simply
+ * cleared below with processed still true.
+ */
+ type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags);
+ switch (type)
+ {
+ case PROCSIGNAL_BARRIER_PLACEHOLDER:
+ processed = ProcessBarrierPlaceholder();
+ break;
+ }
+
+ /*
+ * To avoid an infinite loop, we must always unset the bit in
+ * flags.
+ */
+ BARRIER_CLEAR_BIT(flags, type);
+
+ /*
+ * If we failed to process the barrier, reset the shared bit
+ * so we try again later, and set a flag so that we don't bump
+ * our generation.
+ */
+ if (!processed)
+ {
+ ResetProcSignalBarrierBits(((uint32) 1) << type);
+ success = false;
+ }
+ }
+ }
+ PG_CATCH();
+ {
+ /*
+ * If an ERROR occurred, we'll need to try again later to handle
+ * that barrier type and any others that haven't been handled yet
+ * or weren't successfully absorbed.
+ *
+ * At this point flags still holds the bit of the type being
+ * processed when the error was raised (its bit is cleared only
+ * after the handler returns) plus all not-yet-visited bits.
+ */
+ ResetProcSignalBarrierBits(flags);
+ PG_RE_THROW();
+ }
+ PG_END_TRY();
+
+ /*
+ * If some barrier types were not successfully absorbed, we will have
+ * to try again later.
+ */
+ if (!success)
+ return;
+ }
+
+ /*
+ * State changes related to all types of barriers that might have been
+ * emitted have now been handled, so we can update our notion of the
+ * generation to the one we observed before beginning the updates. If
+ * things have changed further, it'll get fixed up when this function is
+ * next called.
+ */
+ pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen);
+ /* wake anyone sleeping in WaitForProcSignalBarrier() on our slot */
+ ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV);
+}
+
+/*
+ * If it turns out that we couldn't absorb one or more barrier types, either
+ * because the barrier-processing functions returned false or due to an error,
+ * arrange for processing to be retried later.
+ *
+ * flags is a bitmask of barrier types (bit N stands for type N) to OR back
+ * into our own slot's pss_barrierCheckMask.
+ */
+static void
+ResetProcSignalBarrierBits(uint32 flags)
+{
+ pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags);
+ /*
+ * Re-arm the pending flags; without ProcSignalBarrierPending the next
+ * ProcessProcSignalBarrier() call would return at its quick-exit test.
+ */
+ ProcSignalBarrierPending = true;
+ InterruptPending = true;
+}
+
+/*
+ * Barrier-processing function for PROCSIGNAL_BARRIER_PLACEHOLDER.
+ *
+ * Does nothing and always reports that the barrier was absorbed.
+ */
+static bool
+ProcessBarrierPlaceholder(void)
+{
+ /*
+ * XXX. This is just a placeholder until the first real user of this
+ * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to
+ * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something
+ * appropriately descriptive. Get rid of this function and instead have
+ * ProcessBarrierSomethingElse. Most likely, that function should live in
+ * the file pertaining to that subsystem, rather than here.
+ *
+ * The return value should be 'true' if the barrier was successfully
+ * absorbed and 'false' if not. Note that returning 'false' can lead to
+ * very frequent retries, so try hard to make that an uncommon case.
+ */
+ return true;
+}
+
+/*
+ * CheckProcSignal - test whether the given reason has been flagged for this
+ * process and, if so, consume it by clearing the flag.  Meant to be called
+ * after SIGUSR1 has been received.
+ *
+ * Returns true only when the flag was found set.
+ */
+static bool
+CheckProcSignal(ProcSignalReason reason)
+{
+	volatile ProcSignalSlot *my_slot = MyProcSignalSlot;
+
+	/* We have no slot (not yet attached, or already cleaned up). */
+	if (my_slot == NULL)
+		return false;
+
+	/* Be careful to leave the flag alone unless we actually saw it set. */
+	if (!my_slot->pss_signalFlags[reason])
+		return false;
+
+	my_slot->pss_signalFlags[reason] = false;
+	return true;
+}
+
+/*
+ * procsignal_sigusr1_handler - SIGUSR1 signal handler.
+ *
+ * Consumes every ProcSignal reason flagged for this process, invokes the
+ * matching interrupt handler for each, and finally sets our latch.  errno
+ * is preserved across the handler.
+ */
+void
+procsignal_sigusr1_handler(SIGNAL_ARGS)
+{
+	/* Recovery-conflict reasons, in the order they are checked. */
+	static const ProcSignalReason conflict_reasons[] = {
+		PROCSIG_RECOVERY_CONFLICT_DATABASE,
+		PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+		PROCSIG_RECOVERY_CONFLICT_LOCK,
+		PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+		PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+		PROCSIG_RECOVERY_CONFLICT_BUFFERPIN
+	};
+	const int	nconflicts =
+		(int) (sizeof(conflict_reasons) / sizeof(conflict_reasons[0]));
+	int			saved_errno = errno;
+
+	if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT))
+		HandleCatchupInterrupt();
+
+	if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT))
+		HandleNotifyInterrupt();
+
+	if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE))
+		HandleParallelMessageInterrupt();
+
+	if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING))
+		HandleWalSndInitStopping();
+
+	if (CheckProcSignal(PROCSIG_BARRIER))
+		HandleProcSignalBarrierInterrupt();
+
+	if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT))
+		HandleLogMemoryContextInterrupt();
+
+	/* All recovery conflicts share one handler, keyed by the reason. */
+	for (int k = 0; k < nconflicts; k++)
+	{
+		if (CheckProcSignal(conflict_reasons[k]))
+			RecoveryConflictInterrupt(conflict_reasons[k]);
+	}
+
+	SetLatch(MyLatch);
+
+	errno = saved_errno;
+}
diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c
new file mode 100644
index 0000000..3240af4
--- /dev/null
+++ b/src/backend/storage/ipc/shm_mq.c
@@ -0,0 +1,1288 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_mq.c
+ * single-reader, single-writer shared memory message queue
+ *
+ * Both the sender and the receiver must have a PGPROC; their respective
+ * process latches are used for synchronization. Only the sender may send,
+ * and only the receiver may receive. This is intended to allow a user
+ * backend to communicate with worker backends that it has registered.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_mq.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/bgworker.h"
+#include "storage/procsignal.h"
+#include "storage/shm_mq.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+/*
+ * This structure represents the actual queue, stored in shared memory.
+ *
+ * Some notes on synchronization:
+ *
+ * mq_receiver and mq_bytes_read can only be changed by the receiver; and
+ * mq_sender and mq_bytes_written can only be changed by the sender.
+ * mq_receiver and mq_sender are protected by mq_mutex, although, importantly,
+ * they cannot change once set, and thus may be read without a lock once this
+ * is known to be the case.
+ *
+ * mq_bytes_read and mq_bytes_written are not protected by the mutex. Instead,
+ * they are written atomically using 8 byte loads and stores. Memory barriers
+ * must be carefully used to synchronize reads and writes of these values with
+ * reads and writes of the actual data in mq_ring.
+ *
+ * mq_detached needs no locking. It can be set by either the sender or the
+ * receiver, but only ever from false to true, so redundant writes don't
+ * matter. It is important that if we set mq_detached and then set the
+ * counterparty's latch, the counterparty must be certain to see the change
+ * after waking up. Since SetLatch begins with a memory barrier and ResetLatch
+ * ends with one, this should be OK.
+ *
+ * mq_ring_size and mq_ring_offset never change after initialization, and
+ * can therefore be read without the lock.
+ *
+ * Importantly, mq_ring can be safely read and written without a lock.
+ * At any given time, the difference between mq_bytes_read and
+ * mq_bytes_written defines the number of bytes within mq_ring that contain
+ * unread data, and mq_bytes_read defines the position where those bytes
+ * begin. The sender can increase the number of unread bytes at any time,
+ * but only the receiver can give license to overwrite those bytes, by
+ * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read
+ * the unread bytes it knows to be present without the lock. Conversely,
+ * the sender can write to the unused portion of the ring buffer without
+ * the lock, because nobody else can be reading or writing those bytes. The
+ * receiver could be making more bytes unused by incrementing mq_bytes_read,
+ * but that's OK. Note that it would be unsafe for the receiver to read any
+ * data it's already marked as read, or to write any data; and it would be
+ * unsafe for the sender to reread any data after incrementing
+ * mq_bytes_written, but fortunately there's no need for any of that.
+ */
+struct shm_mq
+{
+ slock_t mq_mutex; /* protects mq_receiver and mq_sender */
+ PGPROC *mq_receiver; /* receiving process; cannot change once set */
+ PGPROC *mq_sender; /* sending process; cannot change once set */
+ pg_atomic_uint64 mq_bytes_read; /* advanced only by the receiver */
+ pg_atomic_uint64 mq_bytes_written; /* advanced only by the sender */
+ Size mq_ring_size; /* usable ring bytes; fixed after creation */
+ bool mq_detached; /* only ever goes from false to true */
+ /*
+ * shm_mq_create() sets this to
+ * MAXALIGN(offsetof(shm_mq, mq_ring)) - offsetof(shm_mq, mq_ring);
+ * fixed after creation.
+ */
+ uint8 mq_ring_offset;
+ char mq_ring[FLEXIBLE_ARRAY_MEMBER]; /* ring buffer storage */
+};
+
+/*
+ * This structure is a backend-private handle for access to a queue.
+ *
+ * mqh_queue is a pointer to the queue we've attached, and mqh_segment is
+ * an optional pointer to the dynamic shared memory segment that contains it.
+ * (If mqh_segment is provided, we register an on_dsm_detach callback to
+ * make sure we detach from the queue before detaching from DSM.)
+ *
+ * If this queue is intended to connect the current process with a background
+ * worker that started it, the user can pass a pointer to the worker handle
+ * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this
+ * is to allow us to begin sending to or receiving from that queue before the
+ * process we'll be communicating with has even been started. If it fails
+ * to start, the handle will allow us to notice that and fail cleanly, rather
+ * than waiting forever; see shm_mq_wait_internal. This is mostly useful in
+ * simple cases - e.g. where there are just 2 processes communicating; in
+ * more complex scenarios, every process may not have a BackgroundWorkerHandle
+ * available, or may need to watch for the failure of more than one other
+ * process at a time.
+ *
+ * When a message exists as a contiguous chunk of bytes in the queue - that is,
+ * it is smaller than the size of the ring buffer and does not wrap around
+ * the end - we return the message to the caller as a pointer into the buffer.
+ * For messages that are larger or happen to wrap, we reassemble the message
+ * locally by copying the chunks into a backend-local buffer. mqh_buffer is
+ * the buffer, and mqh_buflen is the number of bytes allocated for it.
+ *
+ * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete
+ * are used to track the state of non-blocking operations. When the caller
+ * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they
+ * are expected to retry the call at a later time with the same argument;
+ * we need to retain enough state to pick up where we left off.
+ * mqh_length_word_complete tracks whether we are done sending or receiving
+ * (whichever we're doing) the entire length word. mqh_partial_bytes tracks
+ * the number of bytes read or written for either the length word or the
+ * message itself, and mqh_expected_bytes - which is used only for reads -
+ * tracks the expected total size of the payload.
+ *
+ * mqh_counterparty_attached tracks whether we know the counterparty to have
+ * attached to the queue at some previous point. This lets us avoid some
+ * mutex acquisitions.
+ *
+ * mqh_context is the memory context in effect at the time we attached to
+ * the shm_mq. The shm_mq_handle itself is allocated in this context, and
+ * we make sure any other allocations we do happen in this context as well,
+ * to avoid nasty surprises.
+ */
+struct shm_mq_handle
+{
+ shm_mq *mqh_queue; /* the queue we are attached to */
+ dsm_segment *mqh_segment; /* DSM segment containing it, or NULL */
+ BackgroundWorkerHandle *mqh_handle; /* counterparty's worker handle, or NULL */
+ char *mqh_buffer; /* backend-local reassembly buffer, or NULL */
+ Size mqh_buflen; /* bytes allocated for mqh_buffer */
+ /*
+ * NOTE(review): mqh_consume_pending is not described in the comment
+ * above this struct and its users lie outside this excerpt; it is
+ * initialized to 0 by shm_mq_attach() -- document once confirmed.
+ */
+ Size mqh_consume_pending;
+ Size mqh_partial_bytes; /* bytes of length word or payload done so far */
+ Size mqh_expected_bytes; /* expected payload size; reads only */
+ bool mqh_length_word_complete; /* whole length word sent/received? */
+ bool mqh_counterparty_attached; /* counterparty known to have attached? */
+ MemoryContext mqh_context; /* context in effect at attach time */
+};
+
+static void shm_mq_detach_internal(shm_mq *mq);
+static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes,
+ const void *data, bool nowait, Size *bytes_written);
+static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh,
+ Size bytes_needed, bool nowait, Size *nbytesp,
+ void **datap);
+static bool shm_mq_counterparty_gone(shm_mq *mq,
+ BackgroundWorkerHandle *handle);
+static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr,
+ BackgroundWorkerHandle *handle);
+static void shm_mq_inc_bytes_read(shm_mq *mq, Size n);
+static void shm_mq_inc_bytes_written(shm_mq *mq, Size n);
+static void shm_mq_detach_callback(dsm_segment *seg, Datum arg);
+
+/*
+ * Minimum queue size is enough for header and at least one chunk of data:
+ * the MAXALIGN'd queue header plus MAXIMUM_ALIGNOF bytes of ring space.
+ */
+const Size shm_mq_minimum_size =
+MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF;
+
+/*
+ * NOTE(review): presumably the initial allocation size for a handle's
+ * mqh_buffer; the use site is outside this excerpt -- confirm.
+ */
+#define MQH_INITIAL_BUFSIZE 8192
+
+/*
+ * Set up a new shared message queue in the caller-supplied memory.
+ *
+ * address points at the memory to use and size is its length in bytes.
+ * The return value is the same address, typed as a queue.
+ */
+shm_mq *
+shm_mq_create(void *address, Size size)
+{
+	const Size	header_size = MAXALIGN(offsetof(shm_mq, mq_ring));
+	Size		usable_size;
+	shm_mq	   *queue = address;
+
+	/* Any trailing bytes beyond a MAXALIGN boundary are simply not used. */
+	usable_size = MAXALIGN_DOWN(size);
+
+	/* There must be room for at least some data after the header. */
+	Assert(usable_size > header_size);
+
+	/* Fill in the queue header. */
+	SpinLockInit(&queue->mq_mutex);
+	queue->mq_receiver = NULL;
+	queue->mq_sender = NULL;
+	pg_atomic_init_u64(&queue->mq_bytes_read, 0);
+	pg_atomic_init_u64(&queue->mq_bytes_written, 0);
+	queue->mq_ring_size = usable_size - header_size;
+	queue->mq_detached = false;
+	queue->mq_ring_offset = header_size - offsetof(shm_mq, mq_ring);
+
+	return queue;
+}
+
+/*
+ * Record which process will receive from a shared message queue, and wake
+ * the sender if one has already been registered.
+ */
+void
+shm_mq_set_receiver(shm_mq *mq, PGPROC *proc)
+{
+	PGPROC	   *peer;
+
+	SpinLockAcquire(&mq->mq_mutex);
+	Assert(mq->mq_receiver == NULL);
+	mq->mq_receiver = proc;
+	peer = mq->mq_sender;
+	SpinLockRelease(&mq->mq_mutex);
+
+	/* No sender registered yet, so there is nobody to notify. */
+	if (peer == NULL)
+		return;
+
+	SetLatch(&peer->procLatch);
+}
+
+/*
+ * Record which process will send to a shared message queue, and wake the
+ * receiver if one has already been registered.
+ */
+void
+shm_mq_set_sender(shm_mq *mq, PGPROC *proc)
+{
+	PGPROC	   *peer;
+
+	SpinLockAcquire(&mq->mq_mutex);
+	Assert(mq->mq_sender == NULL);
+	mq->mq_sender = proc;
+	peer = mq->mq_receiver;
+	SpinLockRelease(&mq->mq_mutex);
+
+	/* No receiver registered yet, so there is nobody to notify. */
+	if (peer == NULL)
+		return;
+
+	SetLatch(&peer->procLatch);
+}
+
+/*
+ * Return the process registered as the queue's receiver, or NULL if none
+ * has been set.  The field is read under the queue's spinlock.
+ */
+PGPROC *
+shm_mq_get_receiver(shm_mq *mq)
+{
+	PGPROC	   *result;
+
+	SpinLockAcquire(&mq->mq_mutex);
+	result = mq->mq_receiver;
+	SpinLockRelease(&mq->mq_mutex);
+
+	return result;
+}
+
+/*
+ * Return the process registered as the queue's sender, or NULL if none has
+ * been set.  The field is read under the queue's spinlock.
+ */
+PGPROC *
+shm_mq_get_sender(shm_mq *mq)
+{
+	PGPROC	   *result;
+
+	SpinLockAcquire(&mq->mq_mutex);
+	result = mq->mq_sender;
+	SpinLockRelease(&mq->mq_mutex);
+
+	return result;
+}
+
+/*
+ * Attach to a shared message queue so that we can send or receive messages.
+ *
+ * The handle is allocated in the memory context that is current when this
+ * function is called, and later allocations made to buffer incoming data
+ * use that context too.  It should therefore live at least as long as the
+ * message queue itself.
+ *
+ * If seg != NULL, the queue is detached automatically when that dynamic
+ * shared memory segment is detached.
+ *
+ * If handle != NULL, the queue can be read or written even before the other
+ * process has attached; we'll wait for it to do so if needed.  The handle
+ * must belong to a background worker whose bgw_notify_pid is our PID.
+ *
+ * Call shm_mq_detach() when done.  That frees the shm_mq_handle and marks
+ * the queue itself as detached, so that our counterpart won't get stuck
+ * waiting for us to fill or drain the queue after we have lost interest.
+ */
+shm_mq_handle *
+shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle)
+{
+	shm_mq_handle *result;
+
+	result = palloc(sizeof(shm_mq_handle));
+
+	/* We must already be registered as one end of the queue. */
+	Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc);
+
+	/* What we are attached to. */
+	result->mqh_queue = mq;
+	result->mqh_segment = seg;
+	result->mqh_handle = handle;
+	result->mqh_context = CurrentMemoryContext;
+
+	/* No local buffer yet. */
+	result->mqh_buffer = NULL;
+	result->mqh_buflen = 0;
+
+	/* No operation in progress. */
+	result->mqh_consume_pending = 0;
+	result->mqh_partial_bytes = 0;
+	result->mqh_expected_bytes = 0;
+	result->mqh_length_word_complete = false;
+	result->mqh_counterparty_attached = false;
+
+	if (seg == NULL)
+		return result;
+
+	/* Make sure we detach from the queue before the segment goes away. */
+	on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq));
+
+	return result;
+}
+
+/*
+ * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had
+ * been passed to shm_mq_attach.
+ *
+ * The shm_mq_handle must not already have a worker handle (asserted).
+ */
+void
+shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle)
+{
+ Assert(mqh->mqh_handle == NULL);
+ mqh->mqh_handle = handle;
+}
+
+/*
+ * Write a single contiguous message into a shared message queue.
+ *
+ * This is a convenience wrapper that hands shm_mq_sendv() a one-element
+ * vector; the result is whatever shm_mq_sendv() returns.
+ */
+shm_mq_result
+shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait)
+{
+	shm_mq_iovec single = {.data = data,.len = nbytes};
+
+	return shm_mq_sendv(mqh, &single, 1, nowait);
+}
+
+/*
+ * Write a message into a shared message queue, gathered from multiple
+ * addresses.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * fills up, and then continue writing once the receiver has drained some data.
+ * The process latch is reset after each wait.
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In
+ * this case, the caller should call this function again, with the same
+ * arguments, each time the process latch is set. (Once begun, the sending
+ * of a message cannot be aborted except by detaching from the queue; changing
+ * the length or payload will corrupt the queue.)
+ *
+ * mqh: handle of the queue; the caller must be the queue's sender.
+ * iov, iovcnt: the pieces making up the message, sent in array order.
+ *
+ * Returns SHM_MQ_SUCCESS once the whole message has been queued,
+ * SHM_MQ_DETACHED if the queue is detached, or otherwise the non-success
+ * result of shm_mq_send_bytes(). Raises an ERROR if the total message size
+ * exceeds MaxAllocSize.
+ */
+shm_mq_result
+shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait)
+{
+ shm_mq_result res;
+ shm_mq *mq = mqh->mqh_queue;
+ PGPROC *receiver;
+ Size nbytes = 0;
+ Size bytes_written;
+ int i;
+ int which_iov = 0; /* iovec currently being sent from */
+ Size offset; /* position within iov[which_iov] */
+
+ Assert(mq->mq_sender == MyProc);
+
+ /* Compute total size of write. */
+ for (i = 0; i < iovcnt; ++i)
+ nbytes += iov[i].len;
+
+ /* Prevent writing messages overwhelming the receiver. */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot send a message of size %zu via shared memory queue",
+ nbytes)));
+
+ /* Try to write, or finish writing, the length word into the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ ((char *) &nbytes) + mqh->mqh_partial_bytes,
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+ mqh->mqh_partial_bytes += bytes_written;
+
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+
+ /* From here on mqh_partial_bytes counts payload bytes. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = true;
+ }
+
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /* Length word can't be split unless bigger than required alignment. */
+ Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF);
+ }
+
+ /*
+ * Write the actual data bytes into the buffer. On a retried call,
+ * mqh_partial_bytes says how much payload is already queued; the loop
+ * first skips that many bytes across the iovecs.
+ */
+ Assert(mqh->mqh_partial_bytes <= nbytes);
+ offset = mqh->mqh_partial_bytes;
+ do
+ {
+ Size chunksize;
+
+ /* Figure out which bytes need to be sent next. */
+ if (offset >= iov[which_iov].len)
+ {
+ offset -= iov[which_iov].len;
+ ++which_iov;
+ if (which_iov >= iovcnt)
+ break;
+ continue;
+ }
+
+ /*
+ * We want to avoid copying the data if at all possible, but every
+ * chunk of bytes we write into the queue has to be MAXALIGN'd, except
+ * the last. Thus, if a chunk other than the last one ends on a
+ * non-MAXALIGN'd boundary, we have to combine the tail end of its
+ * data with data from one or more following chunks until we either
+ * reach the last chunk or accumulate a number of bytes which is
+ * MAXALIGN'd.
+ */
+ if (which_iov + 1 < iovcnt &&
+ offset + MAXIMUM_ALIGNOF > iov[which_iov].len)
+ {
+ char tmpbuf[MAXIMUM_ALIGNOF];
+ int j = 0; /* bytes gathered into tmpbuf so far */
+
+ for (;;)
+ {
+ if (offset < iov[which_iov].len)
+ {
+ tmpbuf[j] = iov[which_iov].data[offset];
+ j++;
+ offset++;
+ if (j == MAXIMUM_ALIGNOF)
+ break;
+ }
+ else
+ {
+ offset -= iov[which_iov].len;
+ which_iov++;
+ if (which_iov >= iovcnt)
+ break;
+ }
+ }
+
+ res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+ return res;
+ }
+
+ /*
+ * offset/which_iov were already advanced while filling tmpbuf,
+ * so only the persistent byte count is updated here.
+ */
+ mqh->mqh_partial_bytes += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ continue;
+ }
+
+ /*
+ * If this is the last chunk, we can write all the data, even if it
+ * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to
+ * MAXALIGN_DOWN the write size.
+ */
+ chunksize = iov[which_iov].len - offset;
+ if (which_iov + 1 < iovcnt)
+ chunksize = MAXALIGN_DOWN(chunksize);
+ res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset],
+ nowait, &bytes_written);
+
+ if (res == SHM_MQ_DETACHED)
+ {
+ /* Reset state in case caller tries to send another message. */
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return res;
+ }
+
+ mqh->mqh_partial_bytes += bytes_written;
+ offset += bytes_written;
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ } while (mqh->mqh_partial_bytes < nbytes);
+
+ /* Reset for next message. */
+ mqh->mqh_partial_bytes = 0;
+ mqh->mqh_length_word_complete = false;
+
+ /* If queue has been detached, let caller know. */
+ if (mq->mq_detached)
+ return SHM_MQ_DETACHED;
+
+ /*
+ * If the counterparty is known to have attached, we can read mq_receiver
+ * without acquiring the spinlock and assume it isn't NULL. Otherwise,
+ * more caution is needed.
+ */
+ if (mqh->mqh_counterparty_attached)
+ receiver = mq->mq_receiver;
+ else
+ {
+ SpinLockAcquire(&mq->mq_mutex);
+ receiver = mq->mq_receiver;
+ SpinLockRelease(&mq->mq_mutex);
+ /* No receiver registered yet, so there is no latch to set. */
+ if (receiver == NULL)
+ return SHM_MQ_SUCCESS;
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /* Notify receiver of the newly-written data, and return. */
+ SetLatch(&receiver->procLatch);
+ return SHM_MQ_SUCCESS;
+}
+
+/*
+ * Receive a message from a shared message queue.
+ *
+ * We set *nbytesp to the message length and *datap to point to the message
+ * payload. If the entire message exists in the queue as a single,
+ * contiguous chunk, *datap will point directly into shared memory; otherwise,
+ * it will point to a temporary buffer. This mostly avoids data copying in
+ * the hoped-for case where messages are short compared to the buffer size,
+ * while still allowing longer messages. In either case, the return value
+ * remains valid until the next receive operation is performed on the queue.
+ *
+ * When nowait = false, we'll wait on our process latch when the ring buffer
+ * is empty and we have not yet received a full message. The sender will
+ * set our process latch after more data has been written, and we'll resume
+ * processing. Each call will therefore return a complete message
+ * (unless the sender detaches the queue).
+ *
+ * When nowait = true, we do not manipulate the state of the process latch;
+ * instead, whenever the buffer is empty and we need to read from it, we
+ * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this
+ * function again after the process latch has been set.
+ *
+ * Returns SHM_MQ_SUCCESS once a complete message is available, or
+ * SHM_MQ_DETACHED if the queue is found to be detached before a complete
+ * message could be read. Progress on a partially-received message is kept
+ * in the handle (mqh_length_word_complete, mqh_partial_bytes,
+ * mqh_expected_bytes), so a later call resumes where this one stopped. A
+ * length word larger than MaxAllocSize is reported with ereport(ERROR).
+ */
+shm_mq_result
+shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ shm_mq_result res;
+ Size rb = 0;
+ Size nbytes;
+ void *rawdata;
+
+ Assert(mq->mq_receiver == MyProc);
+
+ /* We can't receive data until the sender has attached. */
+ if (!mqh->mqh_counterparty_attached)
+ {
+ if (nowait)
+ {
+ int counterparty_gone;
+
+ /*
+ * We shouldn't return at this point at all unless the sender
+ * hasn't attached yet. However, the correct return value depends
+ * on whether the sender is still attached. If we first test
+ * whether the sender has ever attached and then test whether the
+ * sender has detached, there's a race condition: a sender that
+ * attaches and detaches very quickly might fool us into thinking
+ * the sender never attached at all. So, test whether our
+ * counterparty is definitively gone first, and only afterwards
+ * check whether the sender ever attached in the first place.
+ */
+ counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle);
+ if (shm_mq_get_sender(mq) == NULL)
+ {
+ if (counterparty_gone)
+ return SHM_MQ_DETACHED;
+ else
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle)
+ && shm_mq_get_sender(mq) == NULL)
+ {
+ mq->mq_detached = true;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+ }
+
+ /*
+ * If we've consumed an amount of data greater than 1/4th of the ring
+ * size, mark it consumed in shared memory. We try to avoid doing this
+ * unnecessarily when only a small amount of data has been consumed,
+ * because SetLatch() is fairly expensive and we don't want to do it too
+ * often.
+ */
+ if (mqh->mqh_consume_pending > mq->mq_ring_size / 4)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Try to read, or finish reading, the length word from the buffer. */
+ while (!mqh->mqh_length_word_complete)
+ {
+ /* Try to receive the message length word. */
+ Assert(mqh->mqh_partial_bytes < sizeof(Size));
+ res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes,
+ nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+
+ /*
+ * Hopefully, we'll receive the entire message length word at once.
+ * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over
+ * multiple reads.
+ */
+ if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size))
+ {
+ Size needed;
+
+ nbytes = *(Size *) rawdata;
+
+ /* If we've already got the whole message, we're done. */
+ needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes);
+ if (rb >= needed)
+ {
+ mqh->mqh_consume_pending += needed;
+ *nbytesp = nbytes;
+ *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size));
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * We don't have the whole message, but we at least have the whole
+ * length word.
+ */
+ mqh->mqh_expected_bytes = nbytes;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_consume_pending += MAXALIGN(sizeof(Size));
+ rb -= MAXALIGN(sizeof(Size));
+ }
+ else
+ {
+ Size lengthbytes;
+
+ /* Can't be split unless bigger than required alignment. */
+ Assert(sizeof(Size) > MAXIMUM_ALIGNOF);
+
+ /* Message word is split; need buffer to reassemble. */
+ if (mqh->mqh_buffer == NULL)
+ {
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context,
+ MQH_INITIAL_BUFSIZE);
+ mqh->mqh_buflen = MQH_INITIAL_BUFSIZE;
+ }
+ Assert(mqh->mqh_buflen >= sizeof(Size));
+
+ /* Copy partial length word; remember to consume it. */
+ if (mqh->mqh_partial_bytes + rb > sizeof(Size))
+ lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes;
+ else
+ lengthbytes = rb;
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata,
+ lengthbytes);
+ mqh->mqh_partial_bytes += lengthbytes;
+ mqh->mqh_consume_pending += MAXALIGN(lengthbytes);
+ rb -= lengthbytes;
+
+ /* If we now have the whole word, we're ready to read payload. */
+ if (mqh->mqh_partial_bytes >= sizeof(Size))
+ {
+ Assert(mqh->mqh_partial_bytes == sizeof(Size));
+ mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = true;
+ mqh->mqh_partial_bytes = 0;
+ }
+ }
+ }
+ nbytes = mqh->mqh_expected_bytes;
+
+ /*
+ * Should be disallowed on the sending side already, but better check and
+ * error out on the receiver side as well rather than trying to read a
+ * prohibitively large message.
+ */
+ if (nbytes > MaxAllocSize)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("invalid message size %zu in shared memory queue",
+ nbytes)));
+
+ if (mqh->mqh_partial_bytes == 0)
+ {
+ /*
+ * No payload bytes have been copied into mqh_buffer yet, i.e. we are
+ * not resuming a partially-copied message.
+ *
+ * Try to obtain the whole message in a single chunk. If this works,
+ * we need not copy the data and can return a pointer directly into
+ * shared memory.
+ */
+ res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb >= nbytes)
+ {
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_consume_pending += MAXALIGN(nbytes);
+ *nbytesp = nbytes;
+ *datap = rawdata;
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * The message has wrapped the buffer. We'll need to copy it in order
+ * to return it to the client in one chunk. First, make sure we have
+ * a large enough buffer available.
+ */
+ if (mqh->mqh_buflen < nbytes)
+ {
+ Size newbuflen = Max(mqh->mqh_buflen, MQH_INITIAL_BUFSIZE);
+
+ /*
+ * Double the buffer size until the payload fits, but limit to
+ * MaxAllocSize.
+ */
+ while (newbuflen < nbytes)
+ newbuflen *= 2;
+ newbuflen = Min(newbuflen, MaxAllocSize);
+
+ if (mqh->mqh_buffer != NULL)
+ {
+ pfree(mqh->mqh_buffer);
+ mqh->mqh_buffer = NULL;
+ mqh->mqh_buflen = 0;
+ }
+ mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen);
+ mqh->mqh_buflen = newbuflen;
+ }
+ }
+
+ /* Loop until we've copied the entire message. */
+ for (;;)
+ {
+ Size still_needed;
+
+ /* Copy as much as we can. */
+ Assert(mqh->mqh_partial_bytes + rb <= nbytes);
+ if (rb > 0)
+ {
+ memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb);
+ mqh->mqh_partial_bytes += rb;
+ }
+
+ /*
+ * Update count of bytes that can be consumed, accounting for
+ * alignment padding. Note that this will never actually insert any
+ * padding except at the end of a message, because the buffer size is
+ * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well.
+ */
+ Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb));
+ mqh->mqh_consume_pending += MAXALIGN(rb);
+
+ /* If we got all the data, exit the loop. */
+ if (mqh->mqh_partial_bytes >= nbytes)
+ break;
+
+ /* Wait for some more data. */
+ still_needed = nbytes - mqh->mqh_partial_bytes;
+ res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata);
+ if (res != SHM_MQ_SUCCESS)
+ return res;
+ if (rb > still_needed)
+ rb = still_needed;
+ }
+
+ /* Return the complete message, and reset for next message. */
+ *nbytesp = nbytes;
+ *datap = mqh->mqh_buffer;
+ mqh->mqh_length_word_complete = false;
+ mqh->mqh_partial_bytes = 0;
+ return SHM_MQ_SUCCESS;
+}
+
+/*
+ * Block until the process at the other end of this queue has attached.
+ *
+ * Whichever role we hold (receiver or sender), we wait for the opposite
+ * slot of the queue to be filled in.
+ *
+ * Returns SHM_MQ_SUCCESS if we detect that the worker has attached, and
+ * SHM_MQ_DETACHED if the worker has already detached or if it dies. Note
+ * that we will only be able to detect that the worker has died before
+ * attaching if a background worker handle was passed to shm_mq_attach().
+ */
+shm_mq_result
+shm_mq_wait_for_attach(shm_mq_handle *mqh)
+{
+	shm_mq	   *queue = mqh->mqh_queue;
+	PGPROC	  **counterparty;
+
+	/* Pick the slot belonging to the other side of the queue. */
+	if (shm_mq_get_receiver(queue) != MyProc)
+	{
+		Assert(shm_mq_get_sender(queue) == MyProc);
+		counterparty = &queue->mq_receiver;
+	}
+	else
+		counterparty = &queue->mq_sender;
+
+	if (!shm_mq_wait_internal(queue, counterparty, mqh->mqh_handle))
+		return SHM_MQ_DETACHED;
+
+	return SHM_MQ_SUCCESS;
+}
+
+/*
+ * Detach from a shared message queue and free the shm_mq_handle, including
+ * its local reassembly buffer if one was allocated.
+ */
+void
+shm_mq_detach(shm_mq_handle *mqh)
+{
+	shm_mq	   *queue = mqh->mqh_queue;
+
+	/* First, let the counterparty know we are leaving. */
+	shm_mq_detach_internal(queue);
+
+	/* The on_dsm_detach callback, if one was registered, is now moot. */
+	if (mqh->mqh_segment != NULL)
+		cancel_on_dsm_detach(mqh->mqh_segment,
+							 shm_mq_detach_callback,
+							 PointerGetDatum(queue));
+
+	/* Finally, give back the backend-local memory owned by the handle. */
+	if (mqh->mqh_buffer != NULL)
+		pfree(mqh->mqh_buffer);
+	pfree(mqh);
+}
+
+/*
+ * Tell the process at the other end of the queue that we are detaching.
+ *
+ * This ensures that the process with which we're communicating doesn't
+ * block forever waiting for us to fill or drain the queue once we've lost
+ * interest. When the sender detaches, the receiver can read any messages
+ * remaining in the queue; further reads will return SHM_MQ_DETACHED. If
+ * the receiver detaches, further attempts to send messages will likewise
+ * return SHM_MQ_DETACHED.
+ *
+ * This is kept apart from shm_mq_detach() because the on_dsm_detach
+ * callback must do only this much: the local shm_mq_handle may have been
+ * pfree'd already by then, so it is not touched here.
+ */
+static void
+shm_mq_detach_internal(shm_mq *mq)
+{
+	PGPROC	   *counterparty;
+
+	/* Look up the other side and flag the queue detached under the lock. */
+	SpinLockAcquire(&mq->mq_mutex);
+	if (mq->mq_sender != MyProc)
+	{
+		Assert(mq->mq_receiver == MyProc);
+		counterparty = mq->mq_sender;
+	}
+	else
+		counterparty = mq->mq_receiver;
+	mq->mq_detached = true;
+	SpinLockRelease(&mq->mq_mutex);
+
+	/* Nobody to wake if the other side never attached. */
+	if (counterparty == NULL)
+		return;
+
+	SetLatch(&counterparty->procLatch);
+}
+
+/*
+ * Return the shared shm_mq that the given handle refers to.
+ */
+shm_mq *
+shm_mq_get_queue(shm_mq_handle *mqh)
+{
+	shm_mq	   *queue = mqh->mqh_queue;
+
+	return queue;
+}
+
+/*
+ * Write bytes into a shared message queue.
+ *
+ * Copies nbytes bytes from data into the ring buffer, looping until all of
+ * them have been written. On every return path *bytes_written is set to
+ * the number of bytes copied so far. Returns SHM_MQ_SUCCESS when all
+ * nbytes were written, SHM_MQ_DETACHED if the queue is (or becomes)
+ * detached, or SHM_MQ_WOULD_BLOCK if nowait is true and no ring space is
+ * currently available.
+ */
+static shm_mq_result
+shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data,
+ bool nowait, Size *bytes_written)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size sent = 0;
+ uint64 used;
+ Size ringsize = mq->mq_ring_size;
+ Size available;
+
+ while (sent < nbytes)
+ {
+ uint64 rb;
+ uint64 wb;
+
+ /* Compute number of ring buffer bytes used and available. */
+ rb = pg_atomic_read_u64(&mq->mq_bytes_read);
+ wb = pg_atomic_read_u64(&mq->mq_bytes_written);
+ Assert(wb >= rb);
+ used = wb - rb;
+ Assert(used <= ringsize);
+ available = Min(ringsize - used, nbytes - sent);
+
+ /*
+ * Bail out if the queue has been detached. Note that we would be in
+ * trouble if the compiler decided to cache the value of
+ * mq->mq_detached in a register or on the stack across loop
+ * iterations. It probably shouldn't do that anyway since we'll
+ * always return, call an external function that performs a system
+ * call, or reach a memory barrier at some point later in the loop,
+ * but just to be sure, insert a compiler barrier here.
+ */
+ pg_compiler_barrier();
+ if (mq->mq_detached)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+
+ if (available == 0 && !mqh->mqh_counterparty_attached)
+ {
+ /*
+ * The queue is full, so if the receiver isn't yet known to be
+ * attached, we must wait for that to happen.
+ */
+ if (nowait)
+ {
+ if (shm_mq_counterparty_gone(mq, mqh->mqh_handle))
+ {
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ if (shm_mq_get_receiver(mq) == NULL)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+ }
+ else if (!shm_mq_wait_internal(mq, &mq->mq_receiver,
+ mqh->mqh_handle))
+ {
+ mq->mq_detached = true;
+ *bytes_written = sent;
+ return SHM_MQ_DETACHED;
+ }
+ mqh->mqh_counterparty_attached = true;
+
+ /*
+ * The receiver may have read some data after attaching, so we
+ * must not wait without rechecking the queue state.
+ */
+ }
+ else if (available == 0)
+ {
+ /*
+ * Since mqh->mqh_counterparty_attached is known to be true at
+ * this point, mq_receiver has been set, and it can't change once
+ * set. Therefore, we can read it without acquiring the spinlock.
+ */
+ Assert(mqh->mqh_counterparty_attached);
+ SetLatch(&mq->mq_receiver->procLatch);
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ {
+ *bytes_written = sent;
+ return SHM_MQ_WOULD_BLOCK;
+ }
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip
+ * through the loop. It's worth it to avoid resetting the latch
+ * at top of loop, because setting an already-set latch is much
+ * cheaper than setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_SEND);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+ else
+ {
+ Size offset;
+ Size sendnow;
+
+ offset = wb % (uint64) ringsize;
+ sendnow = Min(available, ringsize - offset);
+
+ /*
+ * Write as much data as we can via a single memcpy(). Make sure
+ * these writes happen after the read of mq_bytes_read, above.
+ * This barrier pairs with the one in shm_mq_inc_bytes_read.
+ * (Since we're separating the read of mq_bytes_read from a
+ * subsequent write to mq_ring, we need a full barrier here.)
+ */
+ pg_memory_barrier();
+ memcpy(&mq->mq_ring[mq->mq_ring_offset + offset],
+ (char *) data + sent, sendnow);
+ sent += sendnow;
+
+ /*
+ * Update count of bytes written, with alignment padding. Note
+ * that this will never actually insert any padding except at the
+ * end of a run of bytes, because the buffer size is a multiple of
+ * MAXIMUM_ALIGNOF, and each read and write is as well.
+ */
+ Assert(sent == nbytes || sendnow == MAXALIGN(sendnow));
+ shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow));
+
+ /*
+ * For efficiency, we don't set the reader's latch here. We'll do
+ * that only when the buffer fills up or after writing an entire
+ * message.
+ */
+ }
+ }
+
+ *bytes_written = sent;
+ return SHM_MQ_SUCCESS;
+}
+
+/*
+ * Wait until at least bytes_needed bytes are available to be read from the
+ * shared message queue, or until the buffer wraps around. If the queue is
+ * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait
+ * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set
+ * to the location at which data bytes can be read, *nbytesp is set to the
+ * number of bytes which can be read at that address, and the return value
+ * is SHM_MQ_SUCCESS.
+ *
+ * Bytes counted in mqh->mqh_consume_pending are treated as already read
+ * when computing what is available; before waiting (or returning
+ * SHM_MQ_WOULD_BLOCK) they are flushed to shared memory via
+ * shm_mq_inc_bytes_read().
+ */
+static shm_mq_result
+shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait,
+ Size *nbytesp, void **datap)
+{
+ shm_mq *mq = mqh->mqh_queue;
+ Size ringsize = mq->mq_ring_size;
+ uint64 used;
+ uint64 written;
+
+ for (;;)
+ {
+ Size offset;
+ uint64 read;
+
+ /* Get bytes written, so we can compute what's available to read. */
+ written = pg_atomic_read_u64(&mq->mq_bytes_written);
+
+ /*
+ * Get bytes read. Include bytes we could consume but have not yet
+ * consumed.
+ */
+ read = pg_atomic_read_u64(&mq->mq_bytes_read) +
+ mqh->mqh_consume_pending;
+ used = written - read;
+ Assert(used <= ringsize);
+ offset = read % (uint64) ringsize;
+
+ /* If we have enough data or buffer has wrapped, we're done. */
+ if (used >= bytes_needed || offset + used >= ringsize)
+ {
+ *nbytesp = Min(used, ringsize - offset);
+ *datap = &mq->mq_ring[mq->mq_ring_offset + offset];
+
+ /*
+ * Separate the read of mq_bytes_written, above, from caller's
+ * attempt to read the data itself. Pairs with the barrier in
+ * shm_mq_inc_bytes_written.
+ */
+ pg_read_barrier();
+ return SHM_MQ_SUCCESS;
+ }
+
+ /*
+ * Fall out before waiting if the queue has been detached.
+ *
+ * Note that we don't check for this until *after* considering whether
+ * the data already available is enough, since the receiver can finish
+ * receiving a message stored in the buffer even after the sender has
+ * detached.
+ */
+ if (mq->mq_detached)
+ {
+ /*
+ * If the writer advanced mq_bytes_written and then set
+ * mq_detached, we might not have read the final value of
+ * mq_bytes_written above. Insert a read barrier and then check
+ * again if mq_bytes_written has advanced.
+ */
+ pg_read_barrier();
+ if (written != pg_atomic_read_u64(&mq->mq_bytes_written))
+ continue;
+
+ return SHM_MQ_DETACHED;
+ }
+
+ /*
+ * We didn't get enough data to satisfy the request, so mark any data
+ * previously-consumed as read to make more buffer space.
+ */
+ if (mqh->mqh_consume_pending > 0)
+ {
+ shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending);
+ mqh->mqh_consume_pending = 0;
+ }
+
+ /* Skip manipulation of our latch if nowait = true. */
+ if (nowait)
+ return SHM_MQ_WOULD_BLOCK;
+
+ /*
+ * Wait for our latch to be set. It might already be set for some
+ * unrelated reason, but that'll just result in one extra trip through
+ * the loop. It's worth it to avoid resetting the latch at top of
+ * loop, because setting an already-set latch is much cheaper than
+ * setting one that has been reset.
+ */
+ (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+ WAIT_EVENT_MQ_RECEIVE);
+
+ /* Reset the latch so we don't spin. */
+ ResetLatch(MyLatch);
+
+ /* An interrupt may have occurred while we were waiting. */
+ CHECK_FOR_INTERRUPTS();
+ }
+}
+
+/*
+ * Report whether the counterparty, which might not even have started yet,
+ * is definitely gone.
+ *
+ * Returns true if the queue is already marked detached, or if a background
+ * worker handle was supplied and the worker is neither started nor waiting
+ * to be started; in the latter case the queue is also marked detached.
+ * Returns false otherwise.
+ */
+static bool
+shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle)
+{
+	BgwHandleStatus status;
+	pid_t		pid;
+
+	/* A detached queue means the counterparty is definitely gone. */
+	if (mq->mq_detached)
+		return true;
+
+	/* Without a handle we have no further way to find out. */
+	if (handle == NULL)
+		return false;
+
+	/* Ask about the worker; any live or pending state means "not gone". */
+	status = GetBackgroundWorkerPid(handle, &pid);
+	if (status == BGWH_STARTED || status == BGWH_NOT_YET_STARTED)
+		return false;
+
+	/* Unexpected worker death: mark it detached, to make it official. */
+	mq->mq_detached = true;
+	return true;
+}
+
+/*
+ * Wait for our counterpart to attach to the queue.
+ *
+ * ptr points at the queue slot that we expect to become non-NULL once the
+ * counterpart has attached. We return true when that happens. We return
+ * false if the queue is marked detached or, when handle != NULL, if the
+ * referenced background worker is found to be neither started nor pending.
+ * Postmaster death is handled by WL_EXIT_ON_PM_DEATH.
+ *
+ * Note that if handle == NULL and the process fails to attach, we'll
+ * potentially get stuck here forever waiting for a process that may never
+ * start. We do check for interrupts, though.
+ */
+static bool
+shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle)
+{
+	for (;;)
+	{
+		bool		attached;
+
+		/* Hold the lock only for as long as it takes to peek at the slot. */
+		SpinLockAcquire(&mq->mq_mutex);
+		attached = (*ptr != NULL);
+		SpinLockRelease(&mq->mq_mutex);
+
+		/* Detachment takes precedence over a filled-in slot. */
+		if (mq->mq_detached)
+			return false;
+		if (attached)
+			return true;
+
+		if (handle != NULL)
+		{
+			BgwHandleStatus status;
+			pid_t		pid;
+
+			/* Give up if the worker died unexpectedly. */
+			status = GetBackgroundWorkerPid(handle, &pid);
+			if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED)
+				return false;
+		}
+
+		/* Sleep until somebody signals us. */
+		(void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0,
+						 WAIT_EVENT_MQ_INTERNAL);
+
+		/* Clear the latch so the next iteration doesn't spin. */
+		ResetLatch(MyLatch);
+
+		/* We may have been interrupted while asleep. */
+		CHECK_FOR_INTERRUPTS();
+	}
+}
+
+/*
+ * Increment the number of bytes read by n, and set the sender's latch so
+ * that it notices the newly-freed ring space.
+ */
+static void
+shm_mq_inc_bytes_read(shm_mq *mq, Size n)
+{
+ PGPROC *sender;
+
+ /*
+ * Separate prior reads of mq_ring from the increment of mq_bytes_read
+ * which follows. This pairs with the full barrier in
+ * shm_mq_send_bytes(). We only need a read barrier here because the
+ * increment of mq_bytes_read is actually a read followed by a dependent
+ * write.
+ */
+ pg_read_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method should be cheaper.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_read,
+ pg_atomic_read_u64(&mq->mq_bytes_read) + n);
+
+ /*
+ * We shouldn't have any bytes to read without a sender, so we can read
+ * mq_sender here without a lock. Once it's initialized, it can't change.
+ */
+ sender = mq->mq_sender;
+ Assert(sender != NULL);
+ SetLatch(&sender->procLatch);
+}
+
+/*
+ * Increment the number of bytes written by n. Unlike
+ * shm_mq_inc_bytes_read(), this does not set any latch.
+ */
+static void
+shm_mq_inc_bytes_written(shm_mq *mq, Size n)
+{
+ /*
+ * Separate prior writes to mq_ring from the write of mq_bytes_written
+ * which we're about to do. Pairs with the read barrier found in
+ * shm_mq_receive_bytes.
+ */
+ pg_write_barrier();
+
+ /*
+ * There's no need to use pg_atomic_fetch_add_u64 here, because nobody
+ * else can be changing this value. This method avoids taking the bus
+ * lock unnecessarily.
+ */
+ pg_atomic_write_u64(&mq->mq_bytes_written,
+ pg_atomic_read_u64(&mq->mq_bytes_written) + n);
+}
+
+/*
+ * on_dsm_detach callback shim: unwrap the queue pointer carried in arg and
+ * perform the counterparty notification. The segment argument is unused.
+ */
+static void
+shm_mq_detach_callback(dsm_segment *seg, Datum arg)
+{
+	shm_mq_detach_internal((shm_mq *) DatumGetPointer(arg));
+}
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
new file mode 100644
index 0000000..863b98b
--- /dev/null
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -0,0 +1,272 @@
+/*-------------------------------------------------------------------------
+ *
+ * shm_toc.c
+ * shared memory segment table of contents
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/backend/storage/ipc/shm_toc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "port/atomics.h"
+#include "storage/shm_toc.h"
+#include "storage/spin.h"
+
+typedef struct shm_toc_entry
+{
+ uint64 key; /* Caller-chosen identifier, matched by shm_toc_lookup() */
+ Size offset; /* Offset, in bytes, from TOC start (not a pointer) */
+} shm_toc_entry;
+
+struct shm_toc
+{
+ uint64 toc_magic; /* Magic number identifying this TOC */
+ slock_t toc_mutex; /* Spinlock taken by allocate/insert/freespace */
+ Size toc_total_bytes; /* Bytes managed by this TOC (buffer-aligned) */
+ Size toc_allocated_bytes; /* Bytes handed out so far, from the end */
+ uint32 toc_nentry; /* Number of valid entries in toc_entry[] */
+ shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER]; /* Entries, growing forward */
+};
+
+/*
+ * Set up a table of contents at the start of a region of shared memory.
+ *
+ * address is the start of the region and nbytes its size; the size recorded
+ * in the TOC is nbytes rounded down with BUFFERALIGN_DOWN. The new TOC has
+ * no entries and no allocated chunks. Returns address, typed as a shm_toc.
+ */
+shm_toc *
+shm_toc_create(uint64 magic, void *address, Size nbytes)
+{
+	shm_toc    *new_toc;
+
+	/* The region must at least hold the fixed-size TOC header. */
+	Assert(nbytes > offsetof(shm_toc, toc_entry));
+
+	new_toc = (shm_toc *) address;
+	new_toc->toc_magic = magic;
+	SpinLockInit(&new_toc->toc_mutex);
+
+	/*
+	 * shm_toc_allocate()'s alignment logic relies on the total size being
+	 * buffer-aligned to begin with.
+	 */
+	new_toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes);
+	new_toc->toc_allocated_bytes = 0;
+	new_toc->toc_nentry = 0;
+
+	return new_toc;
+}
+
+/*
+ * Attach to a table of contents that already exists at address.
+ *
+ * Returns the TOC if its magic number matches the one we were given, and
+ * NULL if it does not.
+ */
+shm_toc *
+shm_toc_attach(uint64 magic, void *address)
+{
+	shm_toc    *candidate = (shm_toc *) address;
+
+	if (candidate->toc_magic == magic)
+	{
+		/* Basic sanity checks on the size bookkeeping. */
+		Assert(candidate->toc_total_bytes >= candidate->toc_allocated_bytes);
+		Assert(candidate->toc_total_bytes > offsetof(shm_toc, toc_entry));
+
+		return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * Carve a chunk of shared memory out of a segment managed by a TOC.
+ *
+ * There is no way to free what is handed out here; this is not a real
+ * allocator, only a means of splitting one physical shared memory segment
+ * into logical pieces for different purposes.
+ *
+ * Chunks are taken from the end of the segment working backwards, so that
+ * the TOC entry array can grow forward from the start.
+ *
+ * The request is rounded up with BUFFERALIGN. If it does not fit, or the
+ * size arithmetic overflows, ERROR "out of shared memory" is raised.
+ */
+void *
+shm_toc_allocate(shm_toc *toc, Size nbytes)
+{
+	volatile shm_toc *vtoc = toc;
+	Size		total;
+	Size		already_allocated;
+	Size		entry_count;
+	Size		in_use;
+	Size		needed;
+
+	/*
+	 * Round the request up. XXX: MAXALIGN would not be enough, since atomic
+	 * operations may want wider alignment. There's no proper definition of
+	 * the minimum that makes atomics safe, but BUFFERALIGN ought to do.
+	 */
+	nbytes = BUFFERALIGN(nbytes);
+
+	SpinLockAcquire(&toc->toc_mutex);
+
+	total = vtoc->toc_total_bytes;
+	already_allocated = vtoc->toc_allocated_bytes;
+	entry_count = vtoc->toc_nentry;
+
+	/* Space consumed by header, entry array and existing chunks. */
+	in_use = offsetof(shm_toc, toc_entry)
+		+ entry_count * sizeof(shm_toc_entry)
+		+ already_allocated;
+	needed = in_use + nbytes;
+
+	/* Reject on exhaustion, or if the addition wrapped around. */
+	if (needed > total || needed < in_use)
+	{
+		SpinLockRelease(&toc->toc_mutex);
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of shared memory")));
+	}
+	vtoc->toc_allocated_bytes += nbytes;
+
+	SpinLockRelease(&toc->toc_mutex);
+
+	/* The new chunk sits just below everything allocated before it. */
+	return ((char *) toc) + (total - already_allocated - nbytes);
+}
+
+/*
+ * Report how many more bytes could be allocated from this TOC's segment.
+ *
+ * The figure is the total size less the chunks already allocated and less
+ * the BUFFERALIGN'd size of the TOC header plus its current entry array.
+ */
+Size
+shm_toc_freespace(shm_toc *toc)
+{
+	volatile shm_toc *vtoc = toc;
+	Size		total;
+	Size		allocated;
+	Size		entry_count;
+	Size		reserved;
+
+	/* Snapshot the counters while holding the lock. */
+	SpinLockAcquire(&toc->toc_mutex);
+	total = vtoc->toc_total_bytes;
+	allocated = vtoc->toc_allocated_bytes;
+	entry_count = vtoc->toc_nentry;
+	SpinLockRelease(&toc->toc_mutex);
+
+	/* Chunks already handed out, plus the aligned header and entries. */
+	reserved = allocated +
+		BUFFERALIGN(offsetof(shm_toc, toc_entry) +
+					entry_count * sizeof(shm_toc_entry));
+	Assert(reserved <= total);
+
+	return total - reserved;
+}
+
+/*
+ * Insert a TOC entry.
+ *
+ * The idea here is that the process setting up the shared memory segment will
+ * register the addresses of data structures within the segment using this
+ * function. Each data structure will be identified using a 64-bit key, which
+ * is assumed to be a well-known or discoverable integer. Other processes
+ * accessing the shared memory segment can pass the same key to
+ * shm_toc_lookup() to discover the addresses of those data structures.
+ *
+ * Since the shared memory segment may be mapped at different addresses within
+ * different backends, we store relative rather than absolute pointers.
+ *
+ * This won't scale well to a large number of keys. Hopefully, that isn't
+ * necessary; if it proves to be, we might need to provide a more sophisticated
+ * data structure here. But the real idea here is just to give someone mapping
+ * a dynamic shared memory segment the ability to find the bare minimum number
+ * of pointers that they need to bootstrap. If you're storing a lot of stuff
+ * in the TOC, you're doing it wrong.
+ *
+ * If there is no room for another entry (or the size arithmetic overflows),
+ * ERROR "out of shared memory" is raised. No check is made for duplicate
+ * keys.
+ */
+void
+shm_toc_insert(shm_toc *toc, uint64 key, void *address)
+{
+ volatile shm_toc *vtoc = toc;
+ Size total_bytes;
+ Size allocated_bytes;
+ Size nentry;
+ Size toc_bytes;
+ Size offset;
+
+ /* Relativize pointer. */
+ Assert(address > (void *) toc);
+ offset = ((char *) address) - (char *) toc;
+
+ SpinLockAcquire(&toc->toc_mutex);
+
+ total_bytes = vtoc->toc_total_bytes;
+ allocated_bytes = vtoc->toc_allocated_bytes;
+ nentry = vtoc->toc_nentry;
+ toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry)
+ + allocated_bytes;
+
+ /* Check for memory exhaustion and overflow. */
+ if (toc_bytes + sizeof(shm_toc_entry) > total_bytes ||
+ toc_bytes + sizeof(shm_toc_entry) < toc_bytes ||
+ nentry >= PG_UINT32_MAX)
+ {
+ SpinLockRelease(&toc->toc_mutex);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of shared memory")));
+ }
+
+ Assert(offset < total_bytes);
+ vtoc->toc_entry[nentry].key = key;
+ vtoc->toc_entry[nentry].offset = offset;
+
+ /*
+ * By placing a write barrier after filling in the entry and before
+ * updating the number of entries, we make it safe to read the TOC
+ * unlocked.
+ */
+ pg_write_barrier();
+
+ vtoc->toc_nentry++;
+
+ SpinLockRelease(&toc->toc_mutex);
+}
+
+/*
+ * Find the address registered under a given key.
+ *
+ * Returns the address of the first entry whose key matches. If there is
+ * no such entry, returns NULL when noError is true and throws elog(ERROR)
+ * otherwise.
+ *
+ * Unlike the other functions in this file, this one takes no lock and
+ * relies on barriers alone. Taking the lock probably wouldn't hurt
+ * concurrency much, but a group of worker processes may well read a series
+ * of entries from the same TOC at about the same time, so avoiding it seems
+ * to have some value.
+ */
+void *
+shm_toc_lookup(shm_toc *toc, uint64 key, bool noError)
+{
+	uint32		count;
+	uint32		idx = 0;
+
+	/*
+	 * Fetch the entry count before looking at any entry. Reading a uint32
+	 * is assumed to be atomic.
+	 */
+	count = toc->toc_nentry;
+	pg_read_barrier();
+
+	/* Linear scan over the entries that were visible at that point. */
+	while (idx < count)
+	{
+		if (toc->toc_entry[idx].key == key)
+			return ((char *) toc) + toc->toc_entry[idx].offset;
+		idx++;
+	}
+
+	/* Nothing matched. */
+	if (noError)
+		return NULL;
+
+	elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p",
+		 key, toc);
+	return NULL;
+}
+
+/*
+ * Work out how much shared memory a TOC and the data structures hanging off
+ * it will need: the fixed header, one entry per estimated key, and the
+ * estimated chunk space, rounded up with BUFFERALIGN. The arithmetic goes
+ * through add_size()/mul_size().
+ */
+Size
+shm_toc_estimate(shm_toc_estimator *e)
+{
+	Size		entries_sz;
+	Size		total;
+
+	entries_sz = mul_size(e->number_of_keys, sizeof(shm_toc_entry));
+	total = add_size(offsetof(shm_toc, toc_entry), entries_sz);
+	total = add_size(total, e->space_for_chunks);
+
+	return BUFFERALIGN(total);
+}
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
new file mode 100644
index 0000000..4425e99
--- /dev/null
+++ b/src/backend/storage/ipc/shmem.c
@@ -0,0 +1,611 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmem.c
+ * create shared memory and initialize shared memory data structures.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/shmem.c
+ *
+ *-------------------------------------------------------------------------
+ */
+/*
+ * POSTGRES processes share one or more regions of shared memory.
+ * The shared memory is created by a postmaster and is inherited
+ * by each backend via fork() (or, in some ports, via other OS-specific
+ * methods). The routines in this file are used for allocating and
+ * binding to shared memory data structures.
+ *
+ * NOTES:
+ * (a) There are three kinds of shared memory data structures
+ * available to POSTGRES: fixed-size structures, queues and hash
+ * tables. Fixed-size structures contain things like global variables
+ * for a module and should never be allocated after the shared memory
+ * initialization phase. Hash tables have a fixed maximum size, but
+ * their actual size can vary dynamically. When entries are added
+ * to the table, more space is allocated. Queues link data structures
+ * that have been allocated either within fixed-size structures or as hash
+ * buckets. Each shared data structure has a string name to identify
+ * it (assigned in the module that declares it).
+ *
+ * (b) During initialization, each module looks for its
+ * shared data structures in a hash table called the "Shmem Index".
+ * If the data structure is not present, the caller can allocate
+ * a new one and initialize it. If the data structure is present,
+ * the caller "attaches" to the structure by initializing a pointer
+ * in the local address space.
+ * The shmem index has two purposes: first, it gives us
+ * a simple model of how the world looks when a backend process
+ * initializes. If something is present in the shmem index,
+ * it is initialized. If it is not, it is uninitialized. Second,
+ * the shmem index allows us to allocate shared memory on demand
+ * instead of trying to preallocate structures and hard-wire the
+ * sizes and locations in header files. If you are using a lot
+ * of shared memory in a lot of different places (and changing
+ * things during development), this is important.
+ *
+ * (c) In standard Unix-ish environments, individual backends do not
+ * need to re-establish their local pointers into shared memory, because
+ * they inherit correct values of those variables via fork() from the
+ * postmaster. However, this does not work in the EXEC_BACKEND case.
+ * In ports using EXEC_BACKEND, new backends have to set up their local
+ * pointers using the method described in (b) above.
+ *
+ * (d) memory allocation model: shared memory can never be
+ * freed, once allocated. Each hash table has its own free list,
+ * so hash buckets can be reused when an item is deleted. However,
+ * if one hash table grows very large and then shrinks, its space
+ * cannot be redistributed to other tables. We could build a simple
+ * hash bucket garbage collector if need be. Right now, it seems
+ * unnecessary.
+ */
+
+#include "postgres.h"
+
+#include "access/transam.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+#include "storage/pg_shmem.h"
+#include "storage/shmem.h"
+#include "storage/spin.h"
+#include "utils/builtins.h"
+
+static void *ShmemAllocRaw(Size size, Size *allocated_size);
+
+/*
+ * shared memory global variables
+ *
+ * ShmemSegHdr, ShmemBase and ShmemEnd are filled in by InitShmemAccess();
+ * ShmemLock is allocated and initialized by InitShmemAllocation(); ShmemIndex
+ * is reset there and later set by InitShmemIndex().
+ */
+
+static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */
+
+static void *ShmemBase; /* start address of shared memory (same address
+ * as the segment header) */
+
+static void *ShmemEnd; /* end+1 address of shared memory, i.e. ShmemBase
+ * plus the header's totalsize */
+
+slock_t *ShmemLock; /* spinlock for shared memory and LWLock
+ * allocation */
+
+static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */
+
+
+/*
+ * InitShmemAccess() --- record where the shared memory segment lives.
+ *
+ * Remembers the segment header and derives the segment's start address and
+ * end+1 address (start plus the header's totalsize) from it.
+ *
+ * Note: the argument should be declared "PGShmemHeader *seghdr",
+ * but we use void to avoid having to include ipc.h in shmem.h.
+ */
+void
+InitShmemAccess(void *seghdr)
+{
+    ShmemSegHdr = (PGShmemHeader *) seghdr;
+    ShmemBase = seghdr;
+    ShmemEnd = (char *) seghdr + ShmemSegHdr->totalsize;
+}
+
+/*
+ * InitShmemAllocation() --- set up shared-memory space allocation.
+ *
+ * This should be called only in the postmaster or a standalone backend.
+ *
+ * ShmemSegHdr must already be set (this is asserted). The steps, in order:
+ * allocate and initialize ShmemLock, round the header's freeoffset up to a
+ * cache line boundary, mark the shmem index as not yet existing, and
+ * allocate and zero ShmemVariableCache.
+ */
+void
+InitShmemAllocation(void)
+{
+ PGShmemHeader *shmhdr = ShmemSegHdr;
+ char *aligned;
+
+ Assert(shmhdr != NULL);
+
+ /*
+ * Initialize the spinlock used by ShmemAlloc. We must use
+ * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet.
+ */
+ ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t));
+
+ SpinLockInit(ShmemLock);
+
+ /*
+ * Allocations after this point should go through ShmemAlloc, which
+ * expects to allocate everything on cache line boundaries. Make sure the
+ * first allocation begins on a cache line boundary. The alignment is
+ * applied to the absolute address and then converted back to an offset
+ * from the segment header.
+ */
+ aligned = (char *)
+ (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset)));
+ shmhdr->freeoffset = aligned - (char *) shmhdr;
+
+ /* ShmemIndex can't be set up yet (need LWLocks first) */
+ shmhdr->index = NULL;
+ ShmemIndex = (HTAB *) NULL;
+
+ /*
+ * Initialize ShmemVariableCache for transaction manager. (This doesn't
+ * really belong here, but not worth moving.)
+ */
+ ShmemVariableCache = (VariableCache)
+ ShmemAlloc(sizeof(*ShmemVariableCache));
+ memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache));
+}
+
+/*
+ * ShmemAlloc -- allocate a chunk from shared memory, or fail loudly
+ *
+ * Delegates to ShmemAllocRaw and raises an out-of-memory ERROR, reporting
+ * the caller's requested size, if no space could be obtained. The padded
+ * size computed by ShmemAllocRaw is not used here.
+ *
+ * Assumes ShmemLock and ShmemSegHdr are initialized.
+ */
+void *
+ShmemAlloc(Size size)
+{
+    Size padded_size;
+    void *chunk = ShmemAllocRaw(size, &padded_size);
+
+    if (chunk == NULL)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of shared memory (%zu bytes requested)",
+                        size)));
+
+    return chunk;
+}
+
+/*
+ * ShmemAllocNoError -- allocate a chunk from shared memory, quietly
+ *
+ * Same allocation as ShmemAlloc, but an out-of-space condition is reported
+ * to the caller as a NULL result instead of an error.
+ */
+void *
+ShmemAllocNoError(Size size)
+{
+    Size padded_size;
+    void *chunk;
+
+    chunk = ShmemAllocRaw(size, &padded_size);
+
+    return chunk;
+}
+
+/*
+ * ShmemAllocRaw -- allocate aligned chunk and return allocated size
+ *
+ * Also sets *allocated_size to the number of bytes allocated, which will
+ * be equal to the number requested plus any padding we choose to add.
+ *
+ * Returns a pointer to the new chunk, or NULL if the segment does not have
+ * enough free space left; no error is raised here. *allocated_size is set
+ * in either case.
+ */
+static void *
+ShmemAllocRaw(Size size, Size *allocated_size)
+{
+ Size newStart;
+ Size newFree;
+ void *newSpace;
+
+ /*
+ * Ensure all space is adequately aligned. We used to only MAXALIGN this
+ * space but experience has proved that on modern systems that is not good
+ * enough. Many parts of the system are very sensitive to critical data
+ * structures getting split across cache line boundaries. To avoid that,
+ * attempt to align the beginning of the allocation to a cache line
+ * boundary. The calling code will still need to be careful about how it
+ * uses the allocated space - e.g. by padding each element in an array of
+ * structures out to a power-of-two size - but without this, even that
+ * won't be sufficient.
+ *
+ * Only the size is rounded up here; the start stays aligned because
+ * every chunk handed out by this function has a rounded-up length.
+ */
+ size = CACHELINEALIGN(size);
+ *allocated_size = size;
+
+ Assert(ShmemSegHdr != NULL);
+
+ SpinLockAcquire(ShmemLock);
+
+ newStart = ShmemSegHdr->freeoffset;
+
+ newFree = newStart + size;
+ if (newFree <= ShmemSegHdr->totalsize)
+ {
+ newSpace = (void *) ((char *) ShmemBase + newStart);
+ ShmemSegHdr->freeoffset = newFree;
+ }
+ else
+ newSpace = NULL;
+
+ SpinLockRelease(ShmemLock);
+
+ /* note this assert is okay with newSpace == NULL */
+ Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
+
+ return newSpace;
+}
+
+/*
+ * ShmemAllocUnlocked -- allocate max-aligned chunk without taking ShmemLock
+ *
+ * For use only by allocations that must happen before ShmemLock is ready.
+ * The request is rounded up with MAXALIGN (cache line alignment is not
+ * attempted here), and an out-of-memory ERROR is raised if the segment
+ * cannot hold it. The size shown in that error is the rounded-up one.
+ */
+void *
+ShmemAllocUnlocked(Size size)
+{
+    Size chunkStart;
+    Size chunkEnd;
+    void *chunk;
+
+    /* pad the request so that later chunks remain max-aligned */
+    size = MAXALIGN(size);
+
+    Assert(ShmemSegHdr != NULL);
+
+    chunkStart = ShmemSegHdr->freeoffset;
+    chunkEnd = chunkStart + size;
+
+    if (chunkEnd > ShmemSegHdr->totalsize)
+        ereport(ERROR,
+                (errcode(ERRCODE_OUT_OF_MEMORY),
+                 errmsg("out of shared memory (%zu bytes requested)",
+                        size)));
+
+    /* claim the space and hand back its address */
+    ShmemSegHdr->freeoffset = chunkEnd;
+    chunk = (void *) ((char *) ShmemBase + chunkStart);
+
+    Assert(chunk == (void *) MAXALIGN(chunk));
+
+    return chunk;
+}
+
+/*
+ * ShmemAddrIsValid -- does an address lie inside the shared memory segment?
+ *
+ * True when addr is at or after the segment's start address and strictly
+ * before its end+1 address.
+ */
+bool
+ShmemAddrIsValid(const void *addr)
+{
+    if (addr < ShmemBase)
+        return false;
+
+    return addr < ShmemEnd;
+}
+
+/*
+ * InitShmemIndex() --- set up or attach to shmem index table.
+ *
+ * The index is a string-keyed shared hash table named "ShmemIndex", created
+ * with SHMEM_INDEX_SIZE as both its initial and maximum size.
+ *
+ * ShmemInitHash calls ShmemInitStruct, which normally expects the ShmemIndex
+ * hashtable to exist already, so initializing the index itself is circular.
+ * The special "ShmemIndex" table name tells ShmemInitStruct to fake it.
+ */
+void
+InitShmemIndex(void)
+{
+    HASHCTL ctl;
+
+    ctl.entrysize = sizeof(ShmemIndexEnt);
+    ctl.keysize = SHMEM_INDEX_KEYSIZE;
+
+    ShmemIndex = ShmemInitHash("ShmemIndex",
+                               SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+                               &ctl,
+                               HASH_ELEM | HASH_STRINGS);
+}
+
+/*
+ * ShmemInitHash -- Create and initialize, or attach to, a
+ * shared memory hash table.
+ *
+ * The caller is assumed to provide whatever synchronization is needed so
+ * that two processes never create/initialize the same table at once. (In
+ * practice, all creations are done in the postmaster process; child
+ * processes should always be attaching to existing tables.)
+ *
+ * max_size is the estimated maximum number of hashtable entries. It is not
+ * a hard limit, but access efficiency degrades if it is exceeded
+ * substantially (it is used to compute the directory size, and the hash
+ * buckets will get overfull).
+ *
+ * init_size is the number of hashtable entries to preallocate. For a table
+ * whose maximum size is certain, this should be equal to max_size; that
+ * ensures that no run-time out-of-shared-memory failures can occur.
+ *
+ * *infoP and hash_flags must specify at least the entry sizes and key
+ * comparison semantics (see hash_create()). The fields and flag bits that
+ * are specific to shared-memory hash tables (directory size, allocator,
+ * header location, and HASH_ATTACH when the table already exists) are
+ * filled in here; callers may additionally choose to specify HASH_PARTITION
+ * and/or HASH_FIXED_SIZE. Note that *infoP is modified.
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+HTAB *
+ShmemInitHash(const char *name, /* table string name for shmem index */
+              long init_size, /* initial table size */
+              long max_size, /* max size of the table */
+              HASHCTL *infoP, /* info about key and bucket size */
+              int hash_flags) /* info about infoP */
+{
+    bool already_exists;
+    void *header_space;
+
+    /*
+     * A shared hash table's directory is fixed: it cannot grow, or other
+     * backends would lose track of it. Size it for max_size right away, and
+     * install the shared memory allocator.
+     */
+    infoP->max_dsize = hash_select_dirsize(max_size);
+    infoP->dsize = infoP->max_dsize;
+    infoP->alloc = ShmemAllocNoError;
+    hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE;
+
+    /* find or reserve the table's header space through the shmem index */
+    header_space = ShmemInitStruct(name,
+                                   hash_get_shared_size(infoP, hash_flags),
+                                   &already_exists);
+
+    /* an existing table is attached to, not initialized a second time */
+    if (already_exists)
+        hash_flags |= HASH_ATTACH;
+
+    /* tell hash_create where the hashtable header lives */
+    infoP->hctl = (HASHHDR *) header_space;
+
+    return hash_create(name, init_size, infoP, hash_flags);
+}
+
+/*
+ * ShmemInitStruct -- Create/attach to a structure in shared memory.
+ *
+ * This is called during initialization to find or allocate
+ * a data structure in shared memory. If no other process
+ * has created the structure, this routine allocates space
+ * for it. If it exists already, a pointer to the existing
+ * structure is returned.
+ *
+ * name is the key under which the structure is registered in the shmem
+ * index. size is the requested size in bytes; when the entry already
+ * exists, size must equal the size recorded for it or an error is raised.
+ *
+ * Returns: pointer to the object. *foundPtr is set true if the object was
+ * already in the shmem index (hence, already initialized).
+ *
+ * ShmemIndexLock is held exclusively for the duration of the lookup and
+ * allocation, and is released before any error is reported.
+ *
+ * Note: before Postgres 9.0, this function returned NULL for some failure
+ * cases. Now, it always throws error instead, so callers need not check
+ * for NULL.
+ */
+void *
+ShmemInitStruct(const char *name, Size size, bool *foundPtr)
+{
+ ShmemIndexEnt *result;
+ void *structPtr;
+
+ LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE);
+
+ if (!ShmemIndex)
+ {
+ PGShmemHeader *shmemseghdr = ShmemSegHdr;
+
+ /* Must be trying to create/attach to ShmemIndex itself */
+ Assert(strcmp(name, "ShmemIndex") == 0);
+
+ if (IsUnderPostmaster)
+ {
+ /* Must be initializing a (non-standalone) backend */
+ Assert(shmemseghdr->index != NULL);
+ structPtr = shmemseghdr->index;
+ *foundPtr = true;
+ }
+ else
+ {
+ /*
+ * If the shmem index doesn't exist, we are bootstrapping: we must
+ * be trying to init the shmem index itself.
+ *
+ * Notice that the ShmemIndexLock is released before the shmem
+ * index has been initialized. This should be OK because no other
+ * process can be accessing shared memory yet.
+ *
+ * The location is remembered in the segment header, which is
+ * where the IsUnderPostmaster branch above picks it up.
+ */
+ Assert(shmemseghdr->index == NULL);
+ structPtr = ShmemAlloc(size);
+ shmemseghdr->index = structPtr;
+ *foundPtr = false;
+ }
+ LWLockRelease(ShmemIndexLock);
+ return structPtr;
+ }
+
+ /*
+ * look it up in the shmem index; HASH_ENTER_NULL creates the entry if it
+ * is missing and yields NULL (handled just below) if that fails
+ */
+ result = (ShmemIndexEnt *)
+ hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr);
+
+ if (!result)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("could not create ShmemIndex entry for data structure \"%s\"",
+ name)));
+ }
+
+ if (*foundPtr)
+ {
+ /*
+ * Structure is in the shmem index so someone else has allocated it
+ * already. The size better be the same as the size we are trying to
+ * initialize to, or there is a name conflict (or worse).
+ */
+ if (result->size != size)
+ {
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errmsg("ShmemIndex entry size is wrong for data structure"
+ " \"%s\": expected %zu, actual %zu",
+ name, size, result->size)));
+ }
+ structPtr = result->location;
+ }
+ else
+ {
+ Size allocated_size;
+
+ /* It isn't in the table yet. allocate and initialize it */
+ structPtr = ShmemAllocRaw(size, &allocated_size);
+ if (structPtr == NULL)
+ {
+ /* out of memory; remove the failed ShmemIndex entry */
+ hash_search(ShmemIndex, name, HASH_REMOVE, NULL);
+ LWLockRelease(ShmemIndexLock);
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("not enough shared memory for data structure"
+ " \"%s\" (%zu bytes requested)",
+ name, size)));
+ }
+ result->size = size;
+ result->allocated_size = allocated_size;
+ result->location = structPtr;
+ }
+
+ LWLockRelease(ShmemIndexLock);
+
+ Assert(ShmemAddrIsValid(structPtr));
+
+ Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
+ return structPtr;
+}
+
+
+/*
+ * Add two Size values, raising an ERROR if the sum does not fit in Size
+ */
+Size
+add_size(Size s1, Size s2)
+{
+    Size sum = s1 + s2;
+
+    /*
+     * Size is assumed to be unsigned, so a sum that overflowed has wrapped
+     * around and is smaller than an operand.
+     */
+    if (sum < s1 || sum < s2)
+        ereport(ERROR,
+                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                 errmsg("requested shared memory size overflows size_t")));
+
+    return sum;
+}
+
+/*
+ * Multiply two Size values, raising an ERROR if the product does not fit
+ * in Size. A zero operand yields zero without any further checking.
+ */
+Size
+mul_size(Size s1, Size s2)
+{
+    Size product;
+
+    if (s1 != 0 && s2 != 0)
+    {
+        product = s1 * s2;
+
+        /*
+         * Size is assumed to be unsigned; if the multiplication wrapped
+         * around, dividing the product back out cannot recover s1.
+         */
+        if (product / s2 != s1)
+            ereport(ERROR,
+                    (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                     errmsg("requested shared memory size overflows size_t")));
+
+        return product;
+    }
+
+    return 0;
+}
+
+/*
+ * SQL SRF showing allocated shared memory
+ *
+ * Materializes its result into a tuplestore with four columns. One row is
+ * produced per ShmemIndex entry (key, offset of the structure from the
+ * segment header, requested size, allocated size), followed by two summary
+ * rows: "<anonymous>", whose size is the part of freeoffset not accounted
+ * for by the index entries' allocated sizes and whose offset is NULL, and a
+ * row with a NULL name describing the still-unused space from freeoffset up
+ * to totalsize. ShmemIndexLock is held in shared mode while the rows are
+ * being collected.
+ */
+Datum
+pg_get_shmem_allocations(PG_FUNCTION_ARGS)
+{
+#define PG_GET_SHMEM_SIZES_COLS 4
+ ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo;
+ TupleDesc tupdesc;
+ Tuplestorestate *tupstore;
+ MemoryContext per_query_ctx;
+ MemoryContext oldcontext;
+ HASH_SEQ_STATUS hstat;
+ ShmemIndexEnt *ent;
+ Size named_allocated = 0;
+ Datum values[PG_GET_SHMEM_SIZES_COLS];
+ bool nulls[PG_GET_SHMEM_SIZES_COLS];
+
+ /* check to see if caller supports us returning a tuplestore */
+ if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("set-valued function called in context that cannot accept a set")));
+ if (!(rsinfo->allowedModes & SFRM_Materialize))
+ ereport(ERROR,
+ (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+ errmsg("materialize mode required, but it is not allowed in this context")));
+
+ /* Build a tuple descriptor for our result type */
+ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE)
+ elog(ERROR, "return type must be a row type");
+
+ per_query_ctx = rsinfo->econtext->ecxt_per_query_memory;
+ oldcontext = MemoryContextSwitchTo(per_query_ctx);
+
+ tupstore = tuplestore_begin_heap(true, false, work_mem);
+ rsinfo->returnMode = SFRM_Materialize;
+ rsinfo->setResult = tupstore;
+ rsinfo->setDesc = tupdesc;
+
+ MemoryContextSwitchTo(oldcontext);
+
+ LWLockAcquire(ShmemIndexLock, LW_SHARED);
+
+ hash_seq_init(&hstat, ShmemIndex);
+
+ /* output all allocated entries, summing up their allocated sizes */
+ memset(nulls, 0, sizeof(nulls));
+ while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL)
+ {
+ values[0] = CStringGetTextDatum(ent->key);
+ values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr);
+ values[2] = Int64GetDatum(ent->size);
+ values[3] = Int64GetDatum(ent->allocated_size);
+ named_allocated += ent->allocated_size;
+
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+ }
+
+ /* output shared memory allocated but not counted via the shmem index */
+ values[0] = CStringGetTextDatum("<anonymous>");
+ nulls[1] = true;
+ values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated);
+ values[3] = values[2];
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+ /* output as-of-yet unused shared memory; this row has no name */
+ nulls[0] = true;
+ values[1] = Int64GetDatum(ShmemSegHdr->freeoffset);
+ nulls[1] = false;
+ values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset);
+ values[3] = values[2];
+ tuplestore_putvalues(tupstore, tupdesc, values, nulls);
+
+ LWLockRelease(ShmemIndexLock);
+
+ tuplestore_donestoring(tupstore);
+
+ return (Datum) 0;
+}
diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c
new file mode 100644
index 0000000..dc3238c
--- /dev/null
+++ b/src/backend/storage/ipc/shmqueue.c
@@ -0,0 +1,190 @@
+/*-------------------------------------------------------------------------
+ *
+ * shmqueue.c
+ * shared memory linked lists
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/shmqueue.c
+ *
+ * NOTES
+ *
+ * Package for managing doubly-linked lists in shared memory.
+ * The only tricky thing is that SHM_QUEUE will usually be a field
+ * in a larger record. SHMQueueNext has to return a pointer
+ * to the record itself instead of a pointer to the SHMQueue field
+ * of the record. It takes an extra parameter and does some extra
+ * pointer arithmetic to do this correctly.
+ *
+ * NOTE: These are set up so they can be turned into macros some day.
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "storage/shmem.h"
+
+
+/*
+ * SHMQueueInit -- make the head of a new queue point
+ * to itself in both directions (i.e. the queue is empty)
+ */
+void
+SHMQueueInit(SHM_QUEUE *queue)
+{
+    Assert(ShmemAddrIsValid(queue));
+
+    queue->next = queue;
+    queue->prev = queue;
+}
+
+/*
+ * SHMQueueIsDetached -- true if element is not currently
+ * in a queue, which is recognized by its prev link being NULL.
+ */
+bool
+SHMQueueIsDetached(const SHM_QUEUE *queue)
+{
+    Assert(ShmemAddrIsValid(queue));
+
+    if (queue->prev != NULL)
+        return false;
+
+    return true;
+}
+
+/*
+ * SHMQueueElemInit -- clear an element's links, setting both to NULL
+ */
+void
+SHMQueueElemInit(SHM_QUEUE *queue)
+{
+    Assert(ShmemAddrIsValid(queue));
+
+    queue->next = NULL;
+    queue->prev = NULL;
+}
+
+/*
+ * SHMQueueDelete -- remove an element from the queue and
+ * close the links
+ *
+ * The element's neighbors are joined to each other, and the element's own
+ * links are cleared to NULL afterwards.
+ */
+void
+SHMQueueDelete(SHM_QUEUE *queue)
+{
+    SHM_QUEUE *after = queue->next;
+    SHM_QUEUE *before = queue->prev;
+
+    Assert(ShmemAddrIsValid(queue));
+    Assert(ShmemAddrIsValid(after));
+    Assert(ShmemAddrIsValid(before));
+
+    /* splice the element out */
+    before->next = after;
+    after->prev = before;
+
+    /* leave the element with cleared links */
+    queue->next = NULL;
+    queue->prev = NULL;
+}
+
+/*
+ * SHMQueueInsertBefore -- link elem in just ahead of the given queue
+ * element. When "queue" is the queue head, this places elem
+ * at the tail of the queue.
+ */
+void
+SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+    SHM_QUEUE *predecessor = queue->prev;
+
+    Assert(ShmemAddrIsValid(queue));
+    Assert(ShmemAddrIsValid(elem));
+
+    /* point the new element at its two neighbors */
+    elem->next = predecessor->next;
+    elem->prev = predecessor;
+
+    /* then make both neighbors point at the new element */
+    predecessor->next = elem;
+    queue->prev = elem;
+}
+
+/*
+ * SHMQueueInsertAfter -- link elem in just behind the given queue
+ * element. When "queue" is the queue head, this places elem
+ * at the head of the queue.
+ */
+void
+SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem)
+{
+    SHM_QUEUE *successor = queue->next;
+
+    Assert(ShmemAddrIsValid(queue));
+    Assert(ShmemAddrIsValid(elem));
+
+    /* point the new element at its two neighbors */
+    elem->prev = successor->prev;
+    elem->next = successor;
+
+    /* then make both neighbors point at the new element */
+    successor->prev = elem;
+    queue->next = elem;
+}
+
+/*--------------------
+ * SHMQueueNext -- Get the next element from a queue
+ *
+ * To start the iteration, pass the queue head as both queue and curElem.
+ * Returns NULL once the traversal has come back around to the queue head.
+ *
+ * The next element is the one at curElem->next. Since the SHM_QUEUE links
+ * are normally embedded in a larger structure, what is returned is a
+ * pointer to that enclosing structure, not to its SHM_QUEUE field.
+ * For example, given
+ * struct {
+ * int stuff;
+ * SHM_QUEUE elem;
+ * } ELEMType;
+ * the links of a queued element point at struct.elem, and linkOffset is
+ * subtracted from that to get the start address of the structure.
+ *
+ * calls to SHMQueueNext should take these parameters:
+ * &(queueHead), &(queueHead), offsetof(ELEMType, elem)
+ * or
+ * &(queueHead), &(curElem->elem), offsetof(ELEMType, elem)
+ *--------------------
+ */
+Pointer
+SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
+{
+    SHM_QUEUE *successor = curElem->next;
+
+    Assert(ShmemAddrIsValid(curElem));
+
+    /* reaching the queue head again means there are no more elements */
+    return (successor == queue)
+        ? NULL
+        : (Pointer) (((char *) successor) - linkOffset);
+}
+
+/*--------------------
+ * SHMQueuePrev -- Get the previous element from a queue
+ *
+ * Mirror image of SHMQueueNext: it follows curElem->prev, so an iteration
+ * begun at the queue head starts at the tail and moves towards the head.
+ * The same calling conventions apply; NULL is returned once the traversal
+ * is back at the queue head.
+ */
+Pointer
+SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset)
+{
+    SHM_QUEUE *predecessor = curElem->prev;
+
+    Assert(ShmemAddrIsValid(curElem));
+
+    /* reaching the queue head again means there are no more elements */
+    return (predecessor == queue)
+        ? NULL
+        : (Pointer) (((char *) predecessor) - linkOffset);
+}
+
+/*
+ * SHMQueueEmpty -- true if queue head is only element, false otherwise
+ *
+ * The decision is made from the prev link; for an empty queue the next
+ * link is asserted to point back at the head as well.
+ */
+bool
+SHMQueueEmpty(const SHM_QUEUE *queue)
+{
+    Assert(ShmemAddrIsValid(queue));
+
+    if (queue->prev != queue)
+        return false;
+
+    Assert(queue->next == queue);
+    return true;
+}
diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c
new file mode 100644
index 0000000..de69d60
--- /dev/null
+++ b/src/backend/storage/ipc/signalfuncs.c
@@ -0,0 +1,300 @@
+/*-------------------------------------------------------------------------
+ *
+ * signalfuncs.c
+ * Functions for signaling backends
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/signalfuncs.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+
+#include "catalog/pg_authid.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "postmaster/syslogger.h"
+#include "storage/pmsignal.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+
+
+/*
+ * Send a signal to another backend.
+ *
+ * The signal is delivered if the current user has the privileges of the
+ * role that owns the target backend, or of the pg_signal_backend role;
+ * a backend owned by a superuser can only be signaled by a superuser.
+ * For "dangerous" signals, an explicit check for superuser needs to be
+ * done prior to calling this function.
+ *
+ * Returns 0 on success, 1 on general failure, 2 on normal permission error
+ * and 3 if the caller needs to be a superuser.
+ *
+ * In the event of a general failure (return code 1), a warning message will
+ * be emitted. For permission errors, doing that is the responsibility of
+ * the caller.
+ */
+#define SIGNAL_BACKEND_SUCCESS 0
+#define SIGNAL_BACKEND_ERROR 1
+#define SIGNAL_BACKEND_NOPERMISSION 2
+#define SIGNAL_BACKEND_NOSUPERUSER 3
+static int
+pg_signal_backend(int pid, int sig)
+{
+ PGPROC *proc = BackendPidGetProc(pid);
+
+ /*
+ * BackendPidGetProc returns NULL if the pid isn't valid; but by the time
+ * we reach kill(), a process for which we get a valid proc here might
+ * have terminated on its own. There's no way to acquire a lock on an
+ * arbitrary process to prevent that. But since so far all the callers of
+ * this mechanism involve some request for ending the process anyway, that
+ * it might end on its own first is not a problem.
+ */
+ if (proc == NULL)
+ {
+ /*
+ * This is just a warning so a loop-through-resultset will not abort
+ * if one backend terminated on its own during the run.
+ */
+ ereport(WARNING,
+ (errmsg("PID %d is not a PostgreSQL server process", pid)));
+ return SIGNAL_BACKEND_ERROR;
+ }
+
+ /*
+ * Only allow superusers to signal superuser-owned backends. This is
+ * tested ahead of the role membership check below, so that case is
+ * always reported as SIGNAL_BACKEND_NOSUPERUSER.
+ */
+ if (superuser_arg(proc->roleId) && !superuser())
+ return SIGNAL_BACKEND_NOSUPERUSER;
+
+ /* Users can signal backends they have role membership in. */
+ if (!has_privs_of_role(GetUserId(), proc->roleId) &&
+ !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND))
+ return SIGNAL_BACKEND_NOPERMISSION;
+
+ /*
+ * Can the process we just validated above end, followed by the pid being
+ * recycled for a new process, before reaching here? Then we'd be trying
+ * to kill the wrong thing. Seems near impossible when sequential pid
+ * assignment and wraparound is used. Perhaps it could happen on a system
+ * where pid re-use is randomized. That race condition possibility seems
+ * too unlikely to worry about.
+ */
+
+ /*
+ * If we have setsid(), signal the backend's whole process group (that is
+ * what passing the negated pid to kill() requests)
+ */
+#ifdef HAVE_SETSID
+ if (kill(-pid, sig))
+#else
+ if (kill(pid, sig))
+#endif
+ {
+ /* Again, just a warning to allow loops */
+ ereport(WARNING,
+ (errmsg("could not send signal to process %d: %m", pid)));
+ return SIGNAL_BACKEND_ERROR;
+ }
+ return SIGNAL_BACKEND_SUCCESS;
+}
+
+/*
+ * Signal to cancel a backend process, by sending it SIGINT. This is allowed
+ * if you are a member of the role whose process is being canceled.
+ *
+ * The two permission failures reported by pg_signal_backend() are raised as
+ * errors here. Any other failure has already produced a warning there and
+ * simply results in false; true means the signal was sent.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_cancel_backend(PG_FUNCTION_ARGS)
+{
+    int status = pg_signal_backend(PG_GETARG_INT32(0), SIGINT);
+
+    switch (status)
+    {
+        case SIGNAL_BACKEND_NOSUPERUSER:
+            ereport(ERROR,
+                    (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                     errmsg("must be a superuser to cancel superuser query")));
+            break;
+
+        case SIGNAL_BACKEND_NOPERMISSION:
+            ereport(ERROR,
+                    (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                     errmsg("must be a member of the role whose query is being canceled or member of pg_signal_backend")));
+            break;
+
+        default:
+            break;
+    }
+
+    PG_RETURN_BOOL(status == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Wait until there is no backend process with the given PID and return true.
+ * On timeout, a warning is emitted and false is returned.
+ *
+ * timeout is in milliseconds. Existence of the process is probed with
+ * kill(pid, 0): ESRCH means it is gone, while any other failure of that
+ * call is raised as an ERROR.
+ */
+static bool
+pg_wait_until_termination(int pid, int64 timeout)
+{
+ /*
+ * Wait in steps of waittime milliseconds until this function exits or
+ * timeout.
+ */
+ int64 waittime = 100;
+
+ /*
+ * Initially remaining time is the entire timeout specified by the user.
+ */
+ int64 remainingtime = timeout;
+
+ /*
+ * Check existence of the backend. If the backend still exists, then wait
+ * for waittime milliseconds, again check for the existence. Repeat this
+ * until timeout or an error occurs or a pending interrupt such as query
+ * cancel gets processed.
+ *
+ * The result of WaitLatch is ignored, so a wait that ends early because
+ * the latch was set is still charged as a full waittime step.
+ */
+ do
+ {
+ /* never wait longer than what is left of the timeout */
+ if (remainingtime < waittime)
+ waittime = remainingtime;
+
+ if (kill(pid, 0) == -1)
+ {
+ if (errno == ESRCH)
+ return true;
+ else
+ ereport(ERROR,
+ (errcode(ERRCODE_INTERNAL_ERROR),
+ errmsg("could not check the existence of the backend with PID %d: %m",
+ pid)));
+ }
+
+ /* Process interrupts, if any, before waiting */
+ CHECK_FOR_INTERRUPTS();
+
+ (void) WaitLatch(MyLatch,
+ WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
+ waittime,
+ WAIT_EVENT_BACKEND_TERMINATION);
+
+ ResetLatch(MyLatch);
+
+ remainingtime -= waittime;
+ } while (remainingtime > 0);
+
+ ereport(WARNING,
+ (errmsg_plural("backend with PID %d did not terminate within %lld millisecond",
+ "backend with PID %d did not terminate within %lld milliseconds",
+ timeout,
+ pid, (long long int) timeout)));
+
+ return false;
+}
+
+/*
+ * Send a signal to terminate a backend process. This is allowed if you are a
+ * member of the role whose process is being terminated. If the timeout input
+ * argument is 0, then this function just signals the backend and returns
+ * true. If timeout is nonzero, then it waits until no process has the given
+ * PID; if the process ends within the timeout, true is returned, and if the
+ * timeout is exceeded, a warning is emitted and false is returned.
+ *
+ * A negative timeout is rejected with an error. If the signal cannot be
+ * sent for a reason other than permissions, pg_signal_backend() has already
+ * emitted a warning and false is returned without waiting.
+ *
+ * Note that only superusers can signal superuser-owned processes.
+ */
+Datum
+pg_terminate_backend(PG_FUNCTION_ARGS)
+{
+    int pid;
+    int r;
+
+    /*
+     * The timeout argument is fetched as a 64-bit value and handed on to
+     * pg_wait_until_termination() as int64, so it must be kept in an int64
+     * here as well. Holding it in a plain int would truncate large values,
+     * which could then be misread as negative or as "do not wait".
+     */
+    int64 timeout; /* milliseconds */
+
+    pid = PG_GETARG_INT32(0);
+    timeout = PG_GETARG_INT64(1);
+
+    if (timeout < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+                 errmsg("\"timeout\" must not be negative")));
+
+    r = pg_signal_backend(pid, SIGTERM);
+
+    if (r == SIGNAL_BACKEND_NOSUPERUSER)
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 errmsg("must be a superuser to terminate superuser process")));
+
+    if (r == SIGNAL_BACKEND_NOPERMISSION)
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend")));
+
+    /* Wait only on success and if actually requested */
+    if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0)
+        PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout));
+    else
+        PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS);
+}
+
+/*
+ * Signal to reload the database configuration
+ *
+ * Sends SIGHUP to the postmaster. Returns true if the signal was sent;
+ * otherwise a warning is emitted and false is returned.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_reload_conf(PG_FUNCTION_ARGS)
+{
+    if (kill(PostmasterPid, SIGHUP) == 0)
+        PG_RETURN_BOOL(true);
+
+    ereport(WARNING,
+            (errmsg("failed to send signal to postmaster: %m")));
+    PG_RETURN_BOOL(false);
+}
+
+
+/*
+ * Rotate log file
+ *
+ * This function is kept to support adminpack 1.0. It insists on the caller
+ * being a superuser. If the logging collector is not active, a warning is
+ * emitted and false is returned; otherwise the postmaster is sent a
+ * rotation request and true is returned.
+ */
+Datum
+pg_rotate_logfile(PG_FUNCTION_ARGS)
+{
+    if (!superuser())
+        ereport(ERROR,
+                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
+                 errmsg("must be superuser to rotate log files with adminpack 1.0"),
+        /* translator: %s is a SQL function name */
+                 errhint("Consider using %s, which is part of core, instead.",
+                         "pg_logfile_rotate()")));
+
+    if (Logging_collector)
+    {
+        SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+        PG_RETURN_BOOL(true);
+    }
+
+    ereport(WARNING,
+            (errmsg("rotation not possible because log collection not active")));
+    PG_RETURN_BOOL(false);
+}
+
+/*
+ * Rotate log file
+ *
+ * If the logging collector is not active, a warning is emitted and false is
+ * returned; otherwise the postmaster is sent a rotation request and true is
+ * returned.
+ *
+ * Permission checking for this function is managed through the normal
+ * GRANT system.
+ */
+Datum
+pg_rotate_logfile_v2(PG_FUNCTION_ARGS)
+{
+    if (Logging_collector)
+    {
+        SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE);
+        PG_RETURN_BOOL(true);
+    }
+
+    ereport(WARNING,
+            (errmsg("rotation not possible because log collection not active")));
+    PG_RETURN_BOOL(false);
+}
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
new file mode 100644
index 0000000..f585d63
--- /dev/null
+++ b/src/backend/storage/ipc/sinval.c
@@ -0,0 +1,205 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinval.c
+ * POSTGRES shared cache invalidation communication code.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinval.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/xact.h"
+#include "commands/async.h"
+#include "miscadmin.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/sinvaladt.h"
+#include "utils/inval.h"
+
+
+uint64 SharedInvalidMessageCounter;
+
+
+/*
+ * Because backends sitting idle will not be reading sinval events, we
+ * need a way to give an idle backend a swift kick in the rear and make
+ * it catch up before the sinval queue overflows and forces it to go
+ * through a cache reset exercise. This is done by sending
+ * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind.
+ *
+ * The signal handler will set an interrupt pending flag and will set the
+ * process's latch. Whenever starting to read from the client, or when
+ * interrupted while doing so, ProcessClientReadInterrupt() will call
+ * ProcessCatchupInterrupt().
+ */
+volatile sig_atomic_t catchupInterruptPending = false;
+
+
+/*
+ * SendSharedInvalidMessages
+ * Add shared-cache-invalidation message(s) to the global SI message queue.
+ *
+ * msgs points to an array of n messages. This is a thin wrapper that
+ * passes the whole array on to SIInsertDataEntries().
+ */
+void
+SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n)
+{
+ SIInsertDataEntries(msgs, n);
+}
+
+/*
+ * ReceiveSharedInvalidMessages
+ * Process shared-cache-invalidation messages waiting for this backend
+ *
+ * We guarantee to process all messages that had been queued before the
+ * routine was entered. It is of course possible for more messages to get
+ * queued right after our last SIGetDataEntries call.
+ *
+ * NOTE: it is entirely possible for this routine to be invoked recursively
+ * as a consequence of processing inside the invalFunction or resetFunction.
+ * Furthermore, such a recursive call must guarantee that all outstanding
+ * inval messages have been processed before it exits. This is the reason
+ * for the strange-looking choice to use a statically allocated buffer array
+ * and counters; it's so that a recursive call can process messages already
+ * sucked out of sinvaladt.c.
+ *
+ * invalFunction is called once for each message fetched; resetFunction is
+ * called instead when SIGetDataEntries returns a negative result (reset).
+ * SharedInvalidMessageCounter is incremented once per message and once per
+ * reset processed here.
+ */
+void
+ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg),
+ void (*resetFunction) (void))
+{
+#define MAXINVALMSGS 32
+ static SharedInvalidationMessage messages[MAXINVALMSGS];
+
+ /*
+ * We use volatile here to prevent bugs if a compiler doesn't realize that
+ * recursion is a possibility ...
+ */
+ static volatile int nextmsg = 0;
+ static volatile int nummsgs = 0;
+
+ /* Deal with any messages still pending from an outer recursion */
+ while (nextmsg < nummsgs)
+ {
+ /*
+ * Copy the message to a local and advance nextmsg before calling out,
+ * because a recursive call may refill messages[] and reset the counters.
+ */
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ do
+ {
+ int getResult;
+
+ /* Mark the static buffer empty while we fetch the next batch */
+ nextmsg = nummsgs = 0;
+
+ /* Try to get some more messages */
+ getResult = SIGetDataEntries(messages, MAXINVALMSGS);
+
+ if (getResult < 0)
+ {
+ /* got a reset message */
+ elog(DEBUG4, "cache state reset");
+ SharedInvalidMessageCounter++;
+ resetFunction();
+ break; /* nothing more to do */
+ }
+
+ /* Process them, being wary that a recursive call might eat some */
+ nextmsg = 0;
+ nummsgs = getResult;
+
+ while (nextmsg < nummsgs)
+ {
+ /* same copy-then-advance rule as in the loop at the top */
+ SharedInvalidationMessage msg = messages[nextmsg++];
+
+ SharedInvalidMessageCounter++;
+ invalFunction(&msg);
+ }
+
+ /*
+ * We only need to loop if the last SIGetDataEntries call (which might
+ * have been within a recursive call) returned a full buffer.
+ */
+ } while (nummsgs == MAXINVALMSGS);
+
+ /*
+ * We are now caught up. If we received a catchup signal, reset that
+ * flag, and call SICleanupQueue(). This is not so much because we need
+ * to flush dead messages right now, as that we want to pass on the
+ * catchup signal to the next slowest backend. "Daisy chaining" the
+ * catchup signal this way avoids creating spikes in system load for what
+ * should be just a background maintenance activity.
+ */
+ if (catchupInterruptPending)
+ {
+ catchupInterruptPending = false;
+ elog(DEBUG4, "sinval catchup complete, cleaning queue");
+ SICleanupQueue(false, 0);
+ }
+}
+
+
+/*
+ * HandleCatchupInterrupt
+ *
+ * This is called when PROCSIG_CATCHUP_INTERRUPT is received.
+ *
+ * We used to call ProcessCatchupEvent directly when idle. These days
+ * we just set a flag to do it later and notify the process of that fact by
+ * setting the process's latch. The flag is consumed by
+ * ProcessCatchupInterrupt() and cleared in ReceiveSharedInvalidMessages().
+ */
+void
+HandleCatchupInterrupt(void)
+{
+ /*
+ * Note: this is called by a SIGNAL HANDLER. You must be very wary what
+ * you do here.
+ */
+
+ /* Record the request first; the latch is set only after the flag */
+ catchupInterruptPending = true;
+
+ /* make sure the event is processed in due course */
+ SetLatch(MyLatch);
+}
+
+/*
+ * ProcessCatchupInterrupt
+ *
+ * Non-signal-handler half of catchup interrupt processing: keeps going
+ * until catchupInterruptPending has been cleared, which happens inside
+ * ReceiveSharedInvalidMessages() once pending invalidations are absorbed.
+ */
+void
+ProcessCatchupInterrupt(void)
+{
+    while (catchupInterruptPending)
+    {
+        /*
+         * The goal is simply to get ReceiveSharedInvalidMessages() to run;
+         * it does the real work and resets catchupInterruptPending.
+         *
+         * Outside a transaction we start and immediately commit one, since
+         * AcceptInvalidationMessages() is reached during transaction start.
+         * Calling AcceptInvalidationMessages() bare would be cheaper and
+         * would probably work in the normal case, but it is not clear that
+         * things would clean up nicely after an error partway through.
+         */
+        if (!IsTransactionOrTransactionBlock())
+        {
+            elog(DEBUG4, "ProcessCatchupEvent outside transaction");
+            StartTransactionCommand();
+            CommitTransactionCommand();
+            continue;
+        }
+
+        /* Already inside a transaction: absorb the messages directly. */
+        elog(DEBUG4, "ProcessCatchupEvent inside transaction");
+        AcceptInvalidationMessages();
+    }
+}
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
new file mode 100644
index 0000000..946bd8e
--- /dev/null
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -0,0 +1,777 @@
+/*-------------------------------------------------------------------------
+ *
+ * sinvaladt.c
+ * POSTGRES shared cache invalidation data manager.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/sinvaladt.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <signal.h>
+#include <unistd.h>
+
+#include "access/transam.h"
+#include "miscadmin.h"
+#include "storage/backendid.h"
+#include "storage/ipc.h"
+#include "storage/proc.h"
+#include "storage/procsignal.h"
+#include "storage/shmem.h"
+#include "storage/sinvaladt.h"
+#include "storage/spin.h"
+
+/*
+ * Conceptually, the shared cache invalidation messages are stored in an
+ * infinite array, where maxMsgNum is the next array subscript to store a
+ * submitted message in, minMsgNum is the smallest array subscript containing
+ * a message not yet read by all backends, and we always have maxMsgNum >=
+ * minMsgNum. (They are equal when there are no messages pending.) For each
+ * active backend, there is a nextMsgNum pointer indicating the next message it
+ * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every
+ * backend.
+ *
+ * (In the current implementation, minMsgNum is a lower bound for the
+ * per-process nextMsgNum values, but it isn't rigorously kept equal to the
+ * smallest nextMsgNum --- it may lag behind. We only update it when
+ * SICleanupQueue is called, and we try not to do that often.)
+ *
+ * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES
+ * entries. We translate MsgNum values into circular-buffer indexes by
+ * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as
+ * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum
+ * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space
+ * in the buffer. If the buffer does overflow, we recover by setting the
+ * "reset" flag for each backend that has fallen too far behind. A backend
+ * that is in "reset" state is ignored while determining minMsgNum. When
+ * it does finally attempt to receive inval messages, it must discard all
+ * its invalidatable state, since it won't know what it missed.
+ *
+ * To reduce the probability of needing resets, we send a "catchup" interrupt
+ * to any backend that seems to be falling unreasonably far behind. The
+ * normal behavior is that at most one such interrupt is in flight at a time;
+ * when a backend completes processing a catchup interrupt, it executes
+ * SICleanupQueue, which will signal the next-furthest-behind backend if
+ * needed. This avoids undue contention from multiple backends all trying
+ * to catch up at once. However, the furthest-back backend might be stuck
+ * in a state where it can't catch up. Eventually it will get reset, so it
+ * won't cause any more problems for anyone but itself. But we don't want
+ * to find that a bunch of other backends are now too close to the reset
+ * threshold to be saved. So SICleanupQueue is designed to occasionally
+ * send extra catchup interrupts as the queue gets fuller, to backends that
+ * are far behind and haven't gotten one yet. As long as there aren't a lot
+ * of "stuck" backends, we won't need a lot of extra interrupts, since ones
+ * that aren't stuck will propagate their interrupts to the next guy.
+ *
+ * We would have problems if the MsgNum values overflow an integer, so
+ * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND
+ * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be
+ * large so that we don't need to do this often. It must be a multiple of
+ * MAXNUMMESSAGES so that the existing circular-buffer entries don't need
+ * to be moved when we do it.
+ *
+ * Access to the shared sinval array is protected by two locks, SInvalReadLock
+ * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this
+ * authorizes them to modify their own ProcState but not to modify or even
+ * look at anyone else's. When we need to perform array-wide updates,
+ * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to
+ * lock out all readers. Writers take SInvalWriteLock (always in exclusive
+ * mode) to serialize adding messages to the queue. Note that a writer
+ * can operate in parallel with one or more readers, because the writer
+ * has no need to touch anyone's ProcState, except in the infrequent cases
+ * when SICleanupQueue is needed. The only point of overlap is that
+ * the writer wants to change maxMsgNum while readers need to read it.
+ * We deal with that by having a spinlock that readers must take for just
+ * long enough to read maxMsgNum, while writers take it for just long enough
+ * to write maxMsgNum. (The exact rule is that you need the spinlock to
+ * read maxMsgNum if you are not holding SInvalWriteLock, and you need the
+ * spinlock to write maxMsgNum unless you are holding both locks.)
+ *
+ * Note: since maxMsgNum is an int and hence presumably atomically readable/
+ * writable, the spinlock might seem unnecessary. The reason it is needed
+ * is to provide a memory barrier: we need to be sure that messages written
+ * to the array are actually there before maxMsgNum is increased, and that
+ * readers will see that data after fetching maxMsgNum. Multiprocessors
+ * that have weak memory-ordering guarantees can fail without the memory
+ * barrier instructions that are included in the spinlock sequences.
+ */
+
+
+/*
+ * Configurable parameters.
+ *
+ * MAXNUMMESSAGES: max number of shared-inval messages we can buffer.
+ * Must be a power of 2 for speed.
+ *
+ * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow.
+ * Must be a multiple of MAXNUMMESSAGES. Should be large.
+ *
+ * CLEANUP_MIN: the minimum number of messages that must be in the buffer
+ * before we bother to call SICleanupQueue.
+ *
+ * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once
+ * we exceed CLEANUP_MIN. Should be a power of 2 for speed.
+ *
+ * SIG_THRESHOLD: the minimum number of messages a backend must have fallen
+ * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT.
+ *
+ * WRITE_QUANTUM: the max number of messages to push into the buffer per
+ * iteration of SIInsertDataEntries. Noncritical but should be less than
+ * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once
+ * per iteration.
+ */
+
+#define MAXNUMMESSAGES 4096
+#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144)
+#define CLEANUP_MIN (MAXNUMMESSAGES / 2)
+#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16)
+#define SIG_THRESHOLD (MAXNUMMESSAGES / 2)
+#define WRITE_QUANTUM 64
+
+/*
+ * Per-backend state in shared invalidation structure
+ *
+ * One entry exists per backend slot; a backend's slot index is its
+ * BackendId minus one (see SharedInvalBackendInit).
+ */
+typedef struct ProcState
+{
+ /* procPid is zero in an inactive ProcState array entry. */
+ pid_t procPid; /* PID of backend, for signaling */
+ PGPROC *proc; /* PGPROC of backend */
+ /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */
+ int nextMsgNum; /* next message number to read */
+ /* resetState is set by SICleanupQueue, cleared by SIGetDataEntries */
+ bool resetState; /* backend needs to reset its state */
+ /* signaled is set by SICleanupQueue, cleared by SIGetDataEntries */
+ bool signaled; /* backend has been sent catchup signal */
+ /* hasMessages is set by SIInsertDataEntries, cleared by SIGetDataEntries */
+ bool hasMessages; /* backend has unread messages */
+
+ /*
+ * Backend only sends invalidations, never receives them. This only makes
+ * sense for Startup process during recovery because it doesn't maintain a
+ * relcache, yet it fires inval messages to allow query backends to see
+ * schema changes.
+ */
+ bool sendOnly; /* backend only sends, never receives */
+
+ /*
+ * Next LocalTransactionId to use for each idle backend slot. We keep
+ * this here because it is indexed by BackendId and it is convenient to
+ * copy the value to and from local memory when MyBackendId is set. It's
+ * meaningless in an active ProcState entry.
+ */
+ LocalTransactionId nextLXID;
+} ProcState;
+
+/*
+ * Shared cache invalidation memory segment
+ *
+ * Allocated once by CreateSharedInvalidationState() with the size computed
+ * by SInvalShmemSize().
+ */
+typedef struct SISeg
+{
+ /*
+ * General state information
+ */
+ int minMsgNum; /* oldest message still needed */
+ int maxMsgNum; /* next message number to be assigned */
+ int nextThreshold; /* # of messages to call SICleanupQueue */
+ int lastBackend; /* index of last active procState entry, +1 */
+ int maxBackends; /* size of procState array */
+
+ slock_t msgnumLock; /* spinlock protecting maxMsgNum */
+
+ /*
+ * Circular buffer holding shared-inval messages; a message number is
+ * mapped to a slot with MsgNum % MAXNUMMESSAGES.
+ */
+ SharedInvalidationMessage buffer[MAXNUMMESSAGES];
+
+ /*
+ * Per-backend invalidation state info (has MaxBackends entries).
+ */
+ ProcState procState[FLEXIBLE_ARRAY_MEMBER];
+} SISeg;
+
+static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */
+
+
+static LocalTransactionId nextLocalTransactionId;
+
+static void CleanupInvalidationState(int status, Datum arg);
+
+
+/*
+ * SInvalShmemSize
+ *		Report how much shared memory the sinval machinery needs.
+ *
+ * That is everything in SISeg ahead of the procState array, plus one
+ * ProcState entry for each of MaxBackends backends.
+ */
+Size
+SInvalShmemSize(void)
+{
+    return add_size(offsetof(SISeg, procState),
+                    mul_size(sizeof(ProcState), MaxBackends));
+}
+
+/*
+ * CreateSharedInvalidationState
+ * Create and initialize the SI message buffer
+ *
+ * Obtains the "shmInvalBuffer" structure from shared memory.  If the
+ * structure already existed (found is true) it is used as-is; otherwise
+ * the message counters, cleanup threshold, spinlock and every ProcState
+ * entry are initialized here.
+ */
+void
+CreateSharedInvalidationState(void)
+{
+    int         i;
+    bool        found;
+
+    /* Allocate space in shared memory */
+    shmInvalBuffer = (SISeg *)
+        ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found);
+    if (found)
+        return;
+
+    /* Clear message counters, save size of procState array, init spinlock */
+    shmInvalBuffer->minMsgNum = 0;
+    shmInvalBuffer->maxMsgNum = 0;
+    shmInvalBuffer->nextThreshold = CLEANUP_MIN;
+    shmInvalBuffer->lastBackend = 0;
+    shmInvalBuffer->maxBackends = MaxBackends;
+    SpinLockInit(&shmInvalBuffer->msgnumLock);
+
+    /* The buffer[] array is initially all unused, so we need not fill it */
+
+    /* Mark all backends inactive, and initialize nextLXID */
+    for (i = 0; i < shmInvalBuffer->maxBackends; i++)
+    {
+        ProcState  *stateP = &shmInvalBuffer->procState[i];
+
+        stateP->procPid = 0;    /* inactive */
+        stateP->proc = NULL;
+        stateP->nextMsgNum = 0; /* meaningless */
+        stateP->resetState = false;
+        stateP->signaled = false;
+        stateP->hasMessages = false;
+        /* set explicitly so that no ProcState field starts out undefined */
+        stateP->sendOnly = false;
+        stateP->nextLXID = InvalidLocalTransactionId;
+    }
+}
+
+/*
+ * SharedInvalBackendInit
+ * Initialize a new backend to operate on the sinval buffer
+ *
+ * Claims a free procState slot (reusing an inactive one below lastBackend
+ * if there is one, else extending lastBackend), sets MyBackendId to the
+ * slot index plus one, and registers CleanupInvalidationState() to release
+ * the slot at exit. Reports FATAL if all maxBackends slots are in use.
+ *
+ * sendOnly is stored in the slot; SICleanupQueue skips slots that have it
+ * set.
+ */
+void
+SharedInvalBackendInit(bool sendOnly)
+{
+ int index;
+ ProcState *stateP = NULL;
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * This can run in parallel with read operations, but not with write
+ * operations, since SIInsertDataEntries relies on lastBackend to set
+ * hasMessages appropriately.
+ */
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /* Look for a free entry in the procState array */
+ for (index = 0; index < segP->lastBackend; index++)
+ {
+ if (segP->procState[index].procPid == 0) /* inactive slot? */
+ {
+ stateP = &segP->procState[index];
+ break;
+ }
+ }
+
+ /* No reusable slot below lastBackend: try to extend the active range */
+ if (stateP == NULL)
+ {
+ if (segP->lastBackend < segP->maxBackends)
+ {
+ stateP = &segP->procState[segP->lastBackend];
+ Assert(stateP->procPid == 0);
+ segP->lastBackend++;
+ }
+ else
+ {
+ /*
+ * out of procState slots: MaxBackends exceeded -- report normally
+ */
+ /* Reset MyBackendId and drop the lock before reporting */
+ MyBackendId = InvalidBackendId;
+ LWLockRelease(SInvalWriteLock);
+ ereport(FATAL,
+ (errcode(ERRCODE_TOO_MANY_CONNECTIONS),
+ errmsg("sorry, too many clients already")));
+ }
+ }
+
+ /* Backend IDs are 1-based: slot index plus one */
+ MyBackendId = (stateP - &segP->procState[0]) + 1;
+
+ /* Advertise assigned backend ID in MyProc */
+ MyProc->backendId = MyBackendId;
+
+ /* Fetch next local transaction ID into local memory */
+ nextLocalTransactionId = stateP->nextLXID;
+
+ /* mark myself active, with all extant messages already read */
+ stateP->procPid = MyProcPid;
+ stateP->proc = MyProc;
+ stateP->nextMsgNum = segP->maxMsgNum;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ stateP->hasMessages = false;
+ stateP->sendOnly = sendOnly;
+
+ LWLockRelease(SInvalWriteLock);
+
+ /* register exit routine to mark my entry inactive at exit */
+ on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP));
+
+ elog(DEBUG4, "my backend ID is %d", MyBackendId);
+}
+
+/*
+ * CleanupInvalidationState
+ *		Release this backend's procState slot.
+ *
+ * Registered with on_shmem_exit() by SharedInvalBackendInit, so it runs
+ * during backend shutdown.  "arg" carries the SISeg pointer as a Datum;
+ * "status" is not used.
+ */
+static void
+CleanupInvalidationState(int status, Datum arg)
+{
+    SISeg      *seg = (SISeg *) DatumGetPointer(arg);
+    ProcState  *mySlot;
+    int         last;
+
+    Assert(PointerIsValid(seg));
+
+    LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+    mySlot = &seg->procState[MyBackendId - 1];
+
+    /* Hand our local-XID counter on to the next occupant of this slot */
+    mySlot->nextLXID = nextLocalTransactionId;
+
+    /* The slot is now free */
+    mySlot->procPid = 0;
+    mySlot->proc = NULL;
+    mySlot->nextMsgNum = 0;
+    mySlot->resetState = false;
+    mySlot->signaled = false;
+
+    /* Shrink lastBackend past any trailing run of inactive slots */
+    last = seg->lastBackend;
+    while (last > 0 && seg->procState[last - 1].procPid == 0)
+        last--;
+    seg->lastBackend = last;
+
+    LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * BackendIdGetProc
+ *		Look up the PGPROC belonging to a backend ID.
+ *
+ * Returns NULL when the ID is out of range or the slot is not in use.
+ * The answer can become stale the moment the lock is released, so the
+ * caller must be careful about how it uses the result.
+ */
+PGPROC *
+BackendIdGetProc(int backendID)
+{
+    SISeg      *seg = shmInvalBuffer;
+    PGPROC     *proc = NULL;
+
+    /* Holding the lock keeps backends from being added or removed */
+    LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+    if (backendID > 0 && backendID <= seg->lastBackend)
+        proc = seg->procState[backendID - 1].proc;
+
+    LWLockRelease(SInvalWriteLock);
+
+    return proc;
+}
+
+/*
+ * BackendIdGetTransactionIds
+ *		Report the xid and xmin of the backend with the given ID.
+ *
+ * Both outputs are set to InvalidTransactionId when the ID is out of
+ * range or the slot has no PGPROC.  The values can be out of date
+ * arbitrarily quickly, so the caller must be careful about how it uses
+ * them.
+ */
+void
+BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin)
+{
+    SISeg      *seg = shmInvalBuffer;
+    PGPROC     *proc = NULL;
+
+    *xid = InvalidTransactionId;
+    *xmin = InvalidTransactionId;
+
+    /* Holding the lock keeps backends from being added or removed */
+    LWLockAcquire(SInvalWriteLock, LW_SHARED);
+
+    if (backendID > 0 && backendID <= seg->lastBackend)
+        proc = seg->procState[backendID - 1].proc;
+
+    if (proc != NULL)
+    {
+        *xid = proc->xid;
+        *xmin = proc->xmin;
+    }
+
+    LWLockRelease(SInvalWriteLock);
+}
+
+/*
+ * SIInsertDataEntries
+ * Add new invalidation message(s) to the buffer.
+ *
+ * data points to an array of n messages. They are copied into the circular
+ * buffer in groups of at most WRITE_QUANTUM, taking and releasing
+ * SInvalWriteLock once per group.
+ */
+void
+SIInsertDataEntries(const SharedInvalidationMessage *data, int n)
+{
+ SISeg *segP = shmInvalBuffer;
+
+ /*
+ * N can be arbitrarily large. We divide the work into groups of no more
+ * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for
+ * an unreasonably long time. (This is not so much because we care about
+ * letting in other writers, as that some just-caught-up backend might be
+ * trying to do SICleanupQueue to pass on its signal, and we don't want it
+ * to have to wait a long time.) Also, we need to consider calling
+ * SICleanupQueue every so often.
+ */
+ while (n > 0)
+ {
+ int nthistime = Min(n, WRITE_QUANTUM);
+ int numMsgs;
+ int max;
+ int i;
+
+ n -= nthistime;
+
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+
+ /*
+ * If the buffer is full, we *must* acquire some space. Clean the
+ * queue and reset anyone who is preventing space from being freed.
+ * Otherwise, clean the queue only when it's exceeded the next
+ * fullness threshold. We have to loop and recheck the buffer state
+ * after any call of SICleanupQueue.
+ */
+ for (;;)
+ {
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs + nthistime > MAXNUMMESSAGES ||
+ numMsgs >= segP->nextThreshold)
+ SICleanupQueue(true, nthistime);
+ else
+ break;
+ }
+
+ /*
+ * Insert new message(s) into proper slot of circular buffer
+ */
+ /* work on a local copy; segP->maxMsgNum is published only afterwards */
+ max = segP->maxMsgNum;
+ while (nthistime-- > 0)
+ {
+ segP->buffer[max % MAXNUMMESSAGES] = *data++;
+ max++;
+ }
+
+ /* Update current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ segP->maxMsgNum = max;
+ SpinLockRelease(&segP->msgnumLock);
+
+ /*
+ * Now that the maxMsgNum change is globally visible, we give everyone
+ * a swift kick to make sure they read the newly added messages.
+ * Releasing SInvalWriteLock will enforce a full memory barrier, so
+ * these (unlocked) changes will be committed to memory before we exit
+ * the function.
+ */
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+
+ stateP->hasMessages = true;
+ }
+
+ LWLockRelease(SInvalWriteLock);
+ }
+}
+
+/*
+ * SIGetDataEntries
+ * get next SI message(s) for current backend, if there are any
+ *
+ * data is the caller's output array and datasize its capacity in messages.
+ *
+ * Possible return values:
+ * 0: no SI message available
+ * n>0: next n SI messages have been extracted into data[]
+ * -1: SI reset message extracted
+ *
+ * If the return value is less than the array size "datasize", the caller
+ * can assume that there are no more SI messages after the one(s) returned.
+ * Otherwise, another call is needed to collect more messages.
+ *
+ * NB: this can run in parallel with other instances of SIGetDataEntries
+ * executing on behalf of other backends, since each instance will modify only
+ * fields of its own backend's ProcState, and no instance will look at fields
+ * of other backends' ProcStates. We express this by grabbing SInvalReadLock
+ * in shared mode. Note that this is not exactly the normal (read-only)
+ * interpretation of a shared lock! Look closely at the interactions before
+ * allowing SInvalReadLock to be grabbed in shared mode for any other reason!
+ *
+ * NB: this can also run in parallel with SIInsertDataEntries. It is not
+ * guaranteed that we will return any messages added after the routine is
+ * entered.
+ *
+ * Note: we assume that "datasize" is not so large that it might be important
+ * to break our hold on SInvalReadLock into segments.
+ */
+int
+SIGetDataEntries(SharedInvalidationMessage *data, int datasize)
+{
+ SISeg *segP;
+ ProcState *stateP;
+ int max;
+ int n;
+
+ segP = shmInvalBuffer;
+ /* our own slot: backend IDs are 1-based, the array is 0-based */
+ stateP = &segP->procState[MyBackendId - 1];
+
+ /*
+ * Before starting to take locks, do a quick, unlocked test to see whether
+ * there can possibly be anything to read. On a multiprocessor system,
+ * it's possible that this load could migrate backwards and occur before
+ * we actually enter this function, so we might miss a sinval message that
+ * was just added by some other processor. But they can't migrate
+ * backwards over a preceding lock acquisition, so it should be OK. If we
+ * haven't acquired a lock preventing against further relevant
+ * invalidations, any such occurrence is not much different than if the
+ * invalidation had arrived slightly later in the first place.
+ */
+ if (!stateP->hasMessages)
+ return 0;
+
+ LWLockAcquire(SInvalReadLock, LW_SHARED);
+
+ /*
+ * We must reset hasMessages before determining how many messages we're
+ * going to read. That way, if new messages arrive after we have
+ * determined how many we're reading, the flag will get reset and we'll
+ * notice those messages part-way through.
+ *
+ * Note that, if we don't end up reading all of the messages, we had
+ * better be certain to reset this flag before exiting!
+ */
+ stateP->hasMessages = false;
+
+ /* Fetch current value of maxMsgNum using spinlock */
+ SpinLockAcquire(&segP->msgnumLock);
+ max = segP->maxMsgNum;
+ SpinLockRelease(&segP->msgnumLock);
+
+ if (stateP->resetState)
+ {
+ /*
+ * Force reset. We can say we have dealt with any messages added
+ * since the reset, as well; and that means we should clear the
+ * signaled flag, too.
+ */
+ stateP->nextMsgNum = max;
+ stateP->resetState = false;
+ stateP->signaled = false;
+ LWLockRelease(SInvalReadLock);
+ return -1;
+ }
+
+ /*
+ * Retrieve messages and advance backend's counter, until data array is
+ * full or there are no more messages.
+ *
+ * There may be other backends that haven't read the message(s), so we
+ * cannot delete them here. SICleanupQueue() will eventually remove them
+ * from the queue.
+ */
+ n = 0;
+ while (n < datasize && stateP->nextMsgNum < max)
+ {
+ data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES];
+ stateP->nextMsgNum++;
+ }
+
+ /*
+ * If we have caught up completely, reset our "signaled" flag so that
+ * we'll get another signal if we fall behind again.
+ *
+ * If we haven't caught up completely, reset the hasMessages flag so that
+ * we see the remaining messages next time.
+ */
+ if (stateP->nextMsgNum >= max)
+ stateP->signaled = false;
+ else
+ stateP->hasMessages = true;
+
+ LWLockRelease(SInvalReadLock);
+ return n;
+}
+
+/*
+ * SICleanupQueue
+ * Remove messages that have been consumed by all active backends
+ *
+ * callerHasWriteLock is true if caller is holding SInvalWriteLock.
+ * minFree is the minimum number of message slots to make free.
+ *
+ * Possible side effects of this routine include marking one or more
+ * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT
+ * to some backend that seems to be getting too far behind. We signal at
+ * most one backend at a time, for reasons explained at the top of the file.
+ *
+ * Caution: because we transiently release write lock when we have to signal
+ * some other backend, it is NOT guaranteed that there are still minFree
+ * free message slots at exit. Caller must recheck and perhaps retry.
+ *
+ * On return the caller's hold on SInvalWriteLock is as it was on entry.
+ */
+void
+SICleanupQueue(bool callerHasWriteLock, int minFree)
+{
+ SISeg *segP = shmInvalBuffer;
+ int min,
+ minsig,
+ lowbound,
+ numMsgs,
+ i;
+ ProcState *needSig = NULL;
+
+ /* Lock out all writers and readers */
+ if (!callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE);
+
+ /*
+ * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the
+ * furthest-back backend that needs signaling (if any), and reset any
+ * backends that are too far back. Note that because we ignore sendOnly
+ * backends here it is possible for them to keep sending messages without
+ * a problem even when they are the only active backend.
+ */
+ min = segP->maxMsgNum;
+ /* backends whose nextMsgNum is below minsig are candidates for a signal */
+ minsig = min - SIG_THRESHOLD;
+ /* backends whose nextMsgNum is below lowbound get forced into reset */
+ lowbound = min - MAXNUMMESSAGES + minFree;
+
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ ProcState *stateP = &segP->procState[i];
+ int n = stateP->nextMsgNum;
+
+ /* Ignore if inactive, already in reset state, or send-only */
+ if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly)
+ continue;
+
+ /*
+ * If we must free some space and this backend is preventing it, force
+ * him into reset state and then ignore until he catches up.
+ */
+ if (n < lowbound)
+ {
+ stateP->resetState = true;
+ /* no point in signaling him ... */
+ continue;
+ }
+
+ /* Track the global minimum nextMsgNum */
+ if (n < min)
+ min = n;
+
+ /* Also see who's furthest back of the unsignaled backends */
+ if (n < minsig && !stateP->signaled)
+ {
+ minsig = n;
+ needSig = stateP;
+ }
+ }
+ segP->minMsgNum = min;
+
+ /*
+ * When minMsgNum gets really large, decrement all message counters so as
+ * to forestall overflow of the counters. This happens seldom enough that
+ * folding it into the previous loop would be a loser.
+ */
+ if (min >= MSGNUMWRAPAROUND)
+ {
+ segP->minMsgNum -= MSGNUMWRAPAROUND;
+ segP->maxMsgNum -= MSGNUMWRAPAROUND;
+ for (i = 0; i < segP->lastBackend; i++)
+ {
+ /* we don't bother skipping inactive entries here */
+ segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND;
+ }
+ }
+
+ /*
+ * Determine how many messages are still in the queue, and set the
+ * threshold at which we should repeat SICleanupQueue().
+ */
+ numMsgs = segP->maxMsgNum - segP->minMsgNum;
+ if (numMsgs < CLEANUP_MIN)
+ segP->nextThreshold = CLEANUP_MIN;
+ else
+ segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM;
+
+ /*
+ * Lastly, signal anyone who needs a catchup interrupt. Since
+ * SendProcSignal() might not be fast, we don't want to hold locks while
+ * executing it.
+ */
+ if (needSig)
+ {
+ /* capture the target's identity while we still hold the locks */
+ pid_t his_pid = needSig->procPid;
+ BackendId his_backendId = (needSig - &segP->procState[0]) + 1;
+
+ needSig->signaled = true;
+ /* both locks are dropped here, even if the caller held the write lock */
+ LWLockRelease(SInvalReadLock);
+ LWLockRelease(SInvalWriteLock);
+ elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid);
+ SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId);
+ /* restore the caller's write lock if it had one on entry */
+ if (callerHasWriteLock)
+ LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE);
+ }
+ else
+ {
+ LWLockRelease(SInvalReadLock);
+ if (!callerHasWriteLock)
+ LWLockRelease(SInvalWriteLock);
+ }
+}
+
+
+/*
+ * GetNextLocalTransactionId --- hand out a fresh LocalTransactionId
+ *
+ * A VirtualTransactionId has two parts: the high-order part is a
+ * BackendId and the low-order part is a LocalTransactionId taken from a
+ * backend-local counter.  Splitting it this way lets a new one be
+ * allocated without any contention for shared memory, apart from a bit
+ * of extra work at backend startup/shutdown.  To avoid reusing a
+ * VirtualTransactionId within a short interval, successive procs
+ * occupying the same backend ID slot continue one sequence of local IDs;
+ * that is done by copying nextLocalTransactionId to and from the slot's
+ * nextLXID in SharedInvalBackendInit and CleanupInvalidationState.
+ */
+LocalTransactionId
+GetNextLocalTransactionId(void)
+{
+    /* keep drawing so that wraparound never yields the invalid value */
+    for (;;)
+    {
+        LocalTransactionId candidate = nextLocalTransactionId++;
+
+        if (LocalTransactionIdIsValid(candidate))
+            return candidate;
+    }
+}
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
new file mode 100644
index 0000000..687ce03
--- /dev/null
+++ b/src/backend/storage/ipc/standby.c
@@ -0,0 +1,1450 @@
+/*-------------------------------------------------------------------------
+ *
+ * standby.c
+ * Misc functions used in Hot Standby mode.
+ *
+ * All functions for handling RM_STANDBY_ID, which relate to
+ * AccessExclusiveLocks and starting snapshots for Hot Standby mode.
+ * Plus conflict recovery processing.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/ipc/standby.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "access/transam.h"
+#include "access/twophase.h"
+#include "access/xact.h"
+#include "access/xlog.h"
+#include "access/xloginsert.h"
+#include "miscadmin.h"
+#include "pgstat.h"
+#include "storage/bufmgr.h"
+#include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/procarray.h"
+#include "storage/sinvaladt.h"
+#include "storage/standby.h"
+#include "utils/hsearch.h"
+#include "utils/memutils.h"
+#include "utils/ps_status.h"
+#include "utils/timeout.h"
+#include "utils/timestamp.h"
+
+/*
+ * User-settable GUC parameters
+ *
+ * The two max_standby_*_delay settings are in milliseconds (they are added
+ * to the WAL receipt time with TimestampTzPlusMilliseconds); a negative
+ * value means wait forever.  See GetStandbyLimitTime().
+ * log_recovery_conflict_waits enables the log messages emitted through
+ * LogRecoveryConflict().
+ */
+int vacuum_defer_cleanup_age;
+int max_standby_archive_delay = 30 * 1000;
+int max_standby_streaming_delay = 30 * 1000;
+bool log_recovery_conflict_waits = false;
+
+/*
+ * Hash table mapping an xid to the list of locks held on its behalf (the
+ * entries are RecoveryLockListsEntry).  It is NULL while transaction
+ * tracking has not been initialized or after it has been shut down.
+ */
+static HTAB *RecoveryLockLists;
+
+/*
+ * Flags set by timeout handlers
+ *
+ * Each flag is set to true by the matching Standby*Handler() function and
+ * reset by the ResolveRecoveryConflictWith*() function that tests it.
+ */
+static volatile sig_atomic_t got_standby_deadlock_timeout = false;
+static volatile sig_atomic_t got_standby_delay_timeout = false;
+static volatile sig_atomic_t got_standby_lock_timeout = false;
+
+/* Prototypes for functions private to this file */
+static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason,
+ uint32 wait_event_info,
+ bool report_waiting);
+static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason);
+static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts);
+static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks);
+static const char *get_recovery_conflict_desc(ProcSignalReason reason);
+
+/*
+ * Keep track of all the locks owned by a given transaction.
+ *
+ * xid is the hash key of RecoveryLockLists (the table is created with a
+ * keysize of sizeof(TransactionId)); locks is a List whose elements are
+ * pointers to xl_standby_lock.
+ */
+typedef struct RecoveryLockListsEntry
+{
+ TransactionId xid;
+ List *locks;
+} RecoveryLockListsEntry;
+
+/*
+ * InitRecoveryTransactionEnvironment
+ * Initialize tracking of our primary's in-progress transactions.
+ *
+ * We need to issue shared invalidations and hold locks. Holding locks
+ * means others may want to wait on us, so we need to make a lock table
+ * vxact entry like a real transaction. We could create and delete
+ * lock table entries for each transaction but it's simpler just to create
+ * one permanent entry and leave it there all the time. Locks are then
+ * acquired and released as needed. Yes, this means you can see the
+ * Startup process in pg_locks once we have run this.
+ */
+void
+InitRecoveryTransactionEnvironment(void)
+{
+	HASHCTL		lockListsCtl;
+	VirtualTransactionId startupVxid;
+
+	/*
+	 * Create the local hash table that maps each xid to the list of locks
+	 * held on its behalf.
+	 */
+	lockListsCtl.keysize = sizeof(TransactionId);
+	lockListsCtl.entrysize = sizeof(RecoveryLockListsEntry);
+	RecoveryLockLists = hash_create("RecoveryLockLists",
+									64,
+									&lockListsCtl,
+									HASH_ELEM | HASH_BLOBS);
+
+	/*
+	 * Set up shared invalidation management for the Startup process.  We
+	 * register as a sendOnly process, so we never have to read messages and
+	 * are not signaled when the queue starts filling up.
+	 */
+	SharedInvalBackendInit(true);
+
+	/*
+	 * Lock a virtual transaction id for the Startup process.
+	 *
+	 * SharedInvalBackendInit() leaves localTransactionId invalid, which the
+	 * lock manager does not accept, so a fresh one is obtained from
+	 * GetNextLocalTransactionId().
+	 *
+	 * XactLockTableInsert() is not needed because nobody needs to wait on
+	 * xids.  That sounds a little strange, but table locks are held by vxids
+	 * and row level locks are held by xids.  All queries hold
+	 * AccessShareLocks, so they never block while we write or lock new rows.
+	 */
+	startupVxid.backendId = MyBackendId;
+	startupVxid.localTransactionId = GetNextLocalTransactionId();
+	VirtualXactLockTableInsert(startupVxid);
+
+	standbyState = STANDBY_INITIALIZED;
+}
+
+/*
+ * ShutdownRecoveryTransactionEnvironment
+ * Shut down transaction tracking
+ *
+ * Prepare to switch from hot standby mode to normal operation. Shut down
+ * recovery-time transaction tracking.
+ *
+ * This must be called even in shutdown of startup process if transaction
+ * tracking has been initialized. Otherwise some locks the tracked
+ * transactions were holding will not be released and may interfere with
+ * the processes still running (but will exit soon later) at the exit of
+ * startup process.
+ */
+void
+ShutdownRecoveryTransactionEnvironment(void)
+{
+	/*
+	 * A NULL RecoveryLockLists means that transaction tracking has not been
+	 * initialized yet or has already been shut down; in either case there is
+	 * nothing to do.  This also keeps a repeated call from shutting down
+	 * transaction tracking more than once.
+	 */
+	if (RecoveryLockLists != NULL)
+	{
+		/* Treat every tracked in-progress transaction as finished. */
+		ExpireAllKnownAssignedTransactionIds();
+
+		/* Drop all locks held on behalf of the tracked transactions. */
+		StandbyReleaseAllLocks();
+
+		/* Discard the hash table of locks and remember that it is gone. */
+		hash_destroy(RecoveryLockLists);
+		RecoveryLockLists = NULL;
+
+		/* Finally, clean up our VirtualTransaction. */
+		VirtualXactLockTableCleanup();
+	}
+}
+
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+	TimestampTz receiptTime;
+	bool		fromStream;
+	int			delay_ms;
+
+	/* Find out when the last WAL data arrived and whether it was streamed. */
+	GetXLogReceiptTime(&receiptTime, &fromStream);
+
+	/* Pick the delay setting that applies to that WAL source. */
+	delay_ms = fromStream ? max_standby_streaming_delay : max_standby_archive_delay;
+
+	/* A negative delay (e.g. -1) means wait forever, reported as zero. */
+	if (delay_ms < 0)
+		return 0;
+
+	/* Otherwise the cutoff is the receipt time plus the chosen delay. */
+	return TimestampTzPlusMilliseconds(receiptTime, delay_ms);
+}
+
+#define STANDBY_INITIAL_WAIT_US  1000
+
+/* Length of the next sleep in WaitExceedsMaxStandbyDelay(), in microseconds */
+static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+/*
+ * One polling step of the wait in ResolveRecoveryConflictWithVirtualXIDs.
+ *
+ * Returns true, without sleeping, if the standby limit time has already
+ * passed.  Otherwise sleeps for standbyWait_us microseconds, doubles that
+ * interval for the next call (up to one second), and returns false.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
+{
+	TimestampTz limit;
+
+	CHECK_FOR_INTERRUPTS();
+
+	/* A zero limit means wait forever, so it can never be exceeded. */
+	limit = GetStandbyLimitTime();
+	if (limit != 0 && GetCurrentTimestamp() >= limit)
+		return true;
+
+	/*
+	 * Sleep a bit (this is essential to avoid busy-waiting).  The sleep is
+	 * reported under the caller-supplied wait event.
+	 */
+	pgstat_report_wait_start(wait_event_info);
+	pg_usleep(standbyWait_us);
+	pgstat_report_wait_end();
+
+	/*
+	 * Back off progressively, but never sleep more than 1s at a time, since
+	 * pg_usleep isn't interruptible on some platforms.
+	 */
+	standbyWait_us = Min(standbyWait_us * 2, 1000000);
+
+	return false;
+}
+
+/*
+ * Log the recovery conflict.
+ *
+ * wait_start is the timestamp when the caller started to wait.
+ * now is the timestamp when this function has been called.
+ * wait_list is the list of virtual transaction ids assigned to
+ * conflicting processes. still_waiting indicates whether
+ * the startup process is still waiting for the recovery conflict
+ * to be resolved or not.
+ */
+void
+LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start,
+					TimestampTz now, VirtualTransactionId *wait_list,
+					bool still_waiting)
+{
+	long		secs;
+	int			usecs;
+	long		msecs;
+	StringInfoData buf;
+	int			nprocs = 0;
+
+	/*
+	 * Once the recovery conflict has been resolved there must be no
+	 * conflicting processes left to report.
+	 */
+	Assert(still_waiting || wait_list == NULL);
+
+	/* Split the elapsed wait into whole milliseconds and leftover usecs. */
+	TimestampDifference(wait_start, now, &secs, &usecs);
+	msecs = secs * 1000 + usecs / 1000;
+	usecs = usecs % 1000;
+
+	/* Build a comma-separated list of the PIDs of the conflicting processes */
+	if (wait_list != NULL)
+	{
+		VirtualTransactionId *cur;
+
+		for (cur = wait_list; VirtualTransactionIdIsValid(*cur); cur++)
+		{
+			PGPROC	   *proc = BackendIdGetProc(cur->backendId);
+
+			/* proc is NULL if the target backend is not active; skip it */
+			if (proc == NULL)
+				continue;
+
+			/* buf is only initialized once the first PID is found */
+			if (nprocs == 0)
+			{
+				initStringInfo(&buf);
+				appendStringInfo(&buf, "%d", proc->pid);
+			}
+			else
+				appendStringInfo(&buf, ", %d", proc->pid);
+
+			nprocs++;
+		}
+	}
+
+	if (!still_waiting)
+	{
+		ereport(LOG,
+				errmsg("recovery finished waiting after %ld.%03d ms: %s",
+					   msecs, usecs, get_recovery_conflict_desc(reason)));
+	}
+	else
+	{
+		/*
+		 * While still waiting, the PIDs of the active conflicting backends
+		 * go into a detail message.  If none of the listed backends is
+		 * active, no detail message is logged.
+		 */
+		ereport(LOG,
+				errmsg("recovery still waiting after %ld.%03d ms: %s",
+					   msecs, usecs, get_recovery_conflict_desc(reason)),
+				nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.",
+												  "Conflicting processes: %s.",
+												  nprocs, buf.data) : 0);
+	}
+
+	/* buf owns memory only if at least one PID was appended */
+	if (nprocs > 0)
+		pfree(buf.data);
+}
+
+/*
+ * This is the main executioner for any query backend that conflicts with
+ * recovery processing. Judgement has already been passed on it within
+ * a specific rmgr. Here we just issue the orders to the procs. The procs
+ * then throw the required error as instructed.
+ *
+ * If report_waiting is true, "waiting" is reported in PS display and the
+ * wait for recovery conflict is reported in the log, if necessary. If
+ * the caller is responsible for reporting them, report_waiting should be
+ * false. Otherwise, both the caller and this function report the same
+ * thing unexpectedly.
+ */
+static void
+ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
+ ProcSignalReason reason, uint32 wait_event_info,
+ bool report_waiting)
+{
+ /*
+ * waitStart stays 0 unless reporting is wanted; 0 disables the reporting
+ * code in the wait loop.  new_status becomes non-NULL once the ps display
+ * has been changed.  logged_recovery_conflict records that the "still
+ * waiting" message has been emitted.
+ */
+ TimestampTz waitStart = 0;
+ char *new_status = NULL;
+ bool logged_recovery_conflict = false;
+
+ /* Fast exit, to avoid a kernel call if there's no work to be done. */
+ if (!VirtualTransactionIdIsValid(*waitlist))
+ return;
+
+ /* Set the wait start timestamp for reporting */
+ if (report_waiting && (log_recovery_conflict_waits || update_process_title))
+ waitStart = GetCurrentTimestamp();
+
+ /* The list is terminated by an invalid VirtualTransactionId. */
+ while (VirtualTransactionIdIsValid(*waitlist))
+ {
+ /* reset standbyWait_us for each xact we wait for */
+ standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+ /* wait until the virtual xid is gone */
+ while (!VirtualXactLock(*waitlist, false))
+ {
+ /* Is it time to kill it? */
+ if (WaitExceedsMaxStandbyDelay(wait_event_info))
+ {
+ pid_t pid;
+
+ /*
+ * Now find out who to throw out of the balloon.
+ */
+ Assert(VirtualTransactionIdIsValid(*waitlist));
+ pid = CancelVirtualTransaction(*waitlist, reason);
+
+ /*
+ * Wait a little bit for it to die so that we avoid flooding
+ * an unresponsive backend when system is heavily loaded.
+ */
+ /* NOTE(review): presumably pid is 0 when no backend was signaled; confirm in CancelVirtualTransaction() */
+ if (pid != 0)
+ pg_usleep(5000L);
+ }
+
+ /*
+ * Reporting is attempted only while something is still unreported:
+ * either the log message or the ps display change.
+ */
+ if (waitStart != 0 && (!logged_recovery_conflict || new_status == NULL))
+ {
+ TimestampTz now = 0;
+ bool maybe_log_conflict;
+ bool maybe_update_title;
+
+ maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict);
+ maybe_update_title = (update_process_title && new_status == NULL);
+
+ /* Get the current timestamp if something is still to be reported */
+ if (maybe_log_conflict || maybe_update_title)
+ now = GetCurrentTimestamp();
+
+ /*
+ * Report via ps if we have been waiting for more than 500
+ * msec (should that be configurable?)
+ */
+ if (maybe_update_title &&
+ TimestampDifferenceExceeds(waitStart, now, 500))
+ {
+ const char *old_status;
+ int len;
+
+ old_status = get_ps_display(&len);
+ /* room for the old title, the 8 bytes of " waiting", and a NUL */
+ new_status = (char *) palloc(len + 8 + 1);
+ memcpy(new_status, old_status, len);
+ strcpy(new_status + len, " waiting");
+ set_ps_display(new_status);
+ /* new_status now holds the old title again, for the reset below */
+ new_status[len] = '\0'; /* truncate off " waiting" */
+ }
+
+ /*
+ * Emit the log message if the startup process is waiting
+ * longer than deadlock_timeout for recovery conflict.
+ */
+ if (maybe_log_conflict &&
+ TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout))
+ {
+ /*
+ * waitlist has already been advanced past the vxids that are
+ * gone, so only the current and remaining entries are listed.
+ */
+ LogRecoveryConflict(reason, waitStart, now, waitlist, true);
+ logged_recovery_conflict = true;
+ }
+ }
+ }
+
+ /* The virtual transaction is gone now, wait for the next one */
+ waitlist++;
+ }
+
+ /*
+ * Emit the log message if recovery conflict was resolved but the startup
+ * process waited longer than deadlock_timeout for it.
+ */
+ if (logged_recovery_conflict)
+ LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(),
+ NULL, false);
+
+ /* Reset ps display if we changed it */
+ if (new_status)
+ {
+ set_ps_display(new_status);
+ pfree(new_status);
+ }
+}
+
+/*
+ * ResolveRecoveryConflictWithSnapshot
+ *		Resolve the conflicts with the virtual transactions that
+ *		GetConflictingVirtualXIDs() reports for latestRemovedXid in the
+ *		database node.dbNode.
+ */
+void
+ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node)
+{
+	VirtualTransactionId *conflicts;
+
+	/*
+	 * An InvalidTransactionId means there is no conflict, so nothing is done.
+	 *
+	 * This can happen when replaying already-applied WAL records after a
+	 * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE
+	 * record that marks as frozen a page which was already all-visible.  It
+	 * is also quite common with records generated during index deletion
+	 * (original execution of the deletion can reason that a recovery
+	 * conflict which is sufficient for the deletion operation must take
+	 * place before replay of the deletion record itself).
+	 */
+	if (TransactionIdIsValid(latestRemovedXid))
+	{
+		conflicts = GetConflictingVirtualXIDs(latestRemovedXid, node.dbNode);
+
+		/* report_waiting = true: let the callee report the wait itself */
+		ResolveRecoveryConflictWithVirtualXIDs(conflicts,
+											   PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+											   WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT,
+											   true);
+	}
+}
+
+/*
+ * Variant of ResolveRecoveryConflictWithSnapshot that works with
+ * FullTransactionId values
+ */
+void
+ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid,
+										   RelFileNode node)
+{
+	FullTransactionId nextFullXid = ReadNextFullTransactionId();
+	uint64		age;
+
+	/*
+	 * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds,
+	 * so the logged FullTransactionId has to be truncated.  If the logged
+	 * value is so old that XID wrap-around has already happened on it, no
+	 * snapshot can still see it and there is nothing to resolve.
+	 */
+	age = U64FromFullTransactionId(nextFullXid) -
+		U64FromFullTransactionId(latestRemovedFullXid);
+	if (age >= MaxTransactionId / 2)
+		return;
+
+	ResolveRecoveryConflictWithSnapshot(XidFromFullTransactionId(latestRemovedFullXid),
+										node);
+}
+
+/*
+ * ResolveRecoveryConflictWithTablespace
+ *		Cancel every virtual transaction returned by
+ *		GetConflictingVirtualXIDs(InvalidTransactionId, InvalidOid).
+ *
+ * Note that tsid itself is not consulted here.
+ */
+void
+ResolveRecoveryConflictWithTablespace(Oid tsid)
+{
+	VirtualTransactionId *victims;
+
+	/*
+	 * Standby users may currently be using this tablespace for their
+	 * temporary files.  Only current users matter, because the
+	 * temp_tablespace parameter just ignores tablespaces that no longer
+	 * exist.
+	 *
+	 * Everybody is asked to cancel their queries immediately, so that we can
+	 * be sure no temp files remain and the tablespace can be removed.  Nuke
+	 * the entire site from orbit, it's the only way to be sure.
+	 *
+	 * XXX: We could work out the pids of active backends using this
+	 * tablespace by examining the temp filenames in the directory.  We would
+	 * then convert the pids into VirtualXIDs before attempting to cancel
+	 * them.
+	 *
+	 * There is no wait for commit because drop tablespace is
+	 * non-transactional.
+	 */
+	victims = GetConflictingVirtualXIDs(InvalidTransactionId, InvalidOid);
+
+	ResolveRecoveryConflictWithVirtualXIDs(victims,
+										   PROCSIG_RECOVERY_CONFLICT_TABLESPACE,
+										   WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE,
+										   true);
+}
+
+/*
+ * ResolveRecoveryConflictWithDatabase
+ *		Keep canceling the backends counted by CountDBBackends(dbid) until
+ *		none remain.
+ */
+void
+ResolveRecoveryConflictWithDatabase(Oid dbid)
+{
+	/*
+	 * ResolveRecoveryConflictWithVirtualXIDs() is not used here: it only
+	 * waits for transactions, so completely idle sessions would block us.
+	 * This case is rare enough that it is handled as simply as possible: no
+	 * wait, just force them off immediately.
+	 *
+	 * No locking is required here because we already acquired
+	 * AccessExclusiveLock.  Anybody trying to connect while we do this will
+	 * block during InitPostgres() and then disconnect when they see the
+	 * database has been removed.
+	 */
+	for (;;)
+	{
+		if (CountDBBackends(dbid) <= 0)
+			break;
+
+		CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true);
+
+		/*
+		 * Give them a moment to die, so that an unresponsive backend is not
+		 * flooded with signals when the system is heavily loaded.
+		 */
+		pg_usleep(10000);
+	}
+}
+
+/*
+ * ResolveRecoveryConflictWithLock is called from ProcSleep()
+ * to resolve conflicts with other backends holding relation locks.
+ *
+ * The WaitLatch sleep normally done in ProcSleep()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by canceling all backends holding a conflicting
+ * lock. As we are already queued to be granted the lock, no new lock
+ * requests conflicting with ours will be granted in the meantime.
+ *
+ * We also must check for deadlocks involving the Startup process and
+ * hot-standby backend processes. If deadlock_timeout is reached in
+ * this function, all the backends holding the conflicting locks are
+ * requested to check themselves for deadlocks.
+ *
+ * logging_conflict should be true if the recovery conflict has not been
+ * logged yet even though logging is enabled. After deadlock_timeout is
+ * reached and the request for deadlock check is sent, we wait again to
+ * be signaled by the release of the lock if logging_conflict is false.
+ * Otherwise we return without waiting again so that the caller can report
+ * the recovery conflict. In this case, then, this function is called again
+ * with logging_conflict=false (because the recovery conflict has already
+ * been logged) and we will wait again for the lock to be released.
+ */
+void
+ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict)
+{
+ TimestampTz ltime;
+ TimestampTz now;
+
+ Assert(InHotStandby);
+
+ /* ltime is 0 when we are willing to wait forever (see GetStandbyLimitTime) */
+ ltime = GetStandbyLimitTime();
+ now = GetCurrentTimestamp();
+
+ /*
+ * Update waitStart if first time through after the startup process
+ * started waiting for the lock. It should not be updated every time
+ * ResolveRecoveryConflictWithLock() is called during the wait.
+ *
+ * Use the current time obtained for comparison with ltime as waitStart
+ * (i.e., the time when this process started waiting for the lock). Since
+ * getting the current time newly can cause overhead, we reuse the
+ * already-obtained time to avoid that overhead.
+ *
+ * Note that waitStart is updated without holding the lock table's
+ * partition lock, to avoid the overhead by additional lock acquisition.
+ * This can cause "waitstart" in pg_locks to become NULL for a very short
+ * period of time after the wait started even though "granted" is false.
+ * This is OK in practice because we can assume that users are likely to
+ * look at "waitstart" when waiting for the lock for a long time.
+ */
+ if (pg_atomic_read_u64(&MyProc->waitStart) == 0)
+ pg_atomic_write_u64(&MyProc->waitStart, now);
+
+ if (now >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /*
+ * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting
+ * "waiting" in PS display by disabling its argument report_waiting
+ * because the caller, WaitOnLock(), has already reported that.
+ */
+ ResolveRecoveryConflictWithVirtualXIDs(backends,
+ PROCSIG_RECOVERY_CONFLICT_LOCK,
+ PG_WAIT_LOCK | locktag.locktag_type,
+ false);
+ }
+ else
+ {
+ /*
+ * Wait (or wait again) until ltime, and check for deadlocks as well
+ * if we will be waiting longer than deadlock_timeout
+ */
+ /* at most two timeouts: the limit time (if finite) and the deadlock check */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ got_standby_lock_timeout = false;
+ timeouts[cnt].id = STANDBY_LOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Both branches above fall through to this wait; in the "already behind"
+ * branch this call has not enabled any timeout.
+ */
+ /* Wait to be signaled by the release of the Relation Lock */
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+
+ /*
+ * Exit if ltime is reached. Then all the backends holding conflicting
+ * locks will be canceled in the next ResolveRecoveryConflictWithLock()
+ * call.
+ */
+ if (got_standby_lock_timeout)
+ goto cleanup;
+
+ if (got_standby_deadlock_timeout)
+ {
+ VirtualTransactionId *backends;
+
+ backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL);
+
+ /* Quick exit if there's no work to be done */
+ if (!VirtualTransactionIdIsValid(*backends))
+ goto cleanup;
+
+ /*
+ * Send signals to all the backends holding the conflicting locks, to
+ * ask them to check themselves for deadlocks.
+ */
+ while (VirtualTransactionIdIsValid(*backends))
+ {
+ SignalVirtualTransaction(*backends,
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK,
+ false);
+ backends++;
+ }
+
+ /*
+ * Exit if the recovery conflict has not been logged yet even though
+ * logging is enabled, so that the caller can log that. Then
+ * ResolveRecoveryConflictWithLock() is called again and we will wait
+ * again for the lock to be released.
+ */
+ if (logging_conflict)
+ goto cleanup;
+
+ /*
+ * Wait again here to be signaled by the release of the Relation Lock,
+ * to prevent the subsequent ResolveRecoveryConflictWithLock() from
+ * causing deadlock_timeout and sending a request for deadlocks check
+ * again. Otherwise the request continues to be sent every
+ * deadlock_timeout until the relation locks are released or ltime is
+ * reached.
+ */
+ got_standby_deadlock_timeout = false;
+ ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type);
+ }
+
+cleanup:
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other outstanding timeouts than those
+ * used by this function. If that stops being true, we could cancel the
+ * timeouts individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ /* leave both flags clear for the next call */
+ got_standby_lock_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+/*
+ * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup()
+ * to resolve conflicts with other backends holding buffer pins.
+ *
+ * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup()
+ * (when not InHotStandby) is performed here, for code clarity.
+ *
+ * We either resolve conflicts immediately or set a timeout to wake us at
+ * the limit of our patience.
+ *
+ * Resolve conflicts by sending a PROCSIG signal to all backends to check if
+ * they hold one of the buffer pins that is blocking Startup process. If so,
+ * those backends will take an appropriate error action, ERROR or FATAL.
+ *
+ * We also must check for deadlocks. Deadlocks occur because if queries
+ * wait on a lock, that must be behind an AccessExclusiveLock, which can only
+ * be cleared if the Startup process replays a transaction completion record.
+ * If Startup process is also waiting then that is a deadlock. The deadlock
+ * can occur if the query is waiting and then the Startup sleeps, or if
+ * Startup is sleeping and the query waits on a lock. We protect against
+ * only the former sequence here, the latter sequence is checked prior to
+ * the query sleeping, in CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout.
+ */
+void
+ResolveRecoveryConflictWithBufferPin(void)
+{
+ TimestampTz ltime;
+
+ Assert(InHotStandby);
+
+ /* ltime is 0 when we are willing to wait forever (see GetStandbyLimitTime) */
+ ltime = GetStandbyLimitTime();
+
+ if (GetCurrentTimestamp() >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
+ else
+ {
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout
+ */
+ /* at most two timeouts: the limit time (if finite) and the deadlock check */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ timeouts[cnt].id = STANDBY_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+ * by one of the timeouts established above.
+ *
+ * We assume that only UnpinBuffer() and the timeout requests established
+ * above can wake us up here. WakeupRecovery() called by walreceiver or
+ * SIGHUP signal handler, etc cannot do that because it uses the different
+ * latch from that ProcWaitForSignal() waits on.
+ */
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ /*
+ * The limit-time flag is tested first, so if both timeouts have fired the
+ * cancel request is sent rather than the deadlock-check request.
+ */
+ if (got_standby_delay_timeout)
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ else if (got_standby_deadlock_timeout)
+ {
+ /*
+ * Send out a request for hot-standby backends to check themselves for
+ * deadlocks.
+ *
+ * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+ * to be signaled by UnpinBuffer() again and send a request for
+ * deadlocks check if deadlock_timeout happens. This causes the
+ * request to continue to be sent every deadlock_timeout until the
+ * buffer is unpinned or ltime is reached. This would increase the
+ * workload in the startup process and backends. In practice it may
+ * not be so harmful because the period that the buffer is kept pinned
+ * is basically not so long. But we should fix this?
+ */
+ SendRecoveryConflictWithBufferPin(
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ }
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other timeouts than what this function
+ * uses. If that stops being true, we could cancel the timeouts
+ * individually, but that'd be slower.
+ */
+ disable_all_timeouts(false);
+ /* leave both flags clear for the next call */
+ got_standby_delay_timeout = false;
+ got_standby_deadlock_timeout = false;
+}
+
+/*
+ * SendRecoveryConflictWithBufferPin
+ * Pass the given conflict reason to CancelDBBackends() with InvalidOid as
+ * the database.
+ *
+ * reason must be PROCSIG_RECOVERY_CONFLICT_BUFFERPIN or
+ * PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK.
+ */
+static void
+SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
+{
+ Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN ||
+ reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+
+ /*
+ * We send signal to all backends to ask them if they are holding the
+ * buffer pin which is delaying the Startup process. We must not set the
+ * conflict flag yet, since most backends will be innocent. Let the
+ * SIGUSR1 handling in each backend decide their own fate.
+ */
+ /* NOTE(review): the final "false" presumably is what leaves the conflict flag unset; confirm in CancelDBBackends() */
+ CancelDBBackends(InvalidOid, reason, false);
+}
+
+/*
+ * In Hot Standby perform early deadlock detection. We abort the lock
+ * wait if we are about to sleep while holding the buffer pin that Startup
+ * process is waiting for.
+ *
+ * Note: this code is pessimistic, because there is no way for it to
+ * determine whether an actual deadlock condition is present: the lock we
+ * need to wait for might be unrelated to any held by the Startup process.
+ * Sooner or later, this mechanism should get ripped out in favor of somehow
+ * accounting for buffer locks in DeadLockCheck(). However, errors here
+ * seem to be very low-probability in practice, so for now it's not worth
+ * the trouble.
+ */
+void
+CheckRecoveryConflictDeadlock(void)
+{
+	Assert(!InRecovery);		/* do not call in Startup process */
+
+	if (HoldingBufferPinThatDelaysRecovery())
+	{
+		/*
+		 * The error message should match ProcessInterrupts(), but that
+		 * function is not called because no interrupt is being handled at
+		 * this point.  Only the current transaction is canceled here, so if
+		 * we are in a subtransaction and the pin is held by a parent, the
+		 * Startup process will continue to wait even though we have avoided
+		 * deadlock.
+		 */
+		ereport(ERROR,
+				(errcode(ERRCODE_T_R_DEADLOCK_DETECTED),
+				 errmsg("canceling statement due to conflict with recovery"),
+				 errdetail("User transaction caused buffer deadlock with recovery.")));
+	}
+}
+
+
+/* --------------------------------
+ * timeout handler routines
+ * --------------------------------
+ */
+
+/*
+ * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is
+ * exceeded.
+ */
+void
+StandbyDeadLockHandler(void)
+{
+ /*
+ * Only record that the timeout fired.  The flag is examined after
+ * ProcWaitForSignal() returns in ResolveRecoveryConflictWithLock() and
+ * ResolveRecoveryConflictWithBufferPin().
+ */
+ got_standby_deadlock_timeout = true;
+}
+
+/*
+ * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded.
+ */
+void
+StandbyTimeoutHandler(void)
+{
+ /*
+ * Only record that the timeout fired.  The flag is examined after
+ * ProcWaitForSignal() returns in ResolveRecoveryConflictWithBufferPin().
+ */
+ got_standby_delay_timeout = true;
+}
+
+/*
+ * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded.
+ */
+void
+StandbyLockTimeoutHandler(void)
+{
+ /*
+ * Only record that the timeout fired.  The flag is examined after
+ * ProcWaitForSignal() returns in ResolveRecoveryConflictWithLock().
+ */
+ got_standby_lock_timeout = true;
+}
+
+/*
+ * -----------------------------------------------------
+ * Locking in Recovery Mode
+ * -----------------------------------------------------
+ *
+ * All locks are held by the Startup process using a single virtual
+ * transaction. This implementation is both simpler and in some senses,
+ * more correct. The locks held mean "some original transaction held
+ * this lock, so query access is not allowed at this time". So the Startup
+ * process is the proxy by which the original locks are implemented.
+ *
+ * We only keep track of AccessExclusiveLocks, which are only ever held by
+ * one transaction on one relation.
+ *
+ * We keep a hash table of lists of locks in local memory keyed by xid,
+ * RecoveryLockLists, so we can keep track of the various entries made by
+ * the Startup process's virtual xid in the shared lock table.
+ *
+ * List elements use type xl_standby_lock, since the WAL record type exactly
+ * matches the information that we need to keep track of.
+ *
+ * We use session locks rather than normal locks so we don't need
+ * ResourceOwners.
+ */
+
+
+/*
+ * StandbyAcquireAccessExclusiveLock
+ *		Record an AccessExclusiveLock on relation relOid of database dbOid in
+ *		the lock list kept for xid, then acquire that lock.
+ *
+ * Nothing is done if xid is invalid or is already known to have committed
+ * or aborted.
+ */
+void
+StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid)
+{
+	RecoveryLockListsEntry *xidEntry;
+	xl_standby_lock *lockRec;
+	LOCKTAG		tag;
+	bool		alreadyTracked;
+
+	/* Already processed? */
+	if (!TransactionIdIsValid(xid))
+		return;
+	if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid))
+		return;
+
+	elog(trace_recovery(DEBUG4),
+		 "adding recovery lock: db %u rel %u", dbOid, relOid);
+
+	/* Only relOid must be valid: dbOid is InvalidOid for a shared relation. */
+	Assert(OidIsValid(relOid));
+
+	/* Find this xid's lock list, starting an empty one if it has none yet. */
+	xidEntry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &alreadyTracked);
+	if (!alreadyTracked)
+	{
+		xidEntry->xid = xid;
+		xidEntry->locks = NIL;
+	}
+
+	/* Remember the lock before acquiring it. */
+	lockRec = palloc(sizeof(xl_standby_lock));
+	lockRec->xid = xid;
+	lockRec->dbOid = dbOid;
+	lockRec->relOid = relOid;
+	xidEntry->locks = lappend(xidEntry->locks, lockRec);
+
+	SET_LOCKTAG_RELATION(tag, dbOid, relOid);
+
+	(void) LockAcquire(&tag, AccessExclusiveLock, true, false);
+}
+
+/*
+ * StandbyReleaseLockList
+ *		Release the AccessExclusiveLock described by each xl_standby_lock in
+ *		the list, then free the list together with its elements.
+ */
+static void
+StandbyReleaseLockList(List *locks)
+{
+	ListCell   *cell;
+
+	foreach(cell, locks)
+	{
+		xl_standby_lock *held = (xl_standby_lock *) lfirst(cell);
+		LOCKTAG		tag;
+		bool		released;
+
+		elog(trace_recovery(DEBUG4),
+			 "releasing recovery lock: xid %u db %u rel %u",
+			 held->xid, held->dbOid, held->relOid);
+		SET_LOCKTAG_RELATION(tag, held->dbOid, held->relOid);
+
+		released = LockRelease(&tag, AccessExclusiveLock, true);
+		if (released)
+			continue;
+
+		/* Our bookkeeping disagrees with the lock manager: complain. */
+		elog(LOG,
+			 "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u",
+			 held->xid, held->dbOid, held->relOid);
+		Assert(false);
+	}
+
+	list_free_deep(locks);
+}
+
+/*
+ * Release the locks tracked for one xid and drop its hash table entry.
+ * An invalid xid means "release everything".  An xid with no entry is
+ * silently ignored.
+ */
+static void
+StandbyReleaseLocks(TransactionId xid)
+{
+	RecoveryLockListsEntry *xidLocks;
+
+	if (!TransactionIdIsValid(xid))
+	{
+		StandbyReleaseAllLocks();
+		return;
+	}
+
+	xidLocks = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL);
+	if (xidLocks != NULL)
+	{
+		StandbyReleaseLockList(xidLocks->locks);
+		hash_search(RecoveryLockLists, xidLocks, HASH_REMOVE, NULL);
+	}
+}
+
+/*
+ * Release the RecoveryLockLists locks of a whole transaction tree: the
+ * top-level xid first, then each of its nsubxids subtransaction xids.
+ *
+ * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode,
+ * to remove any AccessExclusiveLocks requested by a transaction.
+ */
+void
+StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids)
+{
+	int			idx = 0;
+
+	StandbyReleaseLocks(xid);
+
+	while (idx < nsubxids)
+	{
+		StandbyReleaseLocks(subxids[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Release every lock tracked in RecoveryLockLists and empty the table.
+ *
+ * Called at end of recovery and when we see a shutdown checkpoint.
+ */
+void
+StandbyReleaseAllLocks(void)
+{
+	HASH_SEQ_STATUS scan;
+	RecoveryLockListsEntry *cur;
+
+	elog(trace_recovery(DEBUG2), "release all standby locks");
+
+	hash_seq_init(&scan, RecoveryLockLists);
+	for (cur = hash_seq_search(&scan);
+		 cur != NULL;
+		 cur = hash_seq_search(&scan))
+	{
+		/* Drop the locks first, then the entry the scan just returned. */
+		StandbyReleaseLockList(cur->locks);
+		hash_search(RecoveryLockLists, cur, HASH_REMOVE, NULL);
+	}
+}
+
+/*
+ * StandbyReleaseOldLocks
+ *		Release standby locks held by top-level XIDs that precede oldxid,
+ *		except those belonging to prepared transactions.
+ */
+void
+StandbyReleaseOldLocks(TransactionId oldxid)
+{
+	HASH_SEQ_STATUS scan;
+	RecoveryLockListsEntry *cur;
+
+	hash_seq_init(&scan, RecoveryLockLists);
+	for (cur = hash_seq_search(&scan);
+		 cur != NULL;
+		 cur = hash_seq_search(&scan))
+	{
+		Assert(TransactionIdIsValid(cur->xid));
+
+		/*
+		 * Only entries that are not prepared transactions and whose xid is
+		 * older than oldxid are released; everything else is left alone.
+		 */
+		if (!StandbyTransactionIdIsPrepared(cur->xid) &&
+			TransactionIdPrecedes(cur->xid, oldxid))
+		{
+			/* Remove all locks and the hash table entry. */
+			StandbyReleaseLockList(cur->locks);
+			hash_search(RecoveryLockLists, cur, HASH_REMOVE, NULL);
+		}
+	}
+}
+
+/*
+ * --------------------------------------------------------------------
+ * Recovery handling for Rmgr RM_STANDBY_ID
+ *
+ * These record types will only be created if XLogStandbyInfoActive()
+ * --------------------------------------------------------------------
+ */
+
+/*
+ * Replay one RM_STANDBY_ID WAL record.  Does nothing unless hot standby
+ * state tracking is active; an unrecognized op code is a PANIC.
+ */
+void
+standby_redo(XLogReaderState *record)
+{
+	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+
+	/* Backup blocks are not used in standby records */
+	Assert(!XLogRecHasAnyBlockRefs(record));
+
+	/* Do nothing if we're not in hot standby mode */
+	if (standbyState == STANDBY_DISABLED)
+		return;
+
+	switch (info)
+	{
+		case XLOG_STANDBY_LOCK:
+			{
+				/* Re-acquire each logged AccessExclusiveLock. */
+				xl_standby_locks *lockrec;
+				int			n;
+
+				lockrec = (xl_standby_locks *) XLogRecGetData(record);
+				for (n = 0; n < lockrec->nlocks; n++)
+					StandbyAcquireAccessExclusiveLock(lockrec->locks[n].xid,
+													  lockrec->locks[n].dbOid,
+													  lockrec->locks[n].relOid);
+				break;
+			}
+
+		case XLOG_RUNNING_XACTS:
+			{
+				/* Rebuild the running-xacts description and apply it. */
+				xl_running_xacts *xactrec;
+				RunningTransactionsData running;
+
+				xactrec = (xl_running_xacts *) XLogRecGetData(record);
+				running.xcnt = xactrec->xcnt;
+				running.subxcnt = xactrec->subxcnt;
+				running.subxid_overflow = xactrec->subxid_overflow;
+				running.nextXid = xactrec->nextXid;
+				running.latestCompletedXid = xactrec->latestCompletedXid;
+				running.oldestRunningXid = xactrec->oldestRunningXid;
+				running.xids = xactrec->xids;
+
+				ProcArrayApplyRecoveryInfo(&running);
+				break;
+			}
+
+		case XLOG_INVALIDATIONS:
+			{
+				/* Hand the logged invalidation messages on for processing. */
+				xl_invalidations *invalrec;
+
+				invalrec = (xl_invalidations *) XLogRecGetData(record);
+				ProcessCommittedInvalidationMessages(invalrec->msgs,
+													 invalrec->nmsgs,
+													 invalrec->relcacheInitFileInval,
+													 invalrec->dbId,
+													 invalrec->tsId);
+				break;
+			}
+
+		default:
+			elog(PANIC, "standby_redo: unknown op code %u", info);
+	}
+}
+
+/*
+ * Log details of the current snapshot to WAL. This allows the snapshot state
+ * to be reconstructed on the standby and for logical decoding.
+ *
+ * This is used for Hot Standby as follows:
+ *
+ * We can move directly to STANDBY_SNAPSHOT_READY at startup if we
+ * start from a shutdown checkpoint because we know nothing was running
+ * at that time and our recovery snapshot is known empty. In the more
+ * typical case of an online checkpoint we need to jump through a few
+ * hoops to get a correct recovery snapshot and this requires a two or
+ * sometimes a three stage process.
+ *
+ * The initial snapshot must contain all running xids and all current
+ * AccessExclusiveLocks at a point in time on the standby. Assembling
+ * that information while the server is running requires many and
+ * various LWLocks, so we choose to derive that information piece by
+ * piece and then re-assemble that info on the standby. When that
+ * information is fully assembled we move to STANDBY_SNAPSHOT_READY.
+ *
+ * Since locking on the primary when we derive the information is not
+ * strict, we note that there is a time window between the derivation and
+ * writing to WAL of the derived information. That allows race conditions
+ * that we must resolve, since xids and locks may enter or leave the
+ * snapshot during that window. This creates the issue that an xid or
+ * lock may start *after* the snapshot has been derived yet *before* the
+ * snapshot is logged in the running xacts WAL record. We resolve this by
+ * starting to accumulate changes at a point just prior to when we derive
+ * the snapshot on the primary, then ignore duplicates when we later apply
+ * the snapshot from the running xacts record. This is implemented during
+ * CreateCheckpoint() where we use the logical checkpoint location as
+ * our starting point and then write the running xacts record immediately
+ * before writing the main checkpoint WAL record. Since we always start
+ * up from a checkpoint and are immediately at our starting point, we
+ * unconditionally move to STANDBY_INITIALIZED. After this point we
+ * must do 4 things:
+ * * move shared nextXid forwards as we see new xids
+ * * extend the clog and subtrans with each new xid
+ * * keep track of uncommitted known assigned xids
+ * * keep track of uncommitted AccessExclusiveLocks
+ *
+ * When we see a commit/abort we must remove known assigned xids and locks
+ * from the completing transaction. Attempted removals that cannot locate
+ * an entry are expected and must not cause an error when we are in state
+ * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and
+ * KnownAssignedXidsRemove().
+ *
+ * Later, when we apply the running xact data we must be careful to ignore
+ * transactions already committed, since those commits raced ahead when
+ * making WAL entries.
+ *
+ * The loose timing also means that locks may be recorded that have a
+ * zero xid, since xids are removed from procs before locks are removed.
+ * So we must prune the lock list down to ensure we hold locks only for
+ * currently running xids, performed by StandbyReleaseOldLocks().
+ * Zero xids should no longer be possible, but we may be replaying WAL
+ * from a time when they were possible.
+ *
+ * For logical decoding only the running xacts information is needed;
+ * there's no need to look at the locking information, but it's logged anyway,
+ * as there's no independent knob to just enable logical decoding. For
+ * details of how this is used, check snapbuild.c's introductory comment.
+ *
+ *
+ * Returns the RecPtr of the last inserted record.
+ */
+XLogRecPtr
+LogStandbySnapshot(void)
+{
+ /*
+ * Overview: first log the currently held AccessExclusiveLocks (only if
+ * there are any), then log the running-transactions data. The value
+ * returned is the one produced by LogCurrentRunningXacts().
+ */
+ XLogRecPtr recptr;
+ RunningTransactions running;
+ xl_standby_lock *locks;
+ int nlocks;
+
+ Assert(XLogStandbyInfoActive());
+
+ /*
+ * Get details of any AccessExclusiveLocks being held at the moment.
+ */
+ locks = GetRunningTransactionLocks(&nlocks);
+ if (nlocks > 0)
+ LogAccessExclusiveLocks(nlocks, locks);
+ /* The array is freed whether or not a lock record was written. */
+ pfree(locks);
+
+ /*
+ * Log details of all in-progress transactions. This should be the last
+ * record we write, because standby will open up when it sees this.
+ */
+ running = GetRunningTransactionData();
+
+ /*
+ * GetRunningTransactionData() acquired ProcArrayLock, we must release it.
+ * For Hot Standby this can be done before inserting the WAL record
+ * because ProcArrayApplyRecoveryInfo() rechecks the commit status using
+ * the clog. For logical decoding, though, the lock can't be released
+ * early because the clog might be "in the future" from the POV of the
+ * historic snapshot. This would allow for situations where we're waiting
+ * for the end of a transaction listed in the xl_running_xacts record
+ * which, according to the WAL, has committed before the xl_running_xacts
+ * record. Fortunately this routine isn't executed frequently, and it's
+ * only a shared lock.
+ */
+ /* Below logical WAL level: release before the record is inserted. */
+ if (wal_level < WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ recptr = LogCurrentRunningXacts(running);
+
+ /* Release lock if we kept it longer ... */
+ /* Exactly one of the two ProcArrayLock releases runs on any call. */
+ if (wal_level >= WAL_LEVEL_LOGICAL)
+ LWLockRelease(ProcArrayLock);
+
+ /* GetRunningTransactionData() acquired XidGenLock, we must release it */
+ LWLockRelease(XidGenLock);
+
+ return recptr;
+}
+
+/*
+ * Write an XLOG_RUNNING_XACTS record describing CurrRunningXacts and
+ * return the value XLogInsert() gave back for it.
+ *
+ * RunningTransactionsData and xl_running_xacts look alike, but they are
+ * kept separate: the WAL form is one contiguous chunk that only exists in
+ * full once it has been assembled in WAL.  The record is flagged as
+ * unimportant for durability, to avoid triggering superfluous checkpoint /
+ * archiving activity.
+ */
+static XLogRecPtr
+LogCurrentRunningXacts(RunningTransactions CurrRunningXacts)
+{
+	xl_running_xacts hdr;
+	XLogRecPtr	lsn;
+
+	/* Fixed-size part of the record, copied field by field. */
+	hdr.xcnt = CurrRunningXacts->xcnt;
+	hdr.subxcnt = CurrRunningXacts->subxcnt;
+	hdr.subxid_overflow = CurrRunningXacts->subxid_overflow;
+	hdr.nextXid = CurrRunningXacts->nextXid;
+	hdr.oldestRunningXid = CurrRunningXacts->oldestRunningXid;
+	hdr.latestCompletedXid = CurrRunningXacts->latestCompletedXid;
+
+	XLogBeginInsert();
+	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+	XLogRegisterData((char *) &hdr, MinSizeOfXactRunningXacts);
+
+	/* Variable-size part: top-level xids followed by subxids, if any. */
+	if (hdr.xcnt > 0)
+	{
+		XLogRegisterData((char *) CurrRunningXacts->xids,
+						 (hdr.xcnt + hdr.subxcnt) * sizeof(TransactionId));
+	}
+
+	lsn = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS);
+
+	/* Debug trace; the wording depends on whether subxids overflowed. */
+	if (!CurrRunningXacts->subxid_overflow)
+	{
+		elog(trace_recovery(DEBUG2),
+			 "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+			 CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt,
+			 LSN_FORMAT_ARGS(lsn),
+			 CurrRunningXacts->oldestRunningXid,
+			 CurrRunningXacts->latestCompletedXid,
+			 CurrRunningXacts->nextXid);
+	}
+	else
+	{
+		elog(trace_recovery(DEBUG2),
+			 "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)",
+			 CurrRunningXacts->xcnt,
+			 LSN_FORMAT_ARGS(lsn),
+			 CurrRunningXacts->oldestRunningXid,
+			 CurrRunningXacts->latestCompletedXid,
+			 CurrRunningXacts->nextXid);
+	}
+
+	/*
+	 * Make sure the record reaches disk reasonably soon without stalling
+	 * here (i.e. without XLogFlush()).  XLogSetAsyncXactLSN() marks the LSN
+	 * as to-be-synced and nudges the WALWriter into action if sleeping.
+	 * See XLogBackgroundFlush() for why a record might otherwise not be
+	 * flushed.
+	 */
+	XLogSetAsyncXactLSN(lsn);
+
+	return lsn;
+}
+
+/*
+ * Write a single XLOG_STANDBY_LOCK record carrying 'nlocks' entries from
+ * the 'locks' array.  Only AccessExclusiveLocks are logged; other lock
+ * types need not be, as described in backend/storage/lmgr/README.
+ */
+static void
+LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks)
+{
+	xl_standby_locks hdr;
+
+	hdr.nlocks = nlocks;
+
+	XLogBeginInsert();
+
+	/* Header up to (not including) the array, then the array itself. */
+	XLogRegisterData((char *) &hdr, offsetof(xl_standby_locks, locks));
+	XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock));
+
+	XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
+
+	(void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK);
+}
+
+/*
+ * Log one AccessExclusiveLock on (dbOid, relOid) under the current
+ * transaction's xid, for use during LockAcquire().  Also flags the
+ * transaction as having acquired such a lock.
+ */
+void
+LogAccessExclusiveLock(Oid dbOid, Oid relOid)
+{
+	xl_standby_lock lockRec;
+
+	lockRec.xid = GetCurrentTransactionId();
+	lockRec.dbOid = dbOid;
+	lockRec.relOid = relOid;
+
+	/* A one-element array is just the address of the single record. */
+	LogAccessExclusiveLocks(1, &lockRec);
+
+	MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK;
+}
+
+/*
+ * Prepare to log an AccessExclusiveLock, for use during LockAcquire().
+ *
+ * All this does is make sure the transaction has a TransactionId.  There
+ * are two reasons, both about lock release on the standby:
+ *
+ * 1. With an xid assigned, RecordTransactionCommit() and
+ *    RecordTransactionAbort() do not optimise away the transaction
+ *    completion record which recovery relies upon to release locks.  It's
+ *    a hack, but for a corner case not worth adding code for into the main
+ *    commit path.
+ *
+ * 2. The xid must exist before the lock is recorded in shared memory,
+ *    otherwise a concurrently executing GetRunningTransactionLocks() might
+ *    see a lock associated with an InvalidTransactionId, which we later
+ *    assert cannot happen.
+ */
+void
+LogAccessExclusiveLockPrepare(void)
+{
+	/* Called only for its side effect; the xid value itself is not needed. */
+	(void) GetCurrentTransactionId();
+}
+
+/*
+ * Emit an XLOG_INVALIDATIONS record holding 'nmsgs' invalidation messages
+ * for the current database.  This currently is only used for commits
+ * without an xid but which contain invalidations.
+ */
+void
+LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs,
+						bool relcacheInitFileInval)
+{
+	xl_invalidations hdr;
+
+	/* Zero the whole struct first, then fill in the fixed header fields. */
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.nmsgs = nmsgs;
+	hdr.relcacheInitFileInval = relcacheInitFileInval;
+	hdr.dbId = MyDatabaseId;
+	hdr.tsId = MyDatabaseTableSpace;
+
+	/* Header followed by the message array, inserted as one record. */
+	XLogBeginInsert();
+	XLogRegisterData((char *) &hdr, MinSizeOfInvalidations);
+	XLogRegisterData((char *) msgs,
+					 nmsgs * sizeof(SharedInvalidationMessage));
+	XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS);
+}
+
+/*
+ * Map a recovery-conflict signal reason to its translated description.
+ * Any reason not listed below yields the generic "unknown reason" text.
+ */
+static const char *
+get_recovery_conflict_desc(ProcSignalReason reason)
+{
+	switch (reason)
+	{
+		case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN:
+			return _("recovery conflict on buffer pin");
+
+		case PROCSIG_RECOVERY_CONFLICT_LOCK:
+			return _("recovery conflict on lock");
+
+		case PROCSIG_RECOVERY_CONFLICT_TABLESPACE:
+			return _("recovery conflict on tablespace");
+
+		case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT:
+			return _("recovery conflict on snapshot");
+
+		case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK:
+			return _("recovery conflict on buffer deadlock");
+
+		case PROCSIG_RECOVERY_CONFLICT_DATABASE:
+			return _("recovery conflict on database");
+
+		default:
+			break;
+	}
+
+	return _("unknown reason");
+}