author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-04 12:15:05 +0000
commit     46651ce6fe013220ed397add242004d764fc0153 (patch)
tree       6e5299f990f88e60174a1d3ae6e48eedd2688b2b /src/backend/storage/ipc
parent     Initial commit. (diff)
Adding upstream version 14.5.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage/ipc')
-rw-r--r--  src/backend/storage/ipc/Makefile          30
-rw-r--r--  src/backend/storage/ipc/barrier.c        333
-rw-r--r--  src/backend/storage/ipc/dsm.c           1248
-rw-r--r--  src/backend/storage/ipc/dsm_impl.c      1058
-rw-r--r--  src/backend/storage/ipc/ipc.c            435
-rw-r--r--  src/backend/storage/ipc/ipci.c           291
-rw-r--r--  src/backend/storage/ipc/latch.c         2158
-rw-r--r--  src/backend/storage/ipc/pmsignal.c       430
-rw-r--r--  src/backend/storage/ipc/procarray.c     5220
-rw-r--r--  src/backend/storage/ipc/procsignal.c     685
-rw-r--r--  src/backend/storage/ipc/shm_mq.c        1288
-rw-r--r--  src/backend/storage/ipc/shm_toc.c        272
-rw-r--r--  src/backend/storage/ipc/shmem.c          611
-rw-r--r--  src/backend/storage/ipc/shmqueue.c       190
-rw-r--r--  src/backend/storage/ipc/signalfuncs.c    300
-rw-r--r--  src/backend/storage/ipc/sinval.c         205
-rw-r--r--  src/backend/storage/ipc/sinvaladt.c      777
-rw-r--r--  src/backend/storage/ipc/standby.c       1450
18 files changed, 16981 insertions(+), 0 deletions(-)
diff --git a/src/backend/storage/ipc/Makefile b/src/backend/storage/ipc/Makefile new file mode 100644 index 0000000..df90c6b --- /dev/null +++ b/src/backend/storage/ipc/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for storage/ipc +# +# src/backend/storage/ipc/Makefile +# + +subdir = src/backend/storage/ipc +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global + +OBJS = \ + barrier.o \ + dsm.o \ + dsm_impl.o \ + ipc.o \ + ipci.o \ + latch.o \ + pmsignal.o \ + procarray.o \ + procsignal.o \ + shm_mq.o \ + shm_toc.o \ + shmem.o \ + shmqueue.o \ + signalfuncs.o \ + sinval.o \ + sinvaladt.o \ + standby.o + +include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/ipc/barrier.c b/src/backend/storage/ipc/barrier.c new file mode 100644 index 0000000..5c05297 --- /dev/null +++ b/src/backend/storage/ipc/barrier.c @@ -0,0 +1,333 @@ +/*------------------------------------------------------------------------- + * + * barrier.c + * Barriers for synchronizing cooperating processes. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * From Wikipedia[1]: "In parallel computing, a barrier is a type of + * synchronization method. A barrier for a group of threads or processes in + * the source code means any thread/process must stop at this point and cannot + * proceed until all other threads/processes reach this barrier." + * + * This implementation of barriers allows for static sets of participants + * known up front, or dynamic sets of participants which processes can join or + * leave at any time. In the dynamic case, a phase number can be used to + * track progress through a parallel algorithm, and may be necessary to + * synchronize with the current phase of a multi-phase algorithm when a new + * participant joins. In the static case, the phase number is used + * internally, but it isn't strictly necessary for client code to access it + * because the phase can only advance when the declared number of participants + * reaches the barrier, so client code should be in no doubt about the current + * phase of computation at all times. + * + * Consider a parallel algorithm that involves separate phases of computation + * A, B and C where the output of each phase is needed before the next phase + * can begin. + * + * In the case of a static barrier initialized with 4 participants, each + * participant works on phase A, then calls BarrierArriveAndWait to wait until + * all 4 participants have reached that point. When BarrierArriveAndWait + * returns control, each participant can work on B, and so on. Because the + * barrier knows how many participants to expect, the phases of computation + * don't need labels or numbers, since each process's program counter implies + * the current phase. Even if some of the processes are slow to start up and + * begin running phase A, the other participants are expecting them and will + * patiently wait at the barrier. The code could be written as follows: + * + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * + * If the number of participants is not known up front, then a dynamic barrier + * is needed and the number should be set to zero at initialization. 
New + * complications arise because the number necessarily changes over time as + * participants attach and detach, and therefore phases B, C or even the end + * of processing may be reached before any given participant has started + * running and attached. Therefore the client code must perform an initial + * test of the phase number after attaching, because it needs to find out + * which phase of the algorithm has been reached by any participants that are + * already attached in order to synchronize with that work. Once the program + * counter or some other representation of current progress is synchronized + * with the barrier's phase, normal control flow can be used just as in the + * static case. Our example could be written using a switch statement with + * cases that fall-through, as follows: + * + * phase = BarrierAttach(&barrier); + * switch (phase) + * { + * case PHASE_A: + * perform_a(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_B: + * perform_b(); + * BarrierArriveAndWait(&barrier, ...); + * case PHASE_C: + * perform_c(); + * BarrierArriveAndWait(&barrier, ...); + * } + * BarrierDetach(&barrier); + * + * Static barriers behave similarly to POSIX's pthread_barrier_t. Dynamic + * barriers behave similarly to Java's java.util.concurrent.Phaser. + * + * [1] https://en.wikipedia.org/wiki/Barrier_(computer_science) + * + * IDENTIFICATION + * src/backend/storage/ipc/barrier.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" +#include "storage/barrier.h" + +static inline bool BarrierDetachImpl(Barrier *barrier, bool arrive); + +/* + * Initialize this barrier. To use a static party size, provide the number of + * participants to wait for at each phase indicating that that number of + * backends is implicitly attached. To use a dynamic party size, specify zero + * here and then use BarrierAttach() and + * BarrierDetach()/BarrierArriveAndDetach() to register and deregister + * participants explicitly. + */ +void +BarrierInit(Barrier *barrier, int participants) +{ + SpinLockInit(&barrier->mutex); + barrier->participants = participants; + barrier->arrived = 0; + barrier->phase = 0; + barrier->elected = 0; + barrier->static_party = participants > 0; + ConditionVariableInit(&barrier->condition_variable); +} + +/* + * Arrive at this barrier, wait for all other attached participants to arrive + * too and then return. Increments the current phase. The caller must be + * attached. + * + * While waiting, pg_stat_activity shows a wait_event_type and wait_event + * controlled by the wait_event_info passed in, which should be a value from + * one of the WaitEventXXX enums defined in pgstat.h. + * + * Return true in one arbitrarily chosen participant. Return false in all + * others. The return code can be used to elect one participant to execute a + * phase of work that must be done serially while other participants wait. 
+ */ +bool +BarrierArriveAndWait(Barrier *barrier, uint32 wait_event_info) +{ + bool release = false; + bool elected; + int start_phase; + int next_phase; + + SpinLockAcquire(&barrier->mutex); + start_phase = barrier->phase; + next_phase = start_phase + 1; + ++barrier->arrived; + if (barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + barrier->phase = next_phase; + barrier->elected = next_phase; + } + SpinLockRelease(&barrier->mutex); + + /* + * If we were the last expected participant to arrive, we can release our + * peers and return true to indicate that this backend has been elected to + * perform any serial work. + */ + if (release) + { + ConditionVariableBroadcast(&barrier->condition_variable); + + return true; + } + + /* + * Otherwise we have to wait for the last participant to arrive and + * advance the phase. + */ + elected = false; + ConditionVariablePrepareToSleep(&barrier->condition_variable); + for (;;) + { + /* + * We know that phase must either be start_phase, indicating that we + * need to keep waiting, or next_phase, indicating that the last + * participant that we were waiting for has either arrived or detached + * so that the next phase has begun. The phase cannot advance any + * further than that without this backend's participation, because + * this backend is attached. + */ + SpinLockAcquire(&barrier->mutex); + Assert(barrier->phase == start_phase || barrier->phase == next_phase); + release = barrier->phase == next_phase; + if (release && barrier->elected != next_phase) + { + /* + * Usually the backend that arrives last and releases the other + * backends is elected to return true (see above), so that it can + * begin processing serial work while it has a CPU timeslice. + * However, if the barrier advanced because someone detached, then + * one of the backends that is awoken will need to be elected. + */ + barrier->elected = barrier->phase; + elected = true; + } + SpinLockRelease(&barrier->mutex); + if (release) + break; + ConditionVariableSleep(&barrier->condition_variable, wait_event_info); + } + ConditionVariableCancelSleep(); + + return elected; +} + +/* + * Arrive at this barrier, but detach rather than waiting. Returns true if + * the caller was the last to detach. + */ +bool +BarrierArriveAndDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, true); +} + +/* + * Arrive at a barrier, and detach all but the last to arrive. Returns true if + * the caller was the last to arrive, and is therefore still attached. + */ +bool +BarrierArriveAndDetachExceptLast(Barrier *barrier) +{ + SpinLockAcquire(&barrier->mutex); + if (barrier->participants > 1) + { + --barrier->participants; + SpinLockRelease(&barrier->mutex); + + return false; + } + Assert(barrier->participants == 1); + ++barrier->phase; + SpinLockRelease(&barrier->mutex); + + return true; +} + +/* + * Attach to a barrier. All waiting participants will now wait for this + * participant to call BarrierArriveAndWait(), BarrierDetach() or + * BarrierArriveAndDetach(). Return the current phase. + */ +int +BarrierAttach(Barrier *barrier) +{ + int phase; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + ++barrier->participants; + phase = barrier->phase; + SpinLockRelease(&barrier->mutex); + + return phase; +} + +/* + * Detach from a barrier. This may release other waiters from + * BarrierArriveAndWait() and advance the phase if they were only waiting for + * this backend. Return true if this participant was the last to detach. 
+ */ +bool +BarrierDetach(Barrier *barrier) +{ + return BarrierDetachImpl(barrier, false); +} + +/* + * Return the current phase of a barrier. The caller must be attached. + */ +int +BarrierPhase(Barrier *barrier) +{ + /* + * It is OK to read barrier->phase without locking, because it can't + * change without us (we are attached to it), and we executed a memory + * barrier when we either attached or participated in changing it last + * time. + */ + return barrier->phase; +} + +/* + * Return an instantaneous snapshot of the number of participants currently + * attached to this barrier. For debugging purposes only. + */ +int +BarrierParticipants(Barrier *barrier) +{ + int participants; + + SpinLockAcquire(&barrier->mutex); + participants = barrier->participants; + SpinLockRelease(&barrier->mutex); + + return participants; +} + +/* + * Detach from a barrier. If 'arrive' is true then also increment the phase + * if there are no other participants. If there are other participants + * waiting, then the phase will be advanced and they'll be released if they + * were only waiting for the caller. Return true if this participant was the + * last to detach. + */ +static inline bool +BarrierDetachImpl(Barrier *barrier, bool arrive) +{ + bool release; + bool last; + + Assert(!barrier->static_party); + + SpinLockAcquire(&barrier->mutex); + Assert(barrier->participants > 0); + --barrier->participants; + + /* + * If any other participants are waiting and we were the last participant + * waited for, release them. If no other participants are waiting, but + * this is a BarrierArriveAndDetach() call, then advance the phase too. + */ + if ((arrive || barrier->participants > 0) && + barrier->arrived == barrier->participants) + { + release = true; + barrier->arrived = 0; + ++barrier->phase; + } + else + release = false; + + last = barrier->participants == 0; + SpinLockRelease(&barrier->mutex); + + if (release) + ConditionVariableBroadcast(&barrier->condition_variable); + + return last; +} diff --git a/src/backend/storage/ipc/dsm.c b/src/backend/storage/ipc/dsm.c new file mode 100644 index 0000000..b461a5f --- /dev/null +++ b/src/backend/storage/ipc/dsm.c @@ -0,0 +1,1248 @@ +/*------------------------------------------------------------------------- + * + * dsm.c + * manage dynamic shared memory segments + * + * This file provides a set of services to make programming with dynamic + * shared memory segments more convenient. Unlike the low-level + * facilities provided by dsm_impl.h and dsm_impl.c, mappings and segments + * created using this module will be cleaned up automatically. Mappings + * will be removed when the resource owner under which they were created + * is cleaned up, unless dsm_pin_mapping() is used, in which case they + * have session lifespan. Segments will be removed when there are no + * remaining mappings, or at postmaster shutdown in any case. After a + * hard postmaster crash, remaining segments will be removed, if they + * still exist, at the next postmaster startup. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> + +#include "lib/ilist.h" +#include "miscadmin.h" +#include "port/pg_bitutils.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "utils/freepage.h" +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/resowner_private.h" + +#define PG_DYNSHMEM_CONTROL_MAGIC 0x9a503d32 + +#define PG_DYNSHMEM_FIXED_SLOTS 64 +#define PG_DYNSHMEM_SLOTS_PER_BACKEND 5 + +#define INVALID_CONTROL_SLOT ((uint32) -1) + +/* Backend-local tracking for on-detach callbacks. */ +typedef struct dsm_segment_detach_callback +{ + on_dsm_detach_callback function; + Datum arg; + slist_node node; +} dsm_segment_detach_callback; + +/* Backend-local state for a dynamic shared memory segment. */ +struct dsm_segment +{ + dlist_node node; /* List link in dsm_segment_list. */ + ResourceOwner resowner; /* Resource owner. */ + dsm_handle handle; /* Segment name. */ + uint32 control_slot; /* Slot in control segment. */ + void *impl_private; /* Implementation-specific private data. */ + void *mapped_address; /* Mapping address, or NULL if unmapped. */ + Size mapped_size; /* Size of our mapping. */ + slist_head on_detach; /* On-detach callbacks. */ +}; + +/* Shared-memory state for a dynamic shared memory segment. */ +typedef struct dsm_control_item +{ + dsm_handle handle; + uint32 refcnt; /* 2+ = active, 1 = moribund, 0 = gone */ + size_t first_page; + size_t npages; + void *impl_private_pm_handle; /* only needed on Windows */ + bool pinned; +} dsm_control_item; + +/* Layout of the dynamic shared memory control segment. */ +typedef struct dsm_control_header +{ + uint32 magic; + uint32 nitems; + uint32 maxitems; + dsm_control_item item[FLEXIBLE_ARRAY_MEMBER]; +} dsm_control_header; + +static void dsm_cleanup_for_mmap(void); +static void dsm_postmaster_shutdown(int code, Datum arg); +static dsm_segment *dsm_create_descriptor(void); +static bool dsm_control_segment_sane(dsm_control_header *control, + Size mapped_size); +static uint64 dsm_control_bytes_needed(uint32 nitems); +static inline dsm_handle make_main_region_dsm_handle(int slot); +static inline bool is_main_region_dsm_handle(dsm_handle handle); + +/* Has this backend initialized the dynamic shared memory system yet? */ +static bool dsm_init_done = false; + +/* Preallocated DSM space in the main shared memory region. */ +static void *dsm_main_space_begin = NULL; + +/* + * List of dynamic shared memory segments used by this backend. + * + * At process exit time, we must decrement the reference count of each + * segment we have attached; this list makes it possible to find all such + * segments. + * + * This list should always be empty in the postmaster. We could probably + * allow the postmaster to map dynamic shared memory segments before it + * begins to start child processes, provided that each process adjusted + * the reference counts for those segments in the control segment at + * startup time, but there's no obvious need for such a facility, which + * would also be complex to handle in the EXEC_BACKEND case. 
Once the + * postmaster has begun spawning children, there's an additional problem: + * each new mapping would require an update to the control segment, + * which requires locking, in which the postmaster must not be involved. + */ +static dlist_head dsm_segment_list = DLIST_STATIC_INIT(dsm_segment_list); + +/* + * Control segment information. + * + * Unlike ordinary shared memory segments, the control segment is not + * reference counted; instead, it lasts for the postmaster's entire + * life cycle. For simplicity, it doesn't have a dsm_segment object either. + */ +static dsm_handle dsm_control_handle; +static dsm_control_header *dsm_control; +static Size dsm_control_mapped_size = 0; +static void *dsm_control_impl_private = NULL; + +/* + * Start up the dynamic shared memory system. + * + * This is called just once during each cluster lifetime, at postmaster + * startup time. + */ +void +dsm_postmaster_startup(PGShmemHeader *shim) +{ + void *dsm_control_address = NULL; + uint32 maxitems; + Size segsize; + + Assert(!IsUnderPostmaster); + + /* + * If we're using the mmap implementations, clean up any leftovers. + * Cleanup isn't needed on Windows, and happens earlier in startup for + * POSIX and System V shared memory, via a direct call to + * dsm_cleanup_using_control_segment. + */ + if (dynamic_shared_memory_type == DSM_IMPL_MMAP) + dsm_cleanup_for_mmap(); + + /* Determine size for new control segment. */ + maxitems = PG_DYNSHMEM_FIXED_SLOTS + + PG_DYNSHMEM_SLOTS_PER_BACKEND * MaxBackends; + elog(DEBUG2, "dynamic shared memory system will support %u segments", + maxitems); + segsize = dsm_control_bytes_needed(maxitems); + + /* + * Loop until we find an unused identifier for the new control segment. We + * sometimes use 0 as a sentinel value indicating that no control segment + * is known to exist, so avoid using that value for a real control + * segment. + */ + for (;;) + { + Assert(dsm_control_address == NULL); + Assert(dsm_control_mapped_size == 0); + dsm_control_handle = random() << 1; /* Even numbers only */ + if (dsm_control_handle == DSM_HANDLE_INVALID) + continue; + if (dsm_impl_op(DSM_OP_CREATE, dsm_control_handle, segsize, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, ERROR)) + break; + } + dsm_control = dsm_control_address; + on_shmem_exit(dsm_postmaster_shutdown, PointerGetDatum(shim)); + elog(DEBUG2, + "created dynamic shared memory control segment %u (%zu bytes)", + dsm_control_handle, segsize); + shim->dsm_control = dsm_control_handle; + + /* Initialize control segment. */ + dsm_control->magic = PG_DYNSHMEM_CONTROL_MAGIC; + dsm_control->nitems = 0; + dsm_control->maxitems = maxitems; +} + +/* + * Determine whether the control segment from the previous postmaster + * invocation still exists. If so, remove the dynamic shared memory + * segments to which it refers, and then the control segment itself. + */ +void +dsm_cleanup_using_control_segment(dsm_handle old_control_handle) +{ + void *mapped_address = NULL; + void *junk_mapped_address = NULL; + void *impl_private = NULL; + void *junk_impl_private = NULL; + Size mapped_size = 0; + Size junk_mapped_size = 0; + uint32 nitems; + uint32 i; + dsm_control_header *old_control; + + /* + * Try to attach the segment. If this fails, it probably just means that + * the operating system has been rebooted and the segment no longer + * exists, or an unrelated process has used the same shm ID. So just fall + * out quietly. 
+ */ + if (!dsm_impl_op(DSM_OP_ATTACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, DEBUG1)) + return; + + /* + * We've managed to reattach it, but the contents might not be sane. If + * they aren't, we disregard the segment after all. + */ + old_control = (dsm_control_header *) mapped_address; + if (!dsm_control_segment_sane(old_control, mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); + return; + } + + /* + * OK, the control segment looks basically valid, so we can use it to get + * a list of segments that need to be removed. + */ + nitems = old_control->nitems; + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + uint32 refcnt; + + /* If the reference count is 0, the slot is actually unused. */ + refcnt = old_control->item[i].refcnt; + if (refcnt == 0) + continue; + + /* If it was using the main shmem area, there is nothing to do. */ + handle = old_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u (reference count %u)", + handle, refcnt); + + /* Destroy the referenced segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Destroy the old control segment, too. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + old_control_handle); + dsm_impl_op(DSM_OP_DESTROY, old_control_handle, 0, &impl_private, + &mapped_address, &mapped_size, LOG); +} + +/* + * When we're using the mmap shared memory implementation, "shared memory" + * segments might even manage to survive an operating system reboot. + * But there's no guarantee as to exactly what will survive: some segments + * may survive, and others may not, and the contents of some may be out + * of date. In particular, the control segment may be out of date, so we + * can't rely on it to figure out what to remove. However, since we know + * what directory contains the files we used as shared memory, we can simply + * scan the directory and blow everything away that shouldn't be there. + */ +static void +dsm_cleanup_for_mmap(void) +{ + DIR *dir; + struct dirent *dent; + + /* Scan the directory for something with a name of the correct format. */ + dir = AllocateDir(PG_DYNSHMEM_DIR); + + while ((dent = ReadDir(dir, PG_DYNSHMEM_DIR)) != NULL) + { + if (strncmp(dent->d_name, PG_DYNSHMEM_MMAP_FILE_PREFIX, + strlen(PG_DYNSHMEM_MMAP_FILE_PREFIX)) == 0) + { + char buf[MAXPGPATH + sizeof(PG_DYNSHMEM_DIR)]; + + snprintf(buf, sizeof(buf), PG_DYNSHMEM_DIR "/%s", dent->d_name); + + elog(DEBUG2, "removing file \"%s\"", buf); + + /* We found a matching file; so remove it. */ + if (unlink(buf) != 0) + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not remove file \"%s\": %m", buf))); + } + } + + /* Cleanup complete. */ + FreeDir(dir); +} + +/* + * At shutdown time, we iterate over the control segment and remove all + * remaining dynamic shared memory segments. We avoid throwing errors here; + * the postmaster is shutting down either way, and this is just non-critical + * resource cleanup. 
+ */ +static void +dsm_postmaster_shutdown(int code, Datum arg) +{ + uint32 nitems; + uint32 i; + void *dsm_control_address; + void *junk_mapped_address = NULL; + void *junk_impl_private = NULL; + Size junk_mapped_size = 0; + PGShmemHeader *shim = (PGShmemHeader *) DatumGetPointer(arg); + + /* + * If some other backend exited uncleanly, it might have corrupted the + * control segment while it was dying. In that case, we warn and ignore + * the contents of the control segment. This may end up leaving behind + * stray shared memory segments, but there's not much we can do about that + * if the metadata is gone. + */ + nitems = dsm_control->nitems; + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + ereport(LOG, + (errmsg("dynamic shared memory control segment is corrupt"))); + return; + } + + /* Remove any remaining segments. */ + for (i = 0; i < nitems; ++i) + { + dsm_handle handle; + + /* If the reference count is 0, the slot is actually unused. */ + if (dsm_control->item[i].refcnt == 0) + continue; + + handle = dsm_control->item[i].handle; + if (is_main_region_dsm_handle(handle)) + continue; + + /* Log debugging information. */ + elog(DEBUG2, "cleaning up orphaned dynamic shared memory with ID %u", + handle); + + /* Destroy the segment. */ + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, LOG); + } + + /* Remove the control segment itself. */ + elog(DEBUG2, + "cleaning up dynamic shared memory control segment with ID %u", + dsm_control_handle); + dsm_control_address = dsm_control; + dsm_impl_op(DSM_OP_DESTROY, dsm_control_handle, 0, + &dsm_control_impl_private, &dsm_control_address, + &dsm_control_mapped_size, LOG); + dsm_control = dsm_control_address; + shim->dsm_control = 0; +} + +/* + * Prepare this backend for dynamic shared memory usage. Under EXEC_BACKEND, + * we must reread the state file and map the control segment; in other cases, + * we'll have inherited the postmaster's mapping and global variables. + */ +static void +dsm_backend_startup(void) +{ +#ifdef EXEC_BACKEND + { + void *control_address = NULL; + + /* Attach control segment. */ + Assert(dsm_control_handle != 0); + dsm_impl_op(DSM_OP_ATTACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); + dsm_control = control_address; + /* If control segment doesn't look sane, something is badly wrong. */ + if (!dsm_control_segment_sane(dsm_control, dsm_control_mapped_size)) + { + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, WARNING); + ereport(FATAL, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("dynamic shared memory control segment is not valid"))); + } + } +#endif + + dsm_init_done = true; +} + +#ifdef EXEC_BACKEND +/* + * When running under EXEC_BACKEND, we get a callback here when the main + * shared memory segment is re-attached, so that we can record the control + * handle retrieved from it. + */ +void +dsm_set_control_handle(dsm_handle h) +{ + Assert(dsm_control_handle == 0 && h != 0); + dsm_control_handle = h; +} +#endif + +/* + * Reserve some space in the main shared memory segment for DSM segments. + */ +size_t +dsm_estimate_size(void) +{ + return 1024 * 1024 * (size_t) min_dynamic_shared_memory; +} + +/* + * Initialize space in the main shared memory segment for DSM segments. 
+ */ +void +dsm_shmem_init(void) +{ + size_t size = dsm_estimate_size(); + bool found; + + if (size == 0) + return; + + dsm_main_space_begin = ShmemInitStruct("Preallocated DSM", size, &found); + if (!found) + { + FreePageManager *fpm = (FreePageManager *) dsm_main_space_begin; + size_t first_page = 0; + size_t pages; + + /* Reserve space for the FreePageManager. */ + while (first_page * FPM_PAGE_SIZE < sizeof(FreePageManager)) + ++first_page; + + /* Initialize it and give it all the rest of the space. */ + FreePageManagerInitialize(fpm, dsm_main_space_begin); + pages = (size / FPM_PAGE_SIZE) - first_page; + FreePageManagerPut(fpm, first_page, pages); + } +} + +/* + * Create a new dynamic shared memory segment. + * + * If there is a non-NULL CurrentResourceOwner, the new segment is associated + * with it and must be detached before the resource owner releases, or a + * warning will be logged. If CurrentResourceOwner is NULL, the segment + * remains attached until explicitly detached or the session ends. + * Creating with a NULL CurrentResourceOwner is equivalent to creating + * with a non-NULL CurrentResourceOwner and then calling dsm_pin_mapping. + */ +dsm_segment * +dsm_create(Size size, int flags) +{ + dsm_segment *seg; + uint32 i; + uint32 nitems; + size_t npages = 0; + size_t first_page = 0; + FreePageManager *dsm_main_space_fpm = dsm_main_space_begin; + bool using_main_dsm_region = false; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + + /* + * Lock the control segment while we try to allocate from the main shared + * memory area, if configured. + */ + if (dsm_main_space_fpm) + { + npages = size / FPM_PAGE_SIZE; + if (size % FPM_PAGE_SIZE > 0) + ++npages; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (FreePageManagerGet(dsm_main_space_fpm, npages, &first_page)) + { + /* We can carve out a piece of the main shared memory segment. */ + seg->mapped_address = (char *) dsm_main_space_begin + + first_page * FPM_PAGE_SIZE; + seg->mapped_size = npages * FPM_PAGE_SIZE; + using_main_dsm_region = true; + /* We'll choose a handle below. */ + } + } + + if (!using_main_dsm_region) + { + /* + * We need to create a new memory segment. Loop until we find an + * unused segment identifier. + */ + if (dsm_main_space_fpm) + LWLockRelease(DynamicSharedMemoryControlLock); + for (;;) + { + Assert(seg->mapped_address == NULL && seg->mapped_size == 0); + seg->handle = random() << 1; /* Even numbers only */ + if (seg->handle == DSM_HANDLE_INVALID) /* Reserve sentinel */ + continue; + if (dsm_impl_op(DSM_OP_CREATE, seg->handle, size, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR)) + break; + } + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + } + + /* Search the control segment for an unused slot. 
*/ + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + if (dsm_control->item[i].refcnt == 0) + { + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(i); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + else + Assert(!is_main_region_dsm_handle(seg->handle)); + dsm_control->item[i].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[i].refcnt = 2; + dsm_control->item[i].impl_private_pm_handle = NULL; + dsm_control->item[i].pinned = false; + seg->control_slot = i; + LWLockRelease(DynamicSharedMemoryControlLock); + return seg; + } + } + + /* Verify that we can support an additional mapping. */ + if (nitems >= dsm_control->maxitems) + { + if (using_main_dsm_region) + FreePageManagerPut(dsm_main_space_fpm, first_page, npages); + LWLockRelease(DynamicSharedMemoryControlLock); + if (!using_main_dsm_region) + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); + + if ((flags & DSM_CREATE_NULL_IF_MAXSEGMENTS) != 0) + return NULL; + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("too many dynamic shared memory segments"))); + } + + /* Enter the handle into a new array slot. */ + if (using_main_dsm_region) + { + seg->handle = make_main_region_dsm_handle(nitems); + dsm_control->item[i].first_page = first_page; + dsm_control->item[i].npages = npages; + } + dsm_control->item[nitems].handle = seg->handle; + /* refcnt of 1 triggers destruction, so start at 2 */ + dsm_control->item[nitems].refcnt = 2; + dsm_control->item[nitems].impl_private_pm_handle = NULL; + dsm_control->item[nitems].pinned = false; + seg->control_slot = nitems; + dsm_control->nitems++; + LWLockRelease(DynamicSharedMemoryControlLock); + + return seg; +} + +/* + * Attach a dynamic shared memory segment. + * + * See comments for dsm_segment_handle() for an explanation of how this + * is intended to be used. + * + * This function will return NULL if the segment isn't known to the system. + * This can happen if we're asked to attach the segment, but then everyone + * else detaches it (causing it to be destroyed) before we get around to + * attaching it. + * + * If there is a non-NULL CurrentResourceOwner, the attached segment is + * associated with it and must be detached before the resource owner releases, + * or a warning will be logged. Otherwise the segment remains attached until + * explicitly detached or the session ends. See the note atop dsm_create(). + */ +dsm_segment * +dsm_attach(dsm_handle h) +{ + dsm_segment *seg; + dlist_iter iter; + uint32 i; + uint32 nitems; + + /* Unsafe in postmaster (and pointless in a stand-alone backend). */ + Assert(IsUnderPostmaster); + + if (!dsm_init_done) + dsm_backend_startup(); + + /* + * Since this is just a debugging cross-check, we could leave it out + * altogether, or include it only in assert-enabled builds. But since the + * list of attached segments should normally be very short, let's include + * it always for right now. + * + * If you're hitting this error, you probably want to attempt to find an + * existing mapping via dsm_find_mapping() before calling dsm_attach() to + * create a new one. 
+ */ + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + elog(ERROR, "can't attach the same segment more than once"); + } + + /* Create a new segment descriptor. */ + seg = dsm_create_descriptor(); + seg->handle = h; + + /* Bump reference count for this segment in shared memory. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + nitems = dsm_control->nitems; + for (i = 0; i < nitems; ++i) + { + /* + * If the reference count is 0, the slot is actually unused. If the + * reference count is 1, the slot is still in use, but the segment is + * in the process of going away; even if the handle matches, another + * slot may already have started using the same handle value by + * coincidence so we have to keep searching. + */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If the handle doesn't match, it's not the slot we want. */ + if (dsm_control->item[i].handle != seg->handle) + continue; + + /* Otherwise we've found a match. */ + dsm_control->item[i].refcnt++; + seg->control_slot = i; + if (is_main_region_dsm_handle(seg->handle)) + { + seg->mapped_address = (char *) dsm_main_space_begin + + dsm_control->item[i].first_page * FPM_PAGE_SIZE; + seg->mapped_size = dsm_control->item[i].npages * FPM_PAGE_SIZE; + } + break; + } + LWLockRelease(DynamicSharedMemoryControlLock); + + /* + * If we didn't find the handle we're looking for in the control segment, + * it probably means that everyone else who had it mapped, including the + * original creator, died before we got to this point. It's up to the + * caller to decide what to do about that. + */ + if (seg->control_slot == INVALID_CONTROL_SLOT) + { + dsm_detach(seg); + return NULL; + } + + /* Here's where we actually try to map the segment. */ + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_ATTACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, ERROR); + + return seg; +} + +/* + * At backend shutdown time, detach any segments that are still attached. + * (This is similar to dsm_detach_all, except that there's no reason to + * unmap the control segment before exiting, so we don't bother.) + */ +void +dsm_backend_shutdown(void) +{ + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } +} + +/* + * Detach all shared memory segments, including the control segments. This + * should be called, along with PGSharedMemoryDetach, in processes that + * might inherit mappings but are not intended to be connected to dynamic + * shared memory. + */ +void +dsm_detach_all(void) +{ + void *control_address = dsm_control; + + while (!dlist_is_empty(&dsm_segment_list)) + { + dsm_segment *seg; + + seg = dlist_head_element(dsm_segment, node, &dsm_segment_list); + dsm_detach(seg); + } + + if (control_address != NULL) + dsm_impl_op(DSM_OP_DETACH, dsm_control_handle, 0, + &dsm_control_impl_private, &control_address, + &dsm_control_mapped_size, ERROR); +} + +/* + * Detach from a shared memory segment, destroying the segment if we + * remove the last reference. + * + * This function should never fail. It will often be invoked when aborting + * a transaction, and a further error won't serve any purpose. It's not a + * complete disaster if we fail to unmap or destroy the segment; it means a + * resource leak, but that doesn't necessarily preclude further operations. 
+ */ +void +dsm_detach(dsm_segment *seg) +{ + /* + * Invoke registered callbacks. Just in case one of those callbacks + * throws a further error that brings us back here, pop the callback + * before invoking it, to avoid infinite error recursion. Don't allow + * interrupts while running the individual callbacks in non-error code + * paths, to avoid leaving cleanup work unfinished if we're interrupted by + * a statement timeout or similar. + */ + HOLD_INTERRUPTS(); + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + on_dsm_detach_callback function; + Datum arg; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + function = cb->function; + arg = cb->arg; + pfree(cb); + + function(seg, arg); + } + RESUME_INTERRUPTS(); + + /* + * Try to remove the mapping, if one exists. Normally, there will be, but + * maybe not, if we failed partway through a create or attach operation. + * We remove the mapping before decrementing the reference count so that + * the process that sees a zero reference count can be certain that no + * remaining mappings exist. Even if this fails, we pretend that it + * works, because retrying is likely to fail in the same way. + */ + if (seg->mapped_address != NULL) + { + if (!is_main_region_dsm_handle(seg->handle)) + dsm_impl_op(DSM_OP_DETACH, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING); + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + } + + /* Reduce reference count, if we previously increased it. */ + if (seg->control_slot != INVALID_CONTROL_SLOT) + { + uint32 refcnt; + uint32 control_slot = seg->control_slot; + + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt > 1); + refcnt = --dsm_control->item[control_slot].refcnt; + seg->control_slot = INVALID_CONTROL_SLOT; + LWLockRelease(DynamicSharedMemoryControlLock); + + /* If new reference count is 1, try to destroy the segment. */ + if (refcnt == 1) + { + /* A pinned segment should never reach 1. */ + Assert(!dsm_control->item[control_slot].pinned); + + /* + * If we fail to destroy the segment here, or are killed before we + * finish doing so, the reference count will remain at 1, which + * will mean that nobody else can attach to the segment. At + * postmaster shutdown time, or when a new postmaster is started + * after a hard kill, another attempt will be made to remove the + * segment. + * + * The main case we're worried about here is being killed by a + * signal before we can finish removing the segment. In that + * case, it's important to be sure that the segment still gets + * removed. If we actually fail to remove the segment for some + * other reason, the postmaster may not have any better luck than + * we did. There's not much we can do about that, though. 
+ */ + if (is_main_region_dsm_handle(seg->handle) || + dsm_impl_op(DSM_OP_DESTROY, seg->handle, 0, &seg->impl_private, + &seg->mapped_address, &seg->mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(seg->handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == seg->handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } + } + + /* Clean up our remaining backend-private data structures. */ + if (seg->resowner != NULL) + ResourceOwnerForgetDSM(seg->resowner, seg); + dlist_delete(&seg->node); + pfree(seg); +} + +/* + * Keep a dynamic shared memory mapping until end of session. + * + * By default, mappings are owned by the current resource owner, which + * typically means they stick around for the duration of the current query + * only. + */ +void +dsm_pin_mapping(dsm_segment *seg) +{ + if (seg->resowner != NULL) + { + ResourceOwnerForgetDSM(seg->resowner, seg); + seg->resowner = NULL; + } +} + +/* + * Arrange to remove a dynamic shared memory mapping at cleanup time. + * + * dsm_pin_mapping() can be used to preserve a mapping for the entire + * lifetime of a process; this function reverses that decision, making + * the segment owned by the current resource owner. This may be useful + * just before performing some operation that will invalidate the segment + * for future use by this backend. + */ +void +dsm_unpin_mapping(dsm_segment *seg) +{ + Assert(seg->resowner == NULL); + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + seg->resowner = CurrentResourceOwner; + ResourceOwnerRememberDSM(seg->resowner, seg); +} + +/* + * Keep a dynamic shared memory segment until postmaster shutdown, or until + * dsm_unpin_segment is called. + * + * This function should not be called more than once per segment, unless the + * segment is explicitly unpinned with dsm_unpin_segment in between calls. + * + * Note that this function does not arrange for the current process to + * keep the segment mapped indefinitely; if that behavior is desired, + * dsm_pin_mapping() should be used from each process that needs to + * retain the mapping. + */ +void +dsm_pin_segment(dsm_segment *seg) +{ + void *handle; + + /* + * Bump reference count for this segment in shared memory. This will + * ensure that even if there is no session which is attached to this + * segment, it will remain until postmaster shutdown or an explicit call + * to unpin. + */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (dsm_control->item[seg->control_slot].pinned) + elog(ERROR, "cannot pin a segment that is already pinned"); + dsm_impl_pin_segment(seg->handle, seg->impl_private, &handle); + dsm_control->item[seg->control_slot].pinned = true; + dsm_control->item[seg->control_slot].refcnt++; + dsm_control->item[seg->control_slot].impl_private_pm_handle = handle; + LWLockRelease(DynamicSharedMemoryControlLock); +} + +/* + * Unpin a dynamic shared memory segment that was previously pinned with + * dsm_pin_segment. This function should not be called unless dsm_pin_segment + * was previously called for this segment. + * + * The argument is a dsm_handle rather than a dsm_segment in case you want + * to unpin a segment to which you haven't attached. 
This turns out to be + * useful if, for example, a reference to one shared memory segment is stored + * within another shared memory segment. You might want to unpin the + * referenced segment before destroying the referencing segment. + */ +void +dsm_unpin_segment(dsm_handle handle) +{ + uint32 control_slot = INVALID_CONTROL_SLOT; + bool destroy = false; + uint32 i; + + /* Find the control slot for the given handle. */ + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + for (i = 0; i < dsm_control->nitems; ++i) + { + /* Skip unused slots and segments that are concurrently going away. */ + if (dsm_control->item[i].refcnt <= 1) + continue; + + /* If we've found our handle, we can stop searching. */ + if (dsm_control->item[i].handle == handle) + { + control_slot = i; + break; + } + } + + /* + * We should definitely have found the slot, and it should not already be + * in the process of going away, because this function should only be + * called on a segment which is pinned. + */ + if (control_slot == INVALID_CONTROL_SLOT) + elog(ERROR, "cannot unpin unknown segment handle"); + if (!dsm_control->item[control_slot].pinned) + elog(ERROR, "cannot unpin a segment that is not pinned"); + Assert(dsm_control->item[control_slot].refcnt > 1); + + /* + * Allow implementation-specific code to run. We have to do this before + * releasing the lock, because impl_private_pm_handle may get modified by + * dsm_impl_unpin_segment. + */ + dsm_impl_unpin_segment(handle, + &dsm_control->item[control_slot].impl_private_pm_handle); + + /* Note that 1 means no references (0 means unused slot). */ + if (--dsm_control->item[control_slot].refcnt == 1) + destroy = true; + dsm_control->item[control_slot].pinned = false; + + /* Now we can release the lock. */ + LWLockRelease(DynamicSharedMemoryControlLock); + + /* Clean up resources if that was the last reference. */ + if (destroy) + { + void *junk_impl_private = NULL; + void *junk_mapped_address = NULL; + Size junk_mapped_size = 0; + + /* + * For an explanation of how error handling works in this case, see + * comments in dsm_detach. Note that if we reach this point, the + * current process certainly does not have the segment mapped, because + * if it did, the reference count would have still been greater than 1 + * even after releasing the reference count held by the pin. The fact + * that there can't be a dsm_segment for this handle makes it OK to + * pass the mapped size, mapped address, and private data as NULL + * here. + */ + if (is_main_region_dsm_handle(handle) || + dsm_impl_op(DSM_OP_DESTROY, handle, 0, &junk_impl_private, + &junk_mapped_address, &junk_mapped_size, WARNING)) + { + LWLockAcquire(DynamicSharedMemoryControlLock, LW_EXCLUSIVE); + if (is_main_region_dsm_handle(handle)) + FreePageManagerPut((FreePageManager *) dsm_main_space_begin, + dsm_control->item[control_slot].first_page, + dsm_control->item[control_slot].npages); + Assert(dsm_control->item[control_slot].handle == handle); + Assert(dsm_control->item[control_slot].refcnt == 1); + dsm_control->item[control_slot].refcnt = 0; + LWLockRelease(DynamicSharedMemoryControlLock); + } + } +} + +/* + * Find an existing mapping for a shared memory segment, if there is one. 
+ */ +dsm_segment * +dsm_find_mapping(dsm_handle h) +{ + dlist_iter iter; + dsm_segment *seg; + + dlist_foreach(iter, &dsm_segment_list) + { + seg = dlist_container(dsm_segment, node, iter.cur); + if (seg->handle == h) + return seg; + } + + return NULL; +} + +/* + * Get the address at which a dynamic shared memory segment is mapped. + */ +void * +dsm_segment_address(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_address; +} + +/* + * Get the size of a mapping. + */ +Size +dsm_segment_map_length(dsm_segment *seg) +{ + Assert(seg->mapped_address != NULL); + return seg->mapped_size; +} + +/* + * Get a handle for a mapping. + * + * To establish communication via dynamic shared memory between two backends, + * one of them should first call dsm_create() to establish a new shared + * memory mapping. That process should then call dsm_segment_handle() to + * obtain a handle for the mapping, and pass that handle to the + * coordinating backend via some means (e.g. bgw_main_arg, or via the + * main shared memory segment). The recipient, once in possession of the + * handle, should call dsm_attach(). + */ +dsm_handle +dsm_segment_handle(dsm_segment *seg) +{ + return seg->handle; +} + +/* + * Register an on-detach callback for a dynamic shared memory segment. + */ +void +on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, Datum arg) +{ + dsm_segment_detach_callback *cb; + + cb = MemoryContextAlloc(TopMemoryContext, + sizeof(dsm_segment_detach_callback)); + cb->function = function; + cb->arg = arg; + slist_push_head(&seg->on_detach, &cb->node); +} + +/* + * Unregister an on-detach callback for a dynamic shared memory segment. + */ +void +cancel_on_dsm_detach(dsm_segment *seg, on_dsm_detach_callback function, + Datum arg) +{ + slist_mutable_iter iter; + + slist_foreach_modify(iter, &seg->on_detach) + { + dsm_segment_detach_callback *cb; + + cb = slist_container(dsm_segment_detach_callback, node, iter.cur); + if (cb->function == function && cb->arg == arg) + { + slist_delete_current(&iter); + pfree(cb); + break; + } + } +} + +/* + * Discard all registered on-detach callbacks without executing them. + */ +void +reset_on_dsm_detach(void) +{ + dlist_iter iter; + + dlist_foreach(iter, &dsm_segment_list) + { + dsm_segment *seg = dlist_container(dsm_segment, node, iter.cur); + + /* Throw away explicit on-detach actions one by one. */ + while (!slist_is_empty(&seg->on_detach)) + { + slist_node *node; + dsm_segment_detach_callback *cb; + + node = slist_pop_head_node(&seg->on_detach); + cb = slist_container(dsm_segment_detach_callback, node, node); + pfree(cb); + } + + /* + * Decrementing the reference count is a sort of implicit on-detach + * action; make sure we don't do that, either. + */ + seg->control_slot = INVALID_CONTROL_SLOT; + } +} + +/* + * Create a segment descriptor. + */ +static dsm_segment * +dsm_create_descriptor(void) +{ + dsm_segment *seg; + + if (CurrentResourceOwner) + ResourceOwnerEnlargeDSMs(CurrentResourceOwner); + + seg = MemoryContextAlloc(TopMemoryContext, sizeof(dsm_segment)); + dlist_push_head(&dsm_segment_list, &seg->node); + + /* seg->handle must be initialized by the caller */ + seg->control_slot = INVALID_CONTROL_SLOT; + seg->impl_private = NULL; + seg->mapped_address = NULL; + seg->mapped_size = 0; + + seg->resowner = CurrentResourceOwner; + if (CurrentResourceOwner) + ResourceOwnerRememberDSM(CurrentResourceOwner, seg); + + slist_init(&seg->on_detach); + + return seg; +} + +/* + * Sanity check a control segment. 
+ * + * The goal here isn't to detect everything that could possibly be wrong with + * the control segment; there's not enough information for that. Rather, the + * goal is to make sure that someone can iterate over the items in the segment + * without overrunning the end of the mapping and crashing. We also check + * the magic number since, if that's messed up, this may not even be one of + * our segments at all. + */ +static bool +dsm_control_segment_sane(dsm_control_header *control, Size mapped_size) +{ + if (mapped_size < offsetof(dsm_control_header, item)) + return false; /* Mapped size too short to read header. */ + if (control->magic != PG_DYNSHMEM_CONTROL_MAGIC) + return false; /* Magic number doesn't match. */ + if (dsm_control_bytes_needed(control->maxitems) > mapped_size) + return false; /* Max item count won't fit in map. */ + if (control->nitems > control->maxitems) + return false; /* Overfull. */ + return true; +} + +/* + * Compute the number of control-segment bytes needed to store a given + * number of items. + */ +static uint64 +dsm_control_bytes_needed(uint32 nitems) +{ + return offsetof(dsm_control_header, item) + + sizeof(dsm_control_item) * (uint64) nitems; +} + +static inline dsm_handle +make_main_region_dsm_handle(int slot) +{ + dsm_handle handle; + + /* + * We need to create a handle that doesn't collide with any existing extra + * segment created by dsm_impl_op(), so we'll make it odd. It also + * mustn't collide with any other main area pseudo-segment, so we'll + * include the slot number in some of the bits. We also want to make an + * effort to avoid newly created and recently destroyed handles from being + * confused, so we'll make the rest of the bits random. + */ + handle = 1; + handle |= slot << 1; + handle |= random() << (pg_leftmost_one_pos32(dsm_control->maxitems) + 1); + return handle; +} + +static inline bool +is_main_region_dsm_handle(dsm_handle handle) +{ + return handle & 1; +} diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c new file mode 100644 index 0000000..c51e3e6 --- /dev/null +++ b/src/backend/storage/ipc/dsm_impl.c @@ -0,0 +1,1058 @@ +/*------------------------------------------------------------------------- + * + * dsm_impl.c + * manage dynamic shared memory segments + * + * This file provides low-level APIs for creating and destroying shared + * memory segments using several different possible techniques. We refer + * to these segments as dynamic because they can be created, altered, and + * destroyed at any point during the server life cycle. This is unlike + * the main shared memory segment, of which there is always exactly one + * and which is always mapped at a fixed address in every PostgreSQL + * background process. + * + * Because not all systems provide the same primitives in this area, nor + * do all primitives behave the same way on all systems, we provide + * several implementations of this facility. Many systems implement + * POSIX shared memory (shm_open etc.), which is well-suited to our needs + * in this area, with the exception that shared memory identifiers live + * in a flat system-wide namespace, raising the uncomfortable prospect of + * name collisions with other processes (including other copies of + * PostgreSQL) running on the same system. Some systems only support + * the older System V shared memory interface (shmget etc.) which is + * also usable; however, the default allocation limits are often quite + * small, and the namespace is even more restricted. 
+ * + * We also provide an mmap-based shared memory implementation. This may + * be useful on systems that provide shared memory via a special-purpose + * filesystem; by opting for this implementation, the user can even + * control precisely where their shared memory segments are placed. It + * can also be used as a fallback for systems where shm_open and shmget + * are not available or can't be used for some reason. Of course, + * mapping a file residing on an actual spinning disk is a fairly poor + * approximation for shared memory because writeback may hurt performance + * substantially, but there should be few systems where we must make do + * with such poor tools. + * + * As ever, Windows requires its own implementation. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/dsm_impl.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include <fcntl.h> +#include <signal.h> +#include <unistd.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif +#include <sys/stat.h> +#ifdef HAVE_SYS_IPC_H +#include <sys/ipc.h> +#endif +#ifdef HAVE_SYS_SHM_H +#include <sys/shm.h> +#endif + +#include "common/file_perm.h" +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "portability/mem.h" +#include "postmaster/postmaster.h" +#include "storage/dsm_impl.h" +#include "storage/fd.h" +#include "utils/guc.h" +#include "utils/memutils.h" + +#ifdef USE_DSM_POSIX +static bool dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +static int dsm_impl_posix_resize(int fd, off_t size); +#endif +#ifdef USE_DSM_SYSV +static bool dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_WINDOWS +static bool dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +#ifdef USE_DSM_MMAP +static bool dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel); +#endif +static int errcode_for_dynamic_shared_memory(void); + +const struct config_enum_entry dynamic_shared_memory_options[] = { +#ifdef USE_DSM_POSIX + {"posix", DSM_IMPL_POSIX, false}, +#endif +#ifdef USE_DSM_SYSV + {"sysv", DSM_IMPL_SYSV, false}, +#endif +#ifdef USE_DSM_WINDOWS + {"windows", DSM_IMPL_WINDOWS, false}, +#endif +#ifdef USE_DSM_MMAP + {"mmap", DSM_IMPL_MMAP, false}, +#endif + {NULL, 0, false} +}; + +/* Implementation selector. */ +int dynamic_shared_memory_type; + +/* Amount of space reserved for DSM segments in the main area. */ +int min_dynamic_shared_memory; + +/* Size of buffer to be used for zero-filling. */ +#define ZBUFFER_SIZE 8192 + +#define SEGMENT_NAME_PREFIX "Global/PostgreSQL" + +/*------ + * Perform a low-level shared memory operation in a platform-specific way, + * as dictated by the selected implementation. Each implementation is + * required to implement the following primitives. + * + * DSM_OP_CREATE. Create a segment whose size is the request_size and + * map it. + * + * DSM_OP_ATTACH. Map the segment, whose size must be the request_size. + * + * DSM_OP_DETACH. Unmap the segment. + * + * DSM_OP_DESTROY. 
Unmap the segment, if it is mapped. Destroy the + * segment. + * + * Arguments: + * op: The operation to be performed. + * handle: The handle of an existing object, or for DSM_OP_CREATE, the + * a new handle the caller wants created. + * request_size: For DSM_OP_CREATE, the requested size. Otherwise, 0. + * impl_private: Private, implementation-specific data. Will be a pointer + * to NULL for the first operation on a shared memory segment within this + * backend; thereafter, it will point to the value to which it was set + * on the previous call. + * mapped_address: Pointer to start of current mapping; pointer to NULL + * if none. Updated with new mapping address. + * mapped_size: Pointer to size of current mapping; pointer to 0 if none. + * Updated with new mapped size. + * elevel: Level at which to log errors. + * + * Return value: true on success, false on failure. When false is returned, + * a message should first be logged at the specified elevel, except in the + * case where DSM_OP_CREATE experiences a name collision, which should + * silently return false. + *----- + */ +bool +dsm_impl_op(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + Assert(op == DSM_OP_CREATE || request_size == 0); + Assert((op != DSM_OP_CREATE && op != DSM_OP_ATTACH) || + (*mapped_address == NULL && *mapped_size == 0)); + + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_POSIX + case DSM_IMPL_POSIX: + return dsm_impl_posix(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_SYSV + case DSM_IMPL_SYSV: + return dsm_impl_sysv(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + return dsm_impl_windows(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif +#ifdef USE_DSM_MMAP + case DSM_IMPL_MMAP: + return dsm_impl_mmap(op, handle, request_size, impl_private, + mapped_address, mapped_size, elevel); +#endif + default: + elog(ERROR, "unexpected dynamic shared memory type: %d", + dynamic_shared_memory_type); + return false; + } +} + +#ifdef USE_DSM_POSIX +/* + * Operating system primitives to support POSIX shared memory. + * + * POSIX shared memory segments are created and attached using shm_open() + * and shm_unlink(); other operations, such as sizing or mapping the + * segment, are performed as if the shared memory segments were files. + * + * Indeed, on some platforms, they may be implemented that way. While + * POSIX shared memory segments seem intended to exist in a flat namespace, + * some operating systems may implement them as files, even going so far + * to treat a request for /xyz as a request to create a file by that name + * in the root directory. Users of such broken platforms should select + * a different shared memory implementation. + */ +static bool +dsm_impl_posix(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, "/PostgreSQL.%u", handle); + + /* Handle teardown cases. 
*/ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shm_unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* + * Create new segment or open an existing one for attach. + * + * Even though we will close the FD before returning, it seems desirable + * to use Reserve/ReleaseExternalFD, to reduce the probability of EMFILE + * failure. The fact that we won't hold the FD open long justifies using + * ReserveExternalFD rather than AcquireExternalFD, though. + */ + ReserveExternalFD(); + + flags = O_RDWR | (op == DSM_OP_CREATE ? O_CREAT | O_EXCL : 0); + if ((fd = shm_open(name, flags, PG_FILE_MODE_OWNER)) == -1) + { + ReleaseExternalFD(); + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else if (dsm_impl_posix_resize(fd, request_size) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + close(fd); + ReleaseExternalFD(); + if (op == DSM_OP_CREATE) + shm_unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + close(fd); + ReleaseExternalFD(); + + return true; +} + +/* + * Set the size of a virtual memory region associated with a file descriptor. + * If necessary, also ensure that virtual memory is actually allocated by the + * operating system, to avoid nasty surprises later. + * + * Returns non-zero if either truncation or allocation fails, and sets errno. + */ +static int +dsm_impl_posix_resize(int fd, off_t size) +{ + int rc; + int save_errno; + sigset_t save_sigmask; + + /* + * Block all blockable signals, except SIGQUIT. posix_fallocate() can run + * for quite a long time, and is an all-or-nothing operation. 
If we + * allowed SIGUSR1 to interrupt us repeatedly (for example, due to recovery + * conflicts), the retry loop might never succeed. + */ + if (IsUnderPostmaster) + sigprocmask(SIG_SETMASK, &BlockSig, &save_sigmask); + + /* Truncate (or extend) the file to the requested size. */ + do + { + rc = ftruncate(fd, size); + } while (rc < 0 && errno == EINTR); + + /* + * On Linux, a shm_open fd is backed by a tmpfs file. After resizing with + * ftruncate, the file may contain a hole. Accessing memory backed by a + * hole causes tmpfs to allocate pages, which fails with SIGBUS if there + * is no more tmpfs space available. So we ask tmpfs to allocate pages + * here, so we can fail gracefully with ENOSPC now rather than risking + * SIGBUS later. + */ +#if defined(HAVE_POSIX_FALLOCATE) && defined(__linux__) + if (rc == 0) + { + /* + * We still use a traditional EINTR retry loop to handle SIGCONT. + * posix_fallocate() doesn't restart automatically, and we don't want + * this to fail if you attach a debugger. + */ + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + do + { + rc = posix_fallocate(fd, 0, size); + } while (rc == EINTR); + pgstat_report_wait_end(); + + /* + * The caller expects errno to be set, but posix_fallocate() doesn't + * set it. Instead it returns error numbers directly. So set errno, + * even though we'll also return rc to indicate success or failure. + */ + errno = rc; + } +#endif /* HAVE_POSIX_FALLOCATE && __linux__ */ + + if (IsUnderPostmaster) + { + save_errno = errno; + sigprocmask(SIG_SETMASK, &save_sigmask, NULL); + errno = save_errno; + } + + return rc; +} + +#endif /* USE_DSM_POSIX */ + +#ifdef USE_DSM_SYSV +/* + * Operating system primitives to support System V shared memory. + * + * System V shared memory segments are manipulated using shmget(), shmat(), + * shmdt(), and shmctl(). As the default allocation limits for System V + * shared memory are usually quite low, the POSIX facilities may be + * preferable; but those are not supported everywhere. + */ +static bool +dsm_impl_sysv(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + key_t key; + int ident; + char *address; + char name[64]; + int *ident_cache; + + /* + * POSIX shared memory and mmap-based shared memory identify segments with + * names. To avoid needless error message variation, we use the handle as + * the name. + */ + snprintf(name, 64, "%u", handle); + + /* + * The System V shared memory namespace is very restricted; names are of + * type key_t, which is expected to be some sort of integer data type, but + * not necessarily the same one as dsm_handle. Since we use dsm_handle to + * identify shared memory segments across processes, this might seem like + * a problem, but it's really not. If dsm_handle is bigger than key_t, + * the cast below might truncate away some bits from the handle the + * user-provided, but it'll truncate exactly the same bits away in exactly + * the same fashion every time we use that handle, which is all that + * really matters. Conversely, if dsm_handle is smaller than key_t, we + * won't use the full range of available key space, but that's no big deal + * either. + * + * We do make sure that the key isn't negative, because that might not be + * portable. + */ + key = (key_t) handle; + if (key < 1) /* avoid compiler warning if type is unsigned */ + key = -key; + + /* + * There's one special key, IPC_PRIVATE, which can't be used. 
If we end + * up with that value by chance during a create operation, just pretend it + * already exists, so that caller will retry. If we run into it anywhere + * else, the caller has passed a handle that doesn't correspond to + * anything we ever created, which should not happen. + */ + if (key == IPC_PRIVATE) + { + if (op != DSM_OP_CREATE) + elog(DEBUG4, "System V shared memory key may not be IPC_PRIVATE"); + errno = EEXIST; + return false; + } + + /* + * Before we can do anything with a shared memory segment, we have to map + * the shared memory key to a shared memory identifier using shmget(). To + * avoid repeated lookups, we store the key using impl_private. + */ + if (*impl_private != NULL) + { + ident_cache = *impl_private; + ident = *ident_cache; + } + else + { + int flags = IPCProtection; + size_t segsize; + + /* + * Allocate the memory BEFORE acquiring the resource, so that we don't + * leak the resource if memory allocation fails. + */ + ident_cache = MemoryContextAlloc(TopMemoryContext, sizeof(int)); + + /* + * When using shmget to find an existing segment, we must pass the + * size as 0. Passing a non-zero size which is greater than the + * actual size will result in EINVAL. + */ + segsize = 0; + + if (op == DSM_OP_CREATE) + { + flags |= IPC_CREAT | IPC_EXCL; + segsize = request_size; + } + + if ((ident = shmget(key, segsize, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + { + int save_errno = errno; + + pfree(ident_cache); + errno = save_errno; + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not get shared memory segment: %m"))); + } + return false; + } + + *ident_cache = ident; + *impl_private = ident_cache; + } + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + pfree(ident_cache); + *impl_private = NULL; + if (*mapped_address != NULL && shmdt(*mapped_address) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && shmctl(ident, IPC_RMID, NULL) < 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* If we're attaching it, we must use IPC_STAT to determine the size. */ + if (op == DSM_OP_ATTACH) + { + struct shmid_ds shm; + + if (shmctl(ident, IPC_STAT, &shm) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = shm.shm_segsz; + } + + /* Map it. */ + address = shmat(ident, NULL, PG_SHMAT_FLAGS); + if (address == (void *) -1) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + if (op == DSM_OP_CREATE) + shmctl(ident, IPC_RMID, NULL); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + return true; +} +#endif + +#ifdef USE_DSM_WINDOWS +/* + * Operating system primitives to support Windows shared memory. + * + * Windows shared memory implementation is done using file mapping + * which can be backed by either physical file or system paging file. 
+ * Current implementation uses system paging file as other effects + * like performance are not clear for physical file and it is used in similar + * way for main shared memory in windows. + * + * A memory mapping object is a kernel object - they always get deleted when + * the last reference to them goes away, either explicitly via a CloseHandle or + * when the process containing the reference exits. + */ +static bool +dsm_impl_windows(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, + Size *mapped_size, int elevel) +{ + char *address; + HANDLE hmap; + char name[64]; + MEMORY_BASIC_INFORMATION info; + + /* + * Storing the shared memory segment in the Global\ namespace, can allow + * any process running in any session to access that file mapping object + * provided that the caller has the required access rights. But to avoid + * issues faced in main shared memory, we are using the naming convention + * similar to main shared memory. We can change here once issue mentioned + * in GetSharedMemName is resolved. + */ + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + + /* + * Handle teardown cases. Since Windows automatically destroys the object + * when no references remain, we can treat it the same as detach. + */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && UnmapViewOfFile(*mapped_address) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + if (*impl_private != NULL + && CloseHandle(*impl_private) == 0) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + + *impl_private = NULL; + *mapped_address = NULL; + *mapped_size = 0; + return true; + } + + /* Create new segment or open an existing one for attach. */ + if (op == DSM_OP_CREATE) + { + DWORD size_high; + DWORD size_low; + DWORD errcode; + + /* Shifts >= the width of the type are undefined. */ +#ifdef _WIN64 + size_high = request_size >> 32; +#else + size_high = 0; +#endif + size_low = (DWORD) request_size; + + /* CreateFileMapping might not clear the error code on success */ + SetLastError(0); + + hmap = CreateFileMapping(INVALID_HANDLE_VALUE, /* Use the pagefile */ + NULL, /* Default security attrs */ + PAGE_READWRITE, /* Memory is read/write */ + size_high, /* Upper 32 bits of size */ + size_low, /* Lower 32 bits of size */ + name); + + errcode = GetLastError(); + if (errcode == ERROR_ALREADY_EXISTS || errcode == ERROR_ACCESS_DENIED) + { + /* + * On Windows, when the segment already exists, a handle for the + * existing segment is returned. We must close it before + * returning. However, if the existing segment is created by a + * service, then it returns ERROR_ACCESS_DENIED. We don't do + * _dosmaperr here, so errno won't be modified. 
+ */ + if (hmap) + CloseHandle(hmap); + return false; + } + + if (!hmap) + { + _dosmaperr(errcode); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not create shared memory segment \"%s\": %m", + name))); + return false; + } + } + else + { + hmap = OpenFileMapping(FILE_MAP_WRITE | FILE_MAP_READ, + FALSE, /* do not inherit the name */ + name); /* name of mapping object */ + if (!hmap) + { + _dosmaperr(GetLastError()); + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + } + + /* Map it. */ + address = MapViewOfFile(hmap, FILE_MAP_WRITE | FILE_MAP_READ, + 0, 0, 0); + if (!address) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * VirtualQuery gives size in page_size units, which is 4K for Windows. We + * need size only when we are attaching, but it's better to get the size + * when creating new segment to keep size consistent both for + * DSM_OP_CREATE and DSM_OP_ATTACH. + */ + if (VirtualQuery(address, &info, sizeof(info)) == 0) + { + int save_errno; + + _dosmaperr(GetLastError()); + /* Back out what's already been done. */ + save_errno = errno; + UnmapViewOfFile(address); + CloseHandle(hmap); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + + *mapped_address = address; + *mapped_size = info.RegionSize; + *impl_private = hmap; + + return true; +} +#endif + +#ifdef USE_DSM_MMAP +/* + * Operating system primitives to support mmap-based shared memory. + * + * Calling this "shared memory" is somewhat of a misnomer, because what + * we're really doing is creating a bunch of files and mapping them into + * our address space. The operating system may feel obliged to + * synchronize the contents to disk even if nothing is being paged out, + * which will not serve us well. The user can relocate the pg_dynshmem + * directory to a ramdisk to avoid this problem, if available. + */ +static bool +dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size, + void **impl_private, void **mapped_address, Size *mapped_size, + int elevel) +{ + char name[64]; + int flags; + int fd; + char *address; + + snprintf(name, 64, PG_DYNSHMEM_DIR "/" PG_DYNSHMEM_MMAP_FILE_PREFIX "%u", + handle); + + /* Handle teardown cases. */ + if (op == DSM_OP_DETACH || op == DSM_OP_DESTROY) + { + if (*mapped_address != NULL + && munmap(*mapped_address, *mapped_size) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not unmap shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = NULL; + *mapped_size = 0; + if (op == DSM_OP_DESTROY && unlink(name) != 0) + { + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not remove shared memory segment \"%s\": %m", + name))); + return false; + } + return true; + } + + /* Create new segment or open an existing one for attach. */ + flags = O_RDWR | (op == DSM_OP_CREATE ? 
O_CREAT | O_EXCL : 0); + if ((fd = OpenTransientFile(name, flags)) == -1) + { + if (op == DSM_OP_ATTACH || errno != EEXIST) + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not open shared memory segment \"%s\": %m", + name))); + return false; + } + + /* + * If we're attaching the segment, determine the current size; if we are + * creating the segment, set the size to the requested value. + */ + if (op == DSM_OP_ATTACH) + { + struct stat st; + + if (fstat(fd, &st) != 0) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not stat shared memory segment \"%s\": %m", + name))); + return false; + } + request_size = st.st_size; + } + else + { + /* + * Allocate a buffer full of zeros. + * + * Note: palloc zbuffer, instead of just using a local char array, to + * ensure it is reasonably well-aligned; this may save a few cycles + * transferring data to the kernel. + */ + char *zbuffer = (char *) palloc0(ZBUFFER_SIZE); + uint32 remaining = request_size; + bool success = true; + + /* + * Zero-fill the file. We have to do this the hard way to ensure that + * all the file space has really been allocated, so that we don't + * later seg fault when accessing the memory mapping. This is pretty + * pessimal. + */ + while (success && remaining > 0) + { + Size goal = remaining; + + if (goal > ZBUFFER_SIZE) + goal = ZBUFFER_SIZE; + pgstat_report_wait_start(WAIT_EVENT_DSM_FILL_ZERO_WRITE); + if (write(fd, zbuffer, goal) == goal) + remaining -= goal; + else + success = false; + pgstat_report_wait_end(); + } + + if (!success) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + unlink(name); + errno = save_errno ? save_errno : ENOSPC; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not resize shared memory segment \"%s\" to %zu bytes: %m", + name, request_size))); + return false; + } + } + + /* Map it. */ + address = mmap(NULL, request_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_HASSEMAPHORE | MAP_NOSYNC, fd, 0); + if (address == MAP_FAILED) + { + int save_errno; + + /* Back out what's already been done. */ + save_errno = errno; + CloseTransientFile(fd); + if (op == DSM_OP_CREATE) + unlink(name); + errno = save_errno; + + ereport(elevel, + (errcode_for_dynamic_shared_memory(), + errmsg("could not map shared memory segment \"%s\": %m", + name))); + return false; + } + *mapped_address = address; + *mapped_size = request_size; + + if (CloseTransientFile(fd) != 0) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not close shared memory segment \"%s\": %m", + name))); + return false; + } + + return true; +} +#endif + +/* + * Implementation-specific actions that must be performed when a segment is to + * be preserved even when no backend has it attached. + * + * Except on Windows, we don't need to do anything at all. But since Windows + * cleans up segments automatically when no references remain, we duplicate + * the segment handle into the postmaster process. The postmaster needn't + * do anything to receive the handle; Windows transfers it automatically. 
+ */ +void +dsm_impl_pin_segment(dsm_handle handle, void *impl_private, + void **impl_private_pm_handle) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + HANDLE hmap; + + if (!DuplicateHandle(GetCurrentProcess(), impl_private, + PostmasterHandle, &hmap, 0, FALSE, + DUPLICATE_SAME_ACCESS)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + /* + * Here, we remember the handle that we created in the + * postmaster process. This handle isn't actually usable in + * any process other than the postmaster, but that doesn't + * matter. We're just holding onto it so that, if the segment + * is unpinned, dsm_impl_unpin_segment can close it. + */ + *impl_private_pm_handle = hmap; + break; + } +#endif + default: + break; + } +} + +/* + * Implementation-specific actions that must be performed when a segment is no + * longer to be preserved, so that it will be cleaned up when all backends + * have detached from it. + * + * Except on Windows, we don't need to do anything at all. For Windows, we + * close the extra handle that dsm_impl_pin_segment created in the + * postmaster's process space. + */ +void +dsm_impl_unpin_segment(dsm_handle handle, void **impl_private) +{ + switch (dynamic_shared_memory_type) + { +#ifdef USE_DSM_WINDOWS + case DSM_IMPL_WINDOWS: + { + if (*impl_private && + !DuplicateHandle(PostmasterHandle, *impl_private, + NULL, NULL, 0, FALSE, + DUPLICATE_CLOSE_SOURCE)) + { + char name[64]; + + snprintf(name, 64, "%s.%u", SEGMENT_NAME_PREFIX, handle); + _dosmaperr(GetLastError()); + ereport(ERROR, + (errcode_for_dynamic_shared_memory(), + errmsg("could not duplicate handle for \"%s\": %m", + name))); + } + + *impl_private = NULL; + break; + } +#endif + default: + break; + } +} + +static int +errcode_for_dynamic_shared_memory(void) +{ + if (errno == EFBIG || errno == ENOMEM) + return errcode(ERRCODE_OUT_OF_MEMORY); + else + return errcode_for_file_access(); +} diff --git a/src/backend/storage/ipc/ipc.c b/src/backend/storage/ipc/ipc.c new file mode 100644 index 0000000..4045d7d --- /dev/null +++ b/src/backend/storage/ipc/ipc.c @@ -0,0 +1,435 @@ +/*------------------------------------------------------------------------- + * + * ipc.c + * POSTGRES inter-process communication definitions. + * + * This file is misnamed, as it no longer has much of anything directly + * to do with IPC. The functionality here is concerned with managing + * exit-time cleanup for either a postmaster or a backend. + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipc.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> +#include <sys/stat.h> + +#include "miscadmin.h" +#ifdef PROFILE_PID_DIR +#include "postmaster/autovacuum.h" +#endif +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "tcop/tcopprot.h" + + +/* + * This flag is set during proc_exit() to change ereport()'s behavior, + * so that an ereport() from an on_proc_exit routine cannot get us out + * of the exit procedure. We do NOT want to go back to the idle loop... + */ +bool proc_exit_inprogress = false; + +/* + * Set when shmem_exit() is in progress. 
+ */ +bool shmem_exit_inprogress = false; + +/* + * This flag tracks whether we've called atexit() in the current process + * (or in the parent postmaster). + */ +static bool atexit_callback_setup = false; + +/* local functions */ +static void proc_exit_prepare(int code); + + +/* ---------------------------------------------------------------- + * exit() handling stuff + * + * These functions are in generally the same spirit as atexit(), + * but provide some additional features we need --- in particular, + * we want to register callbacks to invoke when we are disconnecting + * from a broken shared-memory context but not exiting the postmaster. + * + * Callback functions can take zero, one, or two args: the first passed + * arg is the integer exitcode, the second is the Datum supplied when + * the callback was registered. + * ---------------------------------------------------------------- + */ + +#define MAX_ON_EXITS 20 + +struct ONEXIT +{ + pg_on_exit_callback function; + Datum arg; +}; + +static struct ONEXIT on_proc_exit_list[MAX_ON_EXITS]; +static struct ONEXIT on_shmem_exit_list[MAX_ON_EXITS]; +static struct ONEXIT before_shmem_exit_list[MAX_ON_EXITS]; + +static int on_proc_exit_index, + on_shmem_exit_index, + before_shmem_exit_index; + + +/* ---------------------------------------------------------------- + * proc_exit + * + * this function calls all the callbacks registered + * for it (to free resources) and then calls exit. + * + * This should be the only function to call exit(). + * -cim 2/6/90 + * + * Unfortunately, we can't really guarantee that add-on code + * obeys the rule of not calling exit() directly. So, while + * this is the preferred way out of the system, we also register + * an atexit callback that will make sure cleanup happens. + * ---------------------------------------------------------------- + */ +void +proc_exit(int code) +{ + /* Clean up everything that must be cleaned up */ + proc_exit_prepare(code); + +#ifdef PROFILE_PID_DIR + { + /* + * If we are profiling ourself then gprof's mcleanup() is about to + * write out a profile to ./gmon.out. Since mcleanup() always uses a + * fixed file name, each backend will overwrite earlier profiles. To + * fix that, we create a separate subdirectory for each backend + * (./gprof/pid) and 'cd' to that subdirectory before we exit() - that + * forces mcleanup() to write each profile into its own directory. We + * end up with something like: $PGDATA/gprof/8829/gmon.out + * $PGDATA/gprof/8845/gmon.out ... + * + * To avoid undesirable disk space bloat, autovacuum workers are + * discriminated against: all their gmon.out files go into the same + * subdirectory. Without this, an installation that is "just sitting + * there" nonetheless eats megabytes of disk space every few seconds. + * + * Note that we do this here instead of in an on_proc_exit() callback + * because we want to ensure that this code executes last - we don't + * want to interfere with any other on_proc_exit() callback. For the + * same reason, we do not include it in proc_exit_prepare ... so if + * you are exiting in the "wrong way" you won't drop your profile in a + * nice place. + */ + char gprofDirName[32]; + + if (IsAutoVacuumWorkerProcess()) + snprintf(gprofDirName, 32, "gprof/avworker"); + else + snprintf(gprofDirName, 32, "gprof/%d", (int) getpid()); + + /* + * Use mkdir() instead of MakePGDirectory() since we aren't making a + * PG directory here. 
+ */ + mkdir("gprof", S_IRWXU | S_IRWXG | S_IRWXO); + mkdir(gprofDirName, S_IRWXU | S_IRWXG | S_IRWXO); + chdir(gprofDirName); + } +#endif + + elog(DEBUG3, "exit(%d)", code); + + exit(code); +} + +/* + * Code shared between proc_exit and the atexit handler. Note that in + * normal exit through proc_exit, this will actually be called twice ... + * but the second call will have nothing to do. + */ +static void +proc_exit_prepare(int code) +{ + /* + * Once we set this flag, we are committed to exit. Any ereport() will + * NOT send control back to the main loop, but right back here. + */ + proc_exit_inprogress = true; + + /* + * Forget any pending cancel or die requests; we're doing our best to + * close up shop already. Note that the signal handlers will not set + * these flags again, now that proc_exit_inprogress is set. + */ + InterruptPending = false; + ProcDiePending = false; + QueryCancelPending = false; + InterruptHoldoffCount = 1; + CritSectionCount = 0; + + /* + * Also clear the error context stack, to prevent error callbacks from + * being invoked by any elog/ereport calls made during proc_exit. Whatever + * context they might want to offer is probably not relevant, and in any + * case they are likely to fail outright after we've done things like + * aborting any open transaction. (In normal exit scenarios the context + * stack should be empty anyway, but it might not be in the case of + * elog(FATAL) for example.) + */ + error_context_stack = NULL; + /* For the same reason, reset debug_query_string before it's clobbered */ + debug_query_string = NULL; + + /* do our shared memory exits first */ + shmem_exit(code); + + elog(DEBUG3, "proc_exit(%d): %d callbacks to make", + code, on_proc_exit_index); + + /* + * call all the registered callbacks. + * + * Note that since we decrement on_proc_exit_index each time, if a + * callback calls ereport(ERROR) or ereport(FATAL) then it won't be + * invoked again when control comes back here (nor will the + * previously-completed callbacks). So, an infinite loop should not be + * possible. + */ + while (--on_proc_exit_index >= 0) + on_proc_exit_list[on_proc_exit_index].function(code, + on_proc_exit_list[on_proc_exit_index].arg); + + on_proc_exit_index = 0; +} + +/* ------------------ + * Run all of the on_shmem_exit routines --- but don't actually exit. + * This is used by the postmaster to re-initialize shared memory and + * semaphores after a backend dies horribly. As with proc_exit(), we + * remove each callback from the list before calling it, to avoid + * infinite loop in case of error. + * ------------------ + */ +void +shmem_exit(int code) +{ + shmem_exit_inprogress = true; + + /* + * Call before_shmem_exit callbacks. + * + * These should be things that need most of the system to still be up and + * working, such as cleanup of temp relations, which requires catalog + * access; or things that need to be completed because later cleanup steps + * depend on them, such as releasing lwlocks. + */ + elog(DEBUG3, "shmem_exit(%d): %d before_shmem_exit callbacks to make", + code, before_shmem_exit_index); + while (--before_shmem_exit_index >= 0) + before_shmem_exit_list[before_shmem_exit_index].function(code, + before_shmem_exit_list[before_shmem_exit_index].arg); + before_shmem_exit_index = 0; + + /* + * Call dynamic shared memory callbacks. + * + * These serve the same purpose as late callbacks, but for dynamic shared + * memory segments rather than the main shared memory segment. 
+ * dsm_backend_shutdown() has the same kind of progressive logic we use + * for the main shared memory segment; namely, it unregisters each + * callback before invoking it, so that we don't get stuck in an infinite + * loop if one of those callbacks itself throws an ERROR or FATAL. + * + * Note that explicitly calling this function here is quite different from + * registering it as an on_shmem_exit callback for precisely this reason: + * if one dynamic shared memory callback errors out, the remaining + * callbacks will still be invoked. Thus, hard-coding this call puts it + * equal footing with callbacks for the main shared memory segment. + */ + dsm_backend_shutdown(); + + /* + * Call on_shmem_exit callbacks. + * + * These are generally releasing low-level shared memory resources. In + * some cases, this is a backstop against the possibility that the early + * callbacks might themselves fail, leading to re-entry to this routine; + * in other cases, it's cleanup that only happens at process exit. + */ + elog(DEBUG3, "shmem_exit(%d): %d on_shmem_exit callbacks to make", + code, on_shmem_exit_index); + while (--on_shmem_exit_index >= 0) + on_shmem_exit_list[on_shmem_exit_index].function(code, + on_shmem_exit_list[on_shmem_exit_index].arg); + on_shmem_exit_index = 0; + + shmem_exit_inprogress = false; +} + +/* ---------------------------------------------------------------- + * atexit_callback + * + * Backstop to ensure that direct calls of exit() don't mess us up. + * + * Somebody who was being really uncooperative could call _exit(), + * but for that case we have a "dead man switch" that will make the + * postmaster treat it as a crash --- see pmsignal.c. + * ---------------------------------------------------------------- + */ +static void +atexit_callback(void) +{ + /* Clean up everything that must be cleaned up */ + /* ... too bad we don't know the real exit code ... */ + proc_exit_prepare(-1); +} + +/* ---------------------------------------------------------------- + * on_proc_exit + * + * this function adds a callback function to the list of + * functions invoked by proc_exit(). -cim 2/6/90 + * ---------------------------------------------------------------- + */ +void +on_proc_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_proc_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_proc_exit slots"))); + + on_proc_exit_list[on_proc_exit_index].function = function; + on_proc_exit_list[on_proc_exit_index].arg = arg; + + ++on_proc_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * before_shmem_exit + * + * Register early callback to perform user-level cleanup, + * e.g. transaction abort, before we begin shutting down + * low-level subsystems. 
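+ *
+ * For example (an illustrative pattern only; the callback and state
+ * names are hypothetical), a caller needing transient cleanup would
+ * typically register and later cancel a callback in LIFO fashion:
+ *
+ * before_shmem_exit(my_cleanup, PointerGetDatum(state));
+ * ...
+ * cancel_before_shmem_exit(my_cleanup, PointerGetDatum(state));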
+ * ---------------------------------------------------------------- + */ +void +before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of before_shmem_exit slots"))); + + before_shmem_exit_list[before_shmem_exit_index].function = function; + before_shmem_exit_list[before_shmem_exit_index].arg = arg; + + ++before_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * on_shmem_exit + * + * Register ordinary callback to perform low-level shutdown + * (e.g. releasing our PGPROC); run after before_shmem_exit + * callbacks and before on_proc_exit callbacks. + * ---------------------------------------------------------------- + */ +void +on_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (on_shmem_exit_index >= MAX_ON_EXITS) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg_internal("out of on_shmem_exit slots"))); + + on_shmem_exit_list[on_shmem_exit_index].function = function; + on_shmem_exit_list[on_shmem_exit_index].arg = arg; + + ++on_shmem_exit_index; + + if (!atexit_callback_setup) + { + atexit(atexit_callback); + atexit_callback_setup = true; + } +} + +/* ---------------------------------------------------------------- + * cancel_before_shmem_exit + * + * this function removes a previously-registered before_shmem_exit + * callback. We only look at the latest entry for removal, as we + * expect callers to add and remove temporary before_shmem_exit + * callbacks in strict LIFO order. + * ---------------------------------------------------------------- + */ +void +cancel_before_shmem_exit(pg_on_exit_callback function, Datum arg) +{ + if (before_shmem_exit_index > 0 && + before_shmem_exit_list[before_shmem_exit_index - 1].function + == function && + before_shmem_exit_list[before_shmem_exit_index - 1].arg == arg) + --before_shmem_exit_index; + else + elog(ERROR, "before_shmem_exit callback (%p,0x%llx) is not the latest entry", + function, (long long) arg); +} + +/* ---------------------------------------------------------------- + * on_exit_reset + * + * this function clears all on_proc_exit() and on_shmem_exit() + * registered functions. This is used just after forking a backend, + * so that the backend doesn't believe it should call the postmaster's + * on-exit routines when it exits... + * ---------------------------------------------------------------- + */ +void +on_exit_reset(void) +{ + before_shmem_exit_index = 0; + on_shmem_exit_index = 0; + on_proc_exit_index = 0; + reset_on_dsm_detach(); +} + +/* ---------------------------------------------------------------- + * check_on_shmem_exit_lists_are_empty + * + * Debugging check that no shmem cleanup handlers have been registered + * prematurely in the current process. 
+ * ---------------------------------------------------------------- + */ +void +check_on_shmem_exit_lists_are_empty(void) +{ + if (before_shmem_exit_index) + elog(FATAL, "before_shmem_exit has been called prematurely"); + if (on_shmem_exit_index) + elog(FATAL, "on_shmem_exit has been called prematurely"); + /* Checking DSM detach state seems unnecessary given the above */ +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c new file mode 100644 index 0000000..3e4ec53 --- /dev/null +++ b/src/backend/storage/ipc/ipci.c @@ -0,0 +1,291 @@ +/*------------------------------------------------------------------------- + * + * ipci.c + * POSTGRES inter-process communication initialization code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/ipci.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "access/commit_ts.h" +#include "access/heapam.h" +#include "access/multixact.h" +#include "access/nbtree.h" +#include "access/subtrans.h" +#include "access/syncscan.h" +#include "access/twophase.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/autovacuum.h" +#include "postmaster/bgworker_internals.h" +#include "postmaster/bgwriter.h" +#include "postmaster/postmaster.h" +#include "replication/logicallauncher.h" +#include "replication/origin.h" +#include "replication/slot.h" +#include "replication/walreceiver.h" +#include "replication/walsender.h" +#include "storage/bufmgr.h" +#include "storage/dsm.h" +#include "storage/ipc.h" +#include "storage/pg_shmem.h" +#include "storage/pmsignal.h" +#include "storage/predicate.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/procsignal.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" +#include "utils/snapmgr.h" + +/* GUCs */ +int shared_memory_type = DEFAULT_SHARED_MEMORY_TYPE; + +shmem_startup_hook_type shmem_startup_hook = NULL; + +static Size total_addin_request = 0; +static bool addin_request_allowed = true; + + +/* + * RequestAddinShmemSpace + * Request that extra shmem space be allocated for use by + * a loadable module. + * + * This is only useful if called from the _PG_init hook of a library that + * is loaded into the postmaster via shared_preload_libraries. Once + * shared memory has been allocated, calls will be ignored. (We could + * raise an error, but it seems better to make it a no-op, so that + * libraries containing such calls can be reloaded if needed.) + */ +void +RequestAddinShmemSpace(Size size) +{ + if (IsUnderPostmaster || !addin_request_allowed) + return; /* too late */ + total_addin_request = add_size(total_addin_request, size); +} + + +/* + * CreateSharedMemoryAndSemaphores + * Creates and initializes shared memory and semaphores. + * + * This is called by the postmaster or by a standalone backend. + * It is also called by a backend forked from the postmaster in the + * EXEC_BACKEND case. In the latter case, the shared memory segment + * already exists and has been physically attached to, but we have to + * initialize pointers in local memory that reference the shared structures, + * because we didn't inherit the correct pointer values from the postmaster + * as we do in the fork() scenario. The easiest way to do that is to run + * through the same code as before. 
(Note that the called routines mostly + * check IsUnderPostmaster, rather than EXEC_BACKEND, to detect this case. + * This is a bit code-wasteful and could be cleaned up.) + */ +void +CreateSharedMemoryAndSemaphores(void) +{ + PGShmemHeader *shim = NULL; + + if (!IsUnderPostmaster) + { + PGShmemHeader *seghdr; + Size size; + int numSemas; + + /* Compute number of semaphores we'll need */ + numSemas = ProcGlobalSemas(); + numSemas += SpinlockSemas(); + + /* + * Size of the Postgres shared-memory block is estimated via + * moderately-accurate estimates for the big hogs, plus 100K for the + * stuff that's too small to bother with estimating. + * + * We take some care during this phase to ensure that the total size + * request doesn't overflow size_t. If this gets through, we don't + * need to be so careful during the actual allocation phase. + */ + size = 100000; + size = add_size(size, PGSemaphoreShmemSize(numSemas)); + size = add_size(size, SpinlockSemaSize()); + size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE, + sizeof(ShmemIndexEnt))); + size = add_size(size, dsm_estimate_size()); + size = add_size(size, BufferShmemSize()); + size = add_size(size, LockShmemSize()); + size = add_size(size, PredicateLockShmemSize()); + size = add_size(size, ProcGlobalShmemSize()); + size = add_size(size, XLOGShmemSize()); + size = add_size(size, CLOGShmemSize()); + size = add_size(size, CommitTsShmemSize()); + size = add_size(size, SUBTRANSShmemSize()); + size = add_size(size, TwoPhaseShmemSize()); + size = add_size(size, BackgroundWorkerShmemSize()); + size = add_size(size, MultiXactShmemSize()); + size = add_size(size, LWLockShmemSize()); + size = add_size(size, ProcArrayShmemSize()); + size = add_size(size, BackendStatusShmemSize()); + size = add_size(size, SInvalShmemSize()); + size = add_size(size, PMSignalShmemSize()); + size = add_size(size, ProcSignalShmemSize()); + size = add_size(size, CheckpointerShmemSize()); + size = add_size(size, AutoVacuumShmemSize()); + size = add_size(size, ReplicationSlotsShmemSize()); + size = add_size(size, ReplicationOriginShmemSize()); + size = add_size(size, WalSndShmemSize()); + size = add_size(size, WalRcvShmemSize()); + size = add_size(size, PgArchShmemSize()); + size = add_size(size, ApplyLauncherShmemSize()); + size = add_size(size, SnapMgrShmemSize()); + size = add_size(size, BTreeShmemSize()); + size = add_size(size, SyncScanShmemSize()); + size = add_size(size, AsyncShmemSize()); +#ifdef EXEC_BACKEND + size = add_size(size, ShmemBackendArraySize()); +#endif + + /* freeze the addin request size and include it */ + addin_request_allowed = false; + size = add_size(size, total_addin_request); + + /* might as well round it off to a multiple of a typical page size */ + size = add_size(size, 8192 - (size % 8192)); + + elog(DEBUG3, "invoking IpcMemoryCreate(size=%zu)", size); + + /* + * Create the shmem segment + */ + seghdr = PGSharedMemoryCreate(size, &shim); + + InitShmemAccess(seghdr); + + /* + * Create semaphores + */ + PGReserveSemaphores(numSemas); + + /* + * If spinlocks are disabled, initialize emulation layer (which + * depends on semaphores, so the order is important here). + */ +#ifndef HAVE_SPINLOCKS + SpinlockSemaInit(); +#endif + } + else + { + /* + * We are reattaching to an existing shared memory segment. This + * should only be reached in the EXEC_BACKEND case. 
+ */ +#ifndef EXEC_BACKEND + elog(PANIC, "should be attached to shared memory already"); +#endif + } + + /* + * Set up shared memory allocation mechanism + */ + if (!IsUnderPostmaster) + InitShmemAllocation(); + + /* + * Now initialize LWLocks, which do shared memory allocation and are + * needed for InitShmemIndex. + */ + CreateLWLocks(); + + /* + * Set up shmem.c index hashtable + */ + InitShmemIndex(); + + dsm_shmem_init(); + + /* + * Set up xlog, clog, and buffers + */ + XLOGShmemInit(); + CLOGShmemInit(); + CommitTsShmemInit(); + SUBTRANSShmemInit(); + MultiXactShmemInit(); + InitBufferPool(); + + /* + * Set up lock manager + */ + InitLocks(); + + /* + * Set up predicate lock manager + */ + InitPredicateLocks(); + + /* + * Set up process table + */ + if (!IsUnderPostmaster) + InitProcGlobal(); + CreateSharedProcArray(); + CreateSharedBackendStatus(); + TwoPhaseShmemInit(); + BackgroundWorkerShmemInit(); + + /* + * Set up shared-inval messaging + */ + CreateSharedInvalidationState(); + + /* + * Set up interprocess signaling mechanisms + */ + PMSignalShmemInit(); + ProcSignalShmemInit(); + CheckpointerShmemInit(); + AutoVacuumShmemInit(); + ReplicationSlotsShmemInit(); + ReplicationOriginShmemInit(); + WalSndShmemInit(); + WalRcvShmemInit(); + PgArchShmemInit(); + ApplyLauncherShmemInit(); + + /* + * Set up other modules that need some shared memory space + */ + SnapMgrInit(); + BTreeShmemInit(); + SyncScanShmemInit(); + AsyncShmemInit(); + +#ifdef EXEC_BACKEND + + /* + * Alloc the win32 shared backend array + */ + if (!IsUnderPostmaster) + ShmemBackendArrayAllocation(); +#endif + + /* Initialize dynamic shared memory facilities. */ + if (!IsUnderPostmaster) + dsm_postmaster_startup(shim); + + /* + * Now give loadable modules a chance to set up their shmem allocations + */ + if (shmem_startup_hook) + shmem_startup_hook(); +} diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c new file mode 100644 index 0000000..3427bcf --- /dev/null +++ b/src/backend/storage/ipc/latch.c @@ -0,0 +1,2158 @@ +/*------------------------------------------------------------------------- + * + * latch.c + * Routines for inter-process latches + * + * The poll() implementation uses the so-called self-pipe trick to overcome the + * race condition involved with poll() and setting a global flag in the signal + * handler. When a latch is set and the current process is waiting for it, the + * signal handler wakes up the poll() in WaitLatch by writing a byte to a pipe. + * A signal by itself doesn't interrupt poll() on all platforms, and even on + * platforms where it does, a signal that arrives just before the poll() call + * does not prevent poll() from entering sleep. An incoming byte on a pipe + * however reliably interrupts the sleep, and causes poll() to return + * immediately even if the signal arrives before poll() begins. + * + * The epoll() implementation overcomes the race with a different technique: it + * keeps SIGURG blocked and consumes from a signalfd() descriptor instead. We + * don't need to register a signal handler or create our own self-pipe. We + * assume that any system that has Linux epoll() also has Linux signalfd(). + * + * The kqueue() implementation waits for SIGURG with EVFILT_SIGNAL. + * + * The Windows implementation uses Windows events that are inherited by all + * postmaster child processes. There's no need for the self-pipe trick there. 
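+ *
+ * As a rough sketch of the self-pipe technique (illustrative only; the
+ * actual handler and drain logic below also preserve errno and cope with
+ * EINTR and a full pipe buffer), the SIGURG handler does little more than
+ *
+ * if (waiting)
+ * write(selfpipe_writefd, "", 1);
+ *
+ * while the wait loop includes selfpipe_readfd among the descriptors it
+ * polls and drains it whenever it becomes readable.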
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/latch.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <fcntl.h> +#include <limits.h> +#include <signal.h> +#include <unistd.h> +#ifdef HAVE_SYS_EPOLL_H +#include <sys/epoll.h> +#endif +#ifdef HAVE_SYS_EVENT_H +#include <sys/event.h> +#endif +#ifdef HAVE_SYS_SIGNALFD_H +#include <sys/signalfd.h> +#endif +#ifdef HAVE_POLL_H +#include <poll.h> +#endif + +#include "libpq/pqsignal.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "port/atomics.h" +#include "portability/instr_time.h" +#include "postmaster/postmaster.h" +#include "storage/fd.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" +#include "utils/memutils.h" + +/* + * Select the fd readiness primitive to use. Normally the "most modern" + * primitive supported by the OS will be used, but for testing it can be + * useful to manually specify the used primitive. If desired, just add a + * define somewhere before this block. + */ +#if defined(WAIT_USE_EPOLL) || defined(WAIT_USE_POLL) || \ + defined(WAIT_USE_KQUEUE) || defined(WAIT_USE_WIN32) +/* don't overwrite manual choice */ +#elif defined(HAVE_SYS_EPOLL_H) +#define WAIT_USE_EPOLL +#elif defined(HAVE_KQUEUE) +#define WAIT_USE_KQUEUE +#elif defined(HAVE_POLL) +#define WAIT_USE_POLL +#elif WIN32 +#define WAIT_USE_WIN32 +#else +#error "no wait set implementation available" +#endif + +/* + * By default, we use a self-pipe with poll() and a signalfd with epoll(), if + * available. We avoid signalfd on illumos for now based on problem reports. + * For testing the choice can also be manually specified. + */ +#if defined(WAIT_USE_POLL) || defined(WAIT_USE_EPOLL) +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +/* don't overwrite manual choice */ +#elif defined(WAIT_USE_EPOLL) && defined(HAVE_SYS_SIGNALFD_H) && \ + !defined(__illumos__) +#define WAIT_USE_SIGNALFD +#else +#define WAIT_USE_SELF_PIPE +#endif +#endif + +/* typedef in latch.h */ +struct WaitEventSet +{ + int nevents; /* number of registered events */ + int nevents_space; /* maximum number of events in this set */ + + /* + * Array, of nevents_space length, storing the definition of events this + * set is waiting for. + */ + WaitEvent *events; + + /* + * If WL_LATCH_SET is specified in any wait event, latch is a pointer to + * said latch, and latch_pos the offset in the ->events array. This is + * useful because we check the state of the latch before performing doing + * syscalls related to waiting. + */ + Latch *latch; + int latch_pos; + + /* + * WL_EXIT_ON_PM_DEATH is converted to WL_POSTMASTER_DEATH, but this flag + * is set so that we'll exit immediately if postmaster death is detected, + * instead of returning. 
+ */ + bool exit_on_postmaster_death; + +#if defined(WAIT_USE_EPOLL) + int epoll_fd; + /* epoll_wait returns events in a user provided arrays, allocate once */ + struct epoll_event *epoll_ret_events; +#elif defined(WAIT_USE_KQUEUE) + int kqueue_fd; + /* kevent returns events in a user provided arrays, allocate once */ + struct kevent *kqueue_ret_events; + bool report_postmaster_not_running; +#elif defined(WAIT_USE_POLL) + /* poll expects events to be waited on every poll() call, prepare once */ + struct pollfd *pollfds; +#elif defined(WAIT_USE_WIN32) + + /* + * Array of windows events. The first element always contains + * pgwin32_signal_event, so the remaining elements are offset by one (i.e. + * event->pos + 1). + */ + HANDLE *handles; +#endif +}; + +/* A common WaitEventSet used to implement WatchLatch() */ +static WaitEventSet *LatchWaitSet; + +/* The position of the latch in LatchWaitSet. */ +#define LatchWaitSetLatchPos 0 + +#ifndef WIN32 +/* Are we currently in WaitLatch? The signal handler would like to know. */ +static volatile sig_atomic_t waiting = false; +#endif + +#ifdef WAIT_USE_SIGNALFD +/* On Linux, we'll receive SIGURG via a signalfd file descriptor. */ +static int signal_fd = -1; +#endif + +#ifdef WAIT_USE_SELF_PIPE +/* Read and write ends of the self-pipe */ +static int selfpipe_readfd = -1; +static int selfpipe_writefd = -1; + +/* Process owning the self-pipe --- needed for checking purposes */ +static int selfpipe_owner_pid = 0; + +/* Private function prototypes */ +static void latch_sigurg_handler(SIGNAL_ARGS); +static void sendSelfPipeByte(void); +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) +static void drain(void); +#endif + +#if defined(WAIT_USE_EPOLL) +static void WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action); +#elif defined(WAIT_USE_KQUEUE) +static void WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events); +#elif defined(WAIT_USE_POLL) +static void WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event); +#elif defined(WAIT_USE_WIN32) +static void WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event); +#endif + +static inline int WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents); + +/* + * Initialize the process-local latch infrastructure. + * + * This must be called once during startup of any process that can wait on + * latches, before it issues any InitLatch() or OwnLatch() calls. + */ +void +InitializeLatchSupport(void) +{ +#if defined(WAIT_USE_SELF_PIPE) + int pipefd[2]; + + if (IsUnderPostmaster) + { + /* + * We might have inherited connections to a self-pipe created by the + * postmaster. It's critical that child processes create their own + * self-pipes, of course, and we really want them to close the + * inherited FDs for safety's sake. + */ + if (selfpipe_owner_pid != 0) + { + /* Assert we go through here but once in a child process */ + Assert(selfpipe_owner_pid != MyProcPid); + /* Release postmaster's pipe FDs; ignore any error */ + (void) close(selfpipe_readfd); + (void) close(selfpipe_writefd); + /* Clean up, just for safety's sake; we'll set these below */ + selfpipe_readfd = selfpipe_writefd = -1; + selfpipe_owner_pid = 0; + /* Keep fd.c's accounting straight */ + ReleaseExternalFD(); + ReleaseExternalFD(); + } + else + { + /* + * Postmaster didn't create a self-pipe ... 
or else we're in an + * EXEC_BACKEND build, in which case it doesn't matter since the + * postmaster's pipe FDs were closed by the action of FD_CLOEXEC. + * fd.c won't have state to clean up, either. + */ + Assert(selfpipe_readfd == -1); + } + } + else + { + /* In postmaster or standalone backend, assert we do this but once */ + Assert(selfpipe_readfd == -1); + Assert(selfpipe_owner_pid == 0); + } + + /* + * Set up the self-pipe that allows a signal handler to wake up the + * poll()/epoll_wait() in WaitLatch. Make the write-end non-blocking, so + * that SetLatch won't block if the event has already been set many times + * filling the kernel buffer. Make the read-end non-blocking too, so that + * we can easily clear the pipe by reading until EAGAIN or EWOULDBLOCK. + * Also, make both FDs close-on-exec, since we surely do not want any + * child processes messing with them. + */ + if (pipe(pipefd) < 0) + elog(FATAL, "pipe() failed: %m"); + if (fcntl(pipefd[0], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFL, O_NONBLOCK) == -1) + elog(FATAL, "fcntl(F_SETFL) failed on write-end of self-pipe: %m"); + if (fcntl(pipefd[0], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on read-end of self-pipe: %m"); + if (fcntl(pipefd[1], F_SETFD, FD_CLOEXEC) == -1) + elog(FATAL, "fcntl(F_SETFD) failed on write-end of self-pipe: %m"); + + selfpipe_readfd = pipefd[0]; + selfpipe_writefd = pipefd[1]; + selfpipe_owner_pid = MyProcPid; + + /* Tell fd.c about these two long-lived FDs */ + ReserveExternalFD(); + ReserveExternalFD(); + + pqsignal(SIGURG, latch_sigurg_handler); +#endif + +#ifdef WAIT_USE_SIGNALFD + sigset_t signalfd_mask; + + /* Block SIGURG, because we'll receive it through a signalfd. */ + sigaddset(&UnBlockSig, SIGURG); + + /* Set up the signalfd to receive SIGURG notifications. */ + sigemptyset(&signalfd_mask); + sigaddset(&signalfd_mask, SIGURG); + signal_fd = signalfd(-1, &signalfd_mask, SFD_NONBLOCK | SFD_CLOEXEC); + if (signal_fd < 0) + elog(FATAL, "signalfd() failed"); + ReserveExternalFD(); +#endif + +#ifdef WAIT_USE_KQUEUE + /* Ignore SIGURG, because we'll receive it via kqueue. */ + pqsignal(SIGURG, SIG_IGN); +#endif +} + +void +InitializeLatchWaitSet(void) +{ + int latch_pos PG_USED_FOR_ASSERTS_ONLY; + + Assert(LatchWaitSet == NULL); + + /* Set up the WaitEventSet used by WaitLatch(). */ + LatchWaitSet = CreateWaitEventSet(TopMemoryContext, 2); + latch_pos = AddWaitEventToSet(LatchWaitSet, WL_LATCH_SET, PGINVALID_SOCKET, + MyLatch, NULL); + if (IsUnderPostmaster) + AddWaitEventToSet(LatchWaitSet, WL_EXIT_ON_PM_DEATH, + PGINVALID_SOCKET, NULL, NULL); + + Assert(latch_pos == LatchWaitSetLatchPos); +} + +void +ShutdownLatchSupport(void) +{ +#if defined(WAIT_USE_POLL) + pqsignal(SIGURG, SIG_IGN); +#endif + + if (LatchWaitSet) + { + FreeWaitEventSet(LatchWaitSet); + LatchWaitSet = NULL; + } + +#if defined(WAIT_USE_SELF_PIPE) + close(selfpipe_readfd); + close(selfpipe_writefd); + selfpipe_readfd = -1; + selfpipe_writefd = -1; + selfpipe_owner_pid = InvalidPid; +#endif + +#if defined(WAIT_USE_SIGNALFD) + close(signal_fd); + signal_fd = -1; +#endif +} + +/* + * Initialize a process-local latch. 
+ */ +void +InitLatch(Latch *latch) +{ + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = MyProcPid; + latch->is_shared = false; + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#elif defined(WAIT_USE_WIN32) + latch->event = CreateEvent(NULL, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif /* WIN32 */ +} + +/* + * Initialize a shared latch that can be set from other processes. The latch + * is initially owned by no-one; use OwnLatch to associate it with the + * current process. + * + * InitSharedLatch needs to be called in postmaster before forking child + * processes, usually right after allocating the shared memory block + * containing the latch with ShmemInitStruct. (The Unix implementation + * doesn't actually require that, but the Windows one does.) Because of + * this restriction, we have no concurrency issues to worry about here. + * + * Note that other handles created in this module are never marked as + * inheritable. Thus we do not need to worry about cleaning up child + * process references to postmaster-private latches or WaitEventSets. + */ +void +InitSharedLatch(Latch *latch) +{ +#ifdef WIN32 + SECURITY_ATTRIBUTES sa; + + /* + * Set up security attributes to specify that the events are inherited. + */ + ZeroMemory(&sa, sizeof(sa)); + sa.nLength = sizeof(sa); + sa.bInheritHandle = TRUE; + + latch->event = CreateEvent(&sa, TRUE, FALSE, NULL); + if (latch->event == NULL) + elog(ERROR, "CreateEvent failed: error code %lu", GetLastError()); +#endif + + latch->is_set = false; + latch->maybe_sleeping = false; + latch->owner_pid = 0; + latch->is_shared = true; +} + +/* + * Associate a shared latch with the current process, allowing it to + * wait on the latch. + * + * Although there is a sanity check for latch-already-owned, we don't do + * any sort of locking here, meaning that we could fail to detect the error + * if two processes try to own the same latch at about the same time. If + * there is any risk of that, caller must provide an interlock to prevent it. + */ +void +OwnLatch(Latch *latch) +{ + /* Sanity checks */ + Assert(latch->is_shared); + +#if defined(WAIT_USE_SELF_PIPE) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(selfpipe_readfd >= 0 && selfpipe_owner_pid == MyProcPid); +#elif defined(WAIT_USE_SIGNALFD) + /* Assert InitializeLatchSupport has been called in this process */ + Assert(signal_fd >= 0); +#endif + + if (latch->owner_pid != 0) + elog(ERROR, "latch already owned"); + + latch->owner_pid = MyProcPid; +} + +/* + * Disown a shared latch currently owned by the current process. + */ +void +DisownLatch(Latch *latch) +{ + Assert(latch->is_shared); + Assert(latch->owner_pid == MyProcPid); + + latch->owner_pid = 0; +} + +/* + * Wait for a given latch to be set, or for postmaster death, or until timeout + * is exceeded. 'wakeEvents' is a bitmask that specifies which of those events + * to wait for. If the latch is already set (and WL_LATCH_SET is given), the + * function returns immediately. + * + * The "timeout" is given in milliseconds. It must be >= 0 if WL_TIMEOUT flag + * is given. 
Although it is declared as "long", we don't actually support + * timeouts longer than INT_MAX milliseconds. Note that some extra overhead + * is incurred when WL_TIMEOUT is given, so avoid using a timeout if possible. + * + * The latch must be owned by the current process, ie. it must be a + * process-local latch initialized with InitLatch, or a shared latch + * associated with the current process by calling OwnLatch. + * + * Returns bit mask indicating which condition(s) caused the wake-up. Note + * that if multiple wake-up conditions are true, there is no guarantee that + * we return all of them in one call, but we will return at least one. + */ +int +WaitLatch(Latch *latch, int wakeEvents, long timeout, + uint32 wait_event_info) +{ + WaitEvent event; + + /* Postmaster-managed callers must handle postmaster death somehow. */ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + /* + * Some callers may have a latch other than MyLatch, or no latch at all, + * or want to handle postmaster death differently. It's cheap to assign + * those, so just do it every time. + */ + if (!(wakeEvents & WL_LATCH_SET)) + latch = NULL; + ModifyWaitEvent(LatchWaitSet, LatchWaitSetLatchPos, WL_LATCH_SET, latch); + LatchWaitSet->exit_on_postmaster_death = + ((wakeEvents & WL_EXIT_ON_PM_DEATH) != 0); + + if (WaitEventSetWait(LatchWaitSet, + (wakeEvents & WL_TIMEOUT) ? timeout : -1, + &event, 1, + wait_event_info) == 0) + return WL_TIMEOUT; + else + return event.events; +} + +/* + * Like WaitLatch, but with an extra socket argument for WL_SOCKET_* + * conditions. + * + * When waiting on a socket, EOF and error conditions always cause the socket + * to be reported as readable/writable/connected, so that the caller can deal + * with the condition. + * + * wakeEvents must include either WL_EXIT_ON_PM_DEATH for automatic exit + * if the postmaster dies or WL_POSTMASTER_DEATH for a flag set in the + * return value if the postmaster dies. The latter is useful for rare cases + * where some behavior other than immediate exit is needed. + * + * NB: These days this is just a wrapper around the WaitEventSet API. When + * using a latch very frequently, consider creating a longer living + * WaitEventSet instead; that's more efficient. + */ +int +WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock, + long timeout, uint32 wait_event_info) +{ + int ret = 0; + int rc; + WaitEvent event; + WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3); + + if (wakeEvents & WL_TIMEOUT) + Assert(timeout >= 0); + else + timeout = -1; + + if (wakeEvents & WL_LATCH_SET) + AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, + latch, NULL); + + /* Postmaster-managed callers must handle postmaster death somehow. 
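+	 * A typical call therefore passes something like
+	 * WL_LATCH_SET | WL_SOCKET_READABLE | WL_EXIT_ON_PM_DEATH in wakeEvents.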
*/ + Assert(!IsUnderPostmaster || + (wakeEvents & WL_EXIT_ON_PM_DEATH) || + (wakeEvents & WL_POSTMASTER_DEATH)); + + if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster) + AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET, + NULL, NULL); + + if (wakeEvents & WL_SOCKET_MASK) + { + int ev; + + ev = wakeEvents & WL_SOCKET_MASK; + AddWaitEventToSet(set, ev, sock, NULL, NULL); + } + + rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info); + + if (rc == 0) + ret |= WL_TIMEOUT; + else + { + ret |= event.events & (WL_LATCH_SET | + WL_POSTMASTER_DEATH | + WL_SOCKET_MASK); + } + + FreeWaitEventSet(set); + + return ret; +} + +/* + * Sets a latch and wakes up anyone waiting on it. + * + * This is cheap if the latch is already set, otherwise not so much. + * + * NB: when calling this in a signal handler, be sure to save and restore + * errno around it. (That's standard practice in most signal handlers, of + * course, but we used to omit it in handlers that only set a flag.) + * + * NB: this function is called from critical sections and signal handlers so + * throwing an error is not a good idea. + */ +void +SetLatch(Latch *latch) +{ +#ifndef WIN32 + pid_t owner_pid; +#else + HANDLE handle; +#endif + + /* + * The memory barrier has to be placed here to ensure that any flag + * variables possibly changed by this process have been flushed to main + * memory, before we check/set is_set. + */ + pg_memory_barrier(); + + /* Quick exit if already set */ + if (latch->is_set) + return; + + latch->is_set = true; + + pg_memory_barrier(); + if (!latch->maybe_sleeping) + return; + +#ifndef WIN32 + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. We use the self-pipe or SIGURG to ourselves + * to wake up WaitEventSetWaitBlock() without races in that case. If it's + * another process, send a signal. + * + * Fetch owner_pid only once, in case the latch is concurrently getting + * owned or disowned. XXX: This assumes that pid_t is atomic, which isn't + * guaranteed to be true! In practice, the effective range of pid_t fits + * in a 32 bit integer, and so should be atomic. In the worst case, we + * might end up signaling the wrong process. Even then, you're very + * unlucky if a process with that bogus pid exists and belongs to + * Postgres; and PG database processes should handle excess SIGUSR1 + * interrupts without a problem anyhow. + * + * Another sort of race condition that's possible here is for a new + * process to own the latch immediately after we look, so we don't signal + * it. This is okay so long as all callers of ResetLatch/WaitLatch follow + * the standard coding convention of waiting at the bottom of their loops, + * not the top, so that they'll correctly process latch-setting events + * that happen before they enter the loop. + */ + owner_pid = latch->owner_pid; + if (owner_pid == 0) + return; + else if (owner_pid == MyProcPid) + { +#if defined(WAIT_USE_SELF_PIPE) + if (waiting) + sendSelfPipeByte(); +#else + if (waiting) + kill(MyProcPid, SIGURG); +#endif + } + else + kill(owner_pid, SIGURG); + +#else + + /* + * See if anyone's waiting for the latch. It can be the current process if + * we're in a signal handler. + * + * Use a local variable here just in case somebody changes the event field + * concurrently (which really should not happen). 
+ */ + handle = latch->event; + if (handle) + { + SetEvent(handle); + + /* + * Note that we silently ignore any errors. We might be in a signal + * handler or other critical path where it's not safe to call elog(). + */ + } +#endif + +} + +/* + * Clear the latch. Calling WaitLatch after this will sleep, unless + * the latch is set again before the WaitLatch call. + */ +void +ResetLatch(Latch *latch) +{ + /* Only the owner should reset the latch */ + Assert(latch->owner_pid == MyProcPid); + Assert(latch->maybe_sleeping == false); + + latch->is_set = false; + + /* + * Ensure that the write to is_set gets flushed to main memory before we + * examine any flag variables. Otherwise a concurrent SetLatch might + * falsely conclude that it needn't signal us, even though we have missed + * seeing some flag updates that SetLatch was supposed to inform us of. + */ + pg_memory_barrier(); +} + +/* + * Create a WaitEventSet with space for nevents different events to wait for. + * + * These events can then be efficiently waited upon together, using + * WaitEventSetWait(). + */ +WaitEventSet * +CreateWaitEventSet(MemoryContext context, int nevents) +{ + WaitEventSet *set; + char *data; + Size sz = 0; + + /* + * Use MAXALIGN size/alignment to guarantee that later uses of memory are + * aligned correctly. E.g. epoll_event might need 8 byte alignment on some + * platforms, but earlier allocations like WaitEventSet and WaitEvent + * might not be sized to guarantee that when purely using sizeof(). + */ + sz += MAXALIGN(sizeof(WaitEventSet)); + sz += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + sz += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + sz += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + sz += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + /* need space for the pgwin32_signal_event */ + sz += MAXALIGN(sizeof(HANDLE) * (nevents + 1)); +#endif + + data = (char *) MemoryContextAllocZero(context, sz); + + set = (WaitEventSet *) data; + data += MAXALIGN(sizeof(WaitEventSet)); + + set->events = (WaitEvent *) data; + data += MAXALIGN(sizeof(WaitEvent) * nevents); + +#if defined(WAIT_USE_EPOLL) + set->epoll_ret_events = (struct epoll_event *) data; + data += MAXALIGN(sizeof(struct epoll_event) * nevents); +#elif defined(WAIT_USE_KQUEUE) + set->kqueue_ret_events = (struct kevent *) data; + data += MAXALIGN(sizeof(struct kevent) * nevents); +#elif defined(WAIT_USE_POLL) + set->pollfds = (struct pollfd *) data; + data += MAXALIGN(sizeof(struct pollfd) * nevents); +#elif defined(WAIT_USE_WIN32) + set->handles = (HANDLE) data; + data += MAXALIGN(sizeof(HANDLE) * nevents); +#endif + + set->latch = NULL; + set->nevents_space = nevents; + set->exit_on_postmaster_death = false; + +#if defined(WAIT_USE_EPOLL) + if (!AcquireExternalFD()) + { + /* treat this as though epoll_create1 itself returned EMFILE */ + elog(ERROR, "epoll_create1 failed: %m"); + } + set->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (set->epoll_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "epoll_create1 failed: %m"); + } +#elif defined(WAIT_USE_KQUEUE) + if (!AcquireExternalFD()) + { + /* treat this as though kqueue itself returned EMFILE */ + elog(ERROR, "kqueue failed: %m"); + } + set->kqueue_fd = kqueue(); + if (set->kqueue_fd < 0) + { + ReleaseExternalFD(); + elog(ERROR, "kqueue failed: %m"); + } + if (fcntl(set->kqueue_fd, F_SETFD, FD_CLOEXEC) == -1) + { + int save_errno = errno; + + close(set->kqueue_fd); + 
ReleaseExternalFD(); + errno = save_errno; + elog(ERROR, "fcntl(F_SETFD) failed on kqueue descriptor: %m"); + } + set->report_postmaster_not_running = false; +#elif defined(WAIT_USE_WIN32) + + /* + * To handle signals while waiting, we need to add a win32 specific event. + * We accounted for the additional event at the top of this routine. See + * port/win32/signal.c for more details. + * + * Note: pgwin32_signal_event should be first to ensure that it will be + * reported when multiple events are set. We want to guarantee that + * pending signals are serviced. + */ + set->handles[0] = pgwin32_signal_event; + StaticAssertStmt(WSA_INVALID_EVENT == NULL, ""); +#endif + + return set; +} + +/* + * Free a previously created WaitEventSet. + * + * Note: preferably, this shouldn't have to free any resources that could be + * inherited across an exec(). If it did, we'd likely leak those resources in + * many scenarios. For the epoll case, we ensure that by setting EPOLL_CLOEXEC + * when the FD is created. For the Windows case, we assume that the handles + * involved are non-inheritable. + */ +void +FreeWaitEventSet(WaitEventSet *set) +{ +#if defined(WAIT_USE_EPOLL) + close(set->epoll_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_KQUEUE) + close(set->kqueue_fd); + ReleaseExternalFD(); +#elif defined(WAIT_USE_WIN32) + WaitEvent *cur_event; + + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->events & WL_LATCH_SET) + { + /* uses the latch's HANDLE */ + } + else if (cur_event->events & WL_POSTMASTER_DEATH) + { + /* uses PostmasterHandle */ + } + else + { + /* Clean up the event object we created for the socket */ + WSAEventSelect(cur_event->fd, NULL, 0); + WSACloseEvent(set->handles[cur_event->pos + 1]); + } + } +#endif + + pfree(set); +} + +/* --- + * Add an event to the set. Possible events are: + * - WL_LATCH_SET: Wait for the latch to be set + * - WL_POSTMASTER_DEATH: Wait for postmaster to die + * - WL_SOCKET_READABLE: Wait for socket to become readable, + * can be combined in one event with other WL_SOCKET_* events + * - WL_SOCKET_WRITEABLE: Wait for socket to become writeable, + * can be combined with other WL_SOCKET_* events + * - WL_SOCKET_CONNECTED: Wait for socket connection to be established, + * can be combined with other WL_SOCKET_* events (on non-Windows + * platforms, this is the same as WL_SOCKET_WRITEABLE) + * - WL_EXIT_ON_PM_DEATH: Exit immediately if the postmaster dies + * + * Returns the offset in WaitEventSet->events (starting from 0), which can be + * used to modify previously added wait events using ModifyWaitEvent(). + * + * In the WL_LATCH_SET case the latch must be owned by the current process, + * i.e. it must be a process-local latch initialized with InitLatch, or a + * shared latch associated with the current process by calling OwnLatch. + * + * In the WL_SOCKET_READABLE/WRITEABLE/CONNECTED cases, EOF and error + * conditions cause the socket to be reported as readable/writable/connected, + * so that the caller can deal with the condition. + * + * The user_data pointer specified here will be set for the events returned + * by WaitEventSetWait(), allowing to easily associate additional data with + * events. 
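+ *
+ * As a rough usage sketch (client_sock, process_notifications(),
+ * handle_client_io() and the choice of PG_WAIT_EXTENSION are illustrative,
+ * not part of this API), a long-lived set watching the process latch,
+ * postmaster death and one socket might be built and used like this:
+ *
+ *		WaitEventSet *set = CreateWaitEventSet(TopMemoryContext, 3);
+ *		WaitEvent	event;
+ *
+ *		AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);
+ *		AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ *						  NULL, NULL);
+ *		AddWaitEventToSet(set, WL_SOCKET_READABLE, client_sock, NULL, NULL);
+ *
+ *		for (;;)
+ *		{
+ *			(void) WaitEventSetWait(set, -1, &event, 1, PG_WAIT_EXTENSION);
+ *			if (event.events & WL_LATCH_SET)
+ *			{
+ *				ResetLatch(MyLatch);
+ *				process_notifications();
+ *			}
+ *			if (event.events & WL_SOCKET_READABLE)
+ *				handle_client_io(event.fd);
+ *		}
+ *
+ * Reusing one set this way avoids re-registering the descriptors with the
+ * kernel on every wait, which is what makes it cheaper than repeated
+ * WaitLatchOrSocket() calls.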
+ */ +int +AddWaitEventToSet(WaitEventSet *set, uint32 events, pgsocket fd, Latch *latch, + void *user_data) +{ + WaitEvent *event; + + /* not enough space */ + Assert(set->nevents < set->nevents_space); + + if (events == WL_EXIT_ON_PM_DEATH) + { + events = WL_POSTMASTER_DEATH; + set->exit_on_postmaster_death = true; + } + + if (latch) + { + if (latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + if (set->latch) + elog(ERROR, "cannot wait on more than one latch"); + if ((events & WL_LATCH_SET) != WL_LATCH_SET) + elog(ERROR, "latch events only support being set"); + } + else + { + if (events & WL_LATCH_SET) + elog(ERROR, "cannot wait on latch without a specified latch"); + } + + /* waiting for socket readiness without a socket indicates a bug */ + if (fd == PGINVALID_SOCKET && (events & WL_SOCKET_MASK)) + elog(ERROR, "cannot wait on socket event without a socket"); + + event = &set->events[set->nevents]; + event->pos = set->nevents++; + event->fd = fd; + event->events = events; + event->user_data = user_data; +#ifdef WIN32 + event->reset = false; +#endif + + if (events == WL_LATCH_SET) + { + set->latch = latch; + set->latch_pos = event->pos; +#if defined(WAIT_USE_SELF_PIPE) + event->fd = selfpipe_readfd; +#elif defined(WAIT_USE_SIGNALFD) + event->fd = signal_fd; +#else + event->fd = PGINVALID_SOCKET; +#ifdef WAIT_USE_EPOLL + return event->pos; +#endif +#endif + } + else if (events == WL_POSTMASTER_DEATH) + { +#ifndef WIN32 + event->fd = postmaster_alive_fds[POSTMASTER_FD_WATCH]; +#endif + } + + /* perform wait primitive specific initialization, if needed */ +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_ADD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, 0); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif + + return event->pos; +} + +/* + * Change the event mask and, in the WL_LATCH_SET case, the latch associated + * with the WaitEvent. The latch may be changed to NULL to disable the latch + * temporarily, and then set back to a latch later. + * + * 'pos' is the id returned by AddWaitEventToSet. + */ +void +ModifyWaitEvent(WaitEventSet *set, int pos, uint32 events, Latch *latch) +{ + WaitEvent *event; +#if defined(WAIT_USE_KQUEUE) + int old_events; +#endif + + Assert(pos < set->nevents); + + event = &set->events[pos]; +#if defined(WAIT_USE_KQUEUE) + old_events = event->events; +#endif + + /* + * If neither the event mask nor the associated latch changes, return + * early. That's an important optimization for some sockets, where + * ModifyWaitEvent is frequently used to switch from waiting for reads to + * waiting on writes. + */ + if (events == event->events && + (!(event->events & WL_LATCH_SET) || set->latch == latch)) + return; + + if (event->events & WL_LATCH_SET && + events != event->events) + { + elog(ERROR, "cannot modify latch event"); + } + + if (event->events & WL_POSTMASTER_DEATH) + { + elog(ERROR, "cannot modify postmaster death event"); + } + + /* FIXME: validate event mask */ + event->events = events; + + if (events == WL_LATCH_SET) + { + if (latch && latch->owner_pid != MyProcPid) + elog(ERROR, "cannot wait on a latch owned by another process"); + set->latch = latch; + + /* + * On Unix, we don't need to modify the kernel object because the + * underlying pipe (if there is one) is the same for all latches so we + * can return immediately. 
On Windows, we need to update our array of + * handles, but we leave the old one in place and tolerate spurious + * wakeups if the latch is disabled. + */ +#if defined(WAIT_USE_WIN32) + if (!latch) + return; +#else + return; +#endif + } + +#if defined(WAIT_USE_EPOLL) + WaitEventAdjustEpoll(set, event, EPOLL_CTL_MOD); +#elif defined(WAIT_USE_KQUEUE) + WaitEventAdjustKqueue(set, event, old_events); +#elif defined(WAIT_USE_POLL) + WaitEventAdjustPoll(set, event); +#elif defined(WAIT_USE_WIN32) + WaitEventAdjustWin32(set, event); +#endif +} + +#if defined(WAIT_USE_EPOLL) +/* + * action can be one of EPOLL_CTL_ADD | EPOLL_CTL_MOD | EPOLL_CTL_DEL + */ +static void +WaitEventAdjustEpoll(WaitEventSet *set, WaitEvent *event, int action) +{ + struct epoll_event epoll_ev; + int rc; + + /* pointer to our event, returned by epoll_wait */ + epoll_ev.data.ptr = event; + /* always wait for errors */ + epoll_ev.events = EPOLLERR | EPOLLHUP; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + epoll_ev.events |= EPOLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + epoll_ev.events |= EPOLLIN; + } + else + { + Assert(event->fd != PGINVALID_SOCKET); + Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)); + + if (event->events & WL_SOCKET_READABLE) + epoll_ev.events |= EPOLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + epoll_ev.events |= EPOLLOUT; + } + + /* + * Even though unused, we also pass epoll_ev as the data argument if + * EPOLL_CTL_DEL is passed as action. There used to be an epoll bug + * requiring that, and actually it makes the code simpler... + */ + rc = epoll_ctl(set->epoll_fd, action, event->fd, &epoll_ev); + + if (rc < 0) + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_ctl"))); +} +#endif + +#if defined(WAIT_USE_POLL) +static void +WaitEventAdjustPoll(WaitEventSet *set, WaitEvent *event) +{ + struct pollfd *pollfd = &set->pollfds[event->pos]; + + pollfd->revents = 0; + pollfd->fd = event->fd; + + /* prepare pollfd entry once */ + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + pollfd->events = POLLIN; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + pollfd->events = POLLIN; + } + else + { + Assert(event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)); + pollfd->events = 0; + if (event->events & WL_SOCKET_READABLE) + pollfd->events |= POLLIN; + if (event->events & WL_SOCKET_WRITEABLE) + pollfd->events |= POLLOUT; + } + + Assert(event->fd != PGINVALID_SOCKET); +} +#endif + +#if defined(WAIT_USE_KQUEUE) + +/* + * On most BSD family systems, the udata member of struct kevent is of type + * void *, so we could directly convert to/from WaitEvent *. Unfortunately, + * NetBSD has it as intptr_t, so here we wallpaper over that difference with + * an lvalue cast. + */ +#define AccessWaitEvent(k_ev) (*((WaitEvent **)(&(k_ev)->udata))) + +static inline void +WaitEventAdjustKqueueAdd(struct kevent *k_ev, int filter, int action, + WaitEvent *event) +{ + k_ev->ident = event->fd; + k_ev->filter = filter; + k_ev->flags = action; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddPostmaster(struct kevent *k_ev, WaitEvent *event) +{ + /* For now postmaster death can only be added, not removed. 
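+	 * We register an EVFILT_PROC filter on the postmaster's pid with
+	 * NOTE_EXIT set, so the kernel notifies us directly when that process
+	 * exits.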
*/ + k_ev->ident = PostmasterPid; + k_ev->filter = EVFILT_PROC; + k_ev->flags = EV_ADD; + k_ev->fflags = NOTE_EXIT; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +static inline void +WaitEventAdjustKqueueAddLatch(struct kevent *k_ev, WaitEvent *event) +{ + /* For now latch can only be added, not removed. */ + k_ev->ident = SIGURG; + k_ev->filter = EVFILT_SIGNAL; + k_ev->flags = EV_ADD; + k_ev->fflags = 0; + k_ev->data = 0; + AccessWaitEvent(k_ev) = event; +} + +/* + * old_events is the previous event mask, used to compute what has changed. + */ +static void +WaitEventAdjustKqueue(WaitEventSet *set, WaitEvent *event, int old_events) +{ + int rc; + struct kevent k_ev[2]; + int count = 0; + bool new_filt_read = false; + bool old_filt_read = false; + bool new_filt_write = false; + bool old_filt_write = false; + + if (old_events == event->events) + return; + + Assert(event->events != WL_LATCH_SET || set->latch != NULL); + Assert(event->events == WL_LATCH_SET || + event->events == WL_POSTMASTER_DEATH || + (event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE))); + + if (event->events == WL_POSTMASTER_DEATH) + { + /* + * Unlike all the other implementations, we detect postmaster death + * using process notification instead of waiting on the postmaster + * alive pipe. + */ + WaitEventAdjustKqueueAddPostmaster(&k_ev[count++], event); + } + else if (event->events == WL_LATCH_SET) + { + /* We detect latch wakeup using a signal event. */ + WaitEventAdjustKqueueAddLatch(&k_ev[count++], event); + } + else + { + /* + * We need to compute the adds and deletes required to get from the + * old event mask to the new event mask, since kevent treats readable + * and writable as separate events. + */ + if (old_events & WL_SOCKET_READABLE) + old_filt_read = true; + if (event->events & WL_SOCKET_READABLE) + new_filt_read = true; + if (old_events & WL_SOCKET_WRITEABLE) + old_filt_write = true; + if (event->events & WL_SOCKET_WRITEABLE) + new_filt_write = true; + if (old_filt_read && !new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_DELETE, + event); + else if (!old_filt_read && new_filt_read) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_READ, EV_ADD, + event); + if (old_filt_write && !new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_DELETE, + event); + else if (!old_filt_write && new_filt_write) + WaitEventAdjustKqueueAdd(&k_ev[count++], EVFILT_WRITE, EV_ADD, + event); + } + + Assert(count > 0); + Assert(count <= 2); + + rc = kevent(set->kqueue_fd, &k_ev[0], count, NULL, 0, NULL); + + /* + * When adding the postmaster's pid, we have to consider that it might + * already have exited and perhaps even been replaced by another process + * with the same pid. If so, we have to defer reporting this as an event + * until the next call to WaitEventSetWaitBlock(). + */ + + if (rc < 0) + { + if (event->events == WL_POSTMASTER_DEATH && + (errno == ESRCH || errno == EACCES)) + set->report_postmaster_not_running = true; + else + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + else if (event->events == WL_POSTMASTER_DEATH && + PostmasterPid != getppid() && + !PostmasterIsAlive()) + { + /* + * The extra PostmasterIsAliveInternal() check prevents false alarms + * on systems that give a different value for getppid() while being + * traced by a debugger. 
+ */ + set->report_postmaster_not_running = true; + } +} + +#endif + +#if defined(WAIT_USE_WIN32) +static void +WaitEventAdjustWin32(WaitEventSet *set, WaitEvent *event) +{ + HANDLE *handle = &set->handles[event->pos + 1]; + + if (event->events == WL_LATCH_SET) + { + Assert(set->latch != NULL); + *handle = set->latch->event; + } + else if (event->events == WL_POSTMASTER_DEATH) + { + *handle = PostmasterHandle; + } + else + { + int flags = FD_CLOSE; /* always check for errors/EOF */ + + if (event->events & WL_SOCKET_READABLE) + flags |= FD_READ; + if (event->events & WL_SOCKET_WRITEABLE) + flags |= FD_WRITE; + if (event->events & WL_SOCKET_CONNECTED) + flags |= FD_CONNECT; + + if (*handle == WSA_INVALID_EVENT) + { + *handle = WSACreateEvent(); + if (*handle == WSA_INVALID_EVENT) + elog(ERROR, "failed to create event for socket: error code %d", + WSAGetLastError()); + } + if (WSAEventSelect(event->fd, *handle, flags) != 0) + elog(ERROR, "failed to set up event for socket: error code %d", + WSAGetLastError()); + + Assert(event->fd != PGINVALID_SOCKET); + } +} +#endif + +/* + * Wait for events added to the set to happen, or until the timeout is + * reached. At most nevents occurred events are returned. + * + * If timeout = -1, block until an event occurs; if 0, check sockets for + * readiness, but don't block; if > 0, block for at most timeout milliseconds. + * + * Returns the number of events occurred, or 0 if the timeout was reached. + * + * Returned events will have the fd, pos, user_data fields set to the + * values associated with the registered event. + */ +int +WaitEventSetWait(WaitEventSet *set, long timeout, + WaitEvent *occurred_events, int nevents, + uint32 wait_event_info) +{ + int returned_events = 0; + instr_time start_time; + instr_time cur_time; + long cur_timeout = -1; + + Assert(nevents > 0); + + /* + * Initialize timeout if requested. We must record the current time so + * that we can determine the remaining timeout if interrupted. + */ + if (timeout >= 0) + { + INSTR_TIME_SET_CURRENT(start_time); + Assert(timeout >= 0 && timeout <= INT_MAX); + cur_timeout = timeout; + } + + pgstat_report_wait_start(wait_event_info); + +#ifndef WIN32 + waiting = true; +#else + /* Ensure that signals are serviced even if latch is already set */ + pgwin32_dispatch_queued_signals(); +#endif + while (returned_events == 0) + { + int rc; + + /* + * Check if the latch is set already. If so, leave the loop + * immediately, avoid blocking again. We don't attempt to report any + * other events that might also be satisfied. + * + * If someone sets the latch between this and the + * WaitEventSetWaitBlock() below, the setter will write a byte to the + * pipe (or signal us and the signal handler will do that), and the + * readiness routine will return immediately. + * + * On unix, If there's a pending byte in the self pipe, we'll notice + * whenever blocking. Only clearing the pipe in that case avoids + * having to drain it every time WaitLatchOrSocket() is used. Should + * the pipe-buffer fill up we're still ok, because the pipe is in + * nonblocking mode. It's unlikely for that to happen, because the + * self pipe isn't filled unless we're blocking (waiting = true), or + * from inside a signal handler in latch_sigurg_handler(). + * + * On windows, we'll also notice if there's a pending event for the + * latch when blocking, but there's no danger of anything filling up, + * as "Setting an event that is already set has no effect.". 
+ * + * Note: we assume that the kernel calls involved in latch management + * will provide adequate synchronization on machines with weak memory + * ordering, so that we cannot miss seeing is_set if a notification + * has already been queued. + */ + if (set->latch && !set->latch->is_set) + { + /* about to sleep on a latch */ + set->latch->maybe_sleeping = true; + pg_memory_barrier(); + /* and recheck */ + } + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->pos = set->latch_pos; + occurred_events->user_data = + set->events[set->latch_pos].user_data; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + + /* could have been set above */ + set->latch->maybe_sleeping = false; + + break; + } + + /* + * Wait for events using the readiness primitive chosen at the top of + * this file. If -1 is returned, a timeout has occurred, if 0 we have + * to retry, everything >= 1 is the number of returned events. + */ + rc = WaitEventSetWaitBlock(set, cur_timeout, + occurred_events, nevents); + + if (set->latch) + { + Assert(set->latch->maybe_sleeping); + set->latch->maybe_sleeping = false; + } + + if (rc == -1) + break; /* timeout occurred */ + else + returned_events = rc; + + /* If we're not done, update cur_timeout for next iteration */ + if (returned_events == 0 && timeout >= 0) + { + INSTR_TIME_SET_CURRENT(cur_time); + INSTR_TIME_SUBTRACT(cur_time, start_time); + cur_timeout = timeout - (long) INSTR_TIME_GET_MILLISEC(cur_time); + if (cur_timeout <= 0) + break; + } + } +#ifndef WIN32 + waiting = false; +#endif + + pgstat_report_wait_end(); + + return returned_events; +} + + +#if defined(WAIT_USE_EPOLL) + +/* + * Wait using linux's epoll_wait(2). + * + * This is the preferable wait method, as several readiness notifications are + * delivered, without having to iterate through all of set->events. The return + * epoll_event struct contain a pointer to our events, making association + * easy. + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct epoll_event *cur_epoll_event; + + /* Sleep */ + rc = epoll_wait(set->epoll_fd, set->epoll_ret_events, + nevents, cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "epoll_wait"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned epoll events + * until they're either all processed, or we've returned all the events + * the caller desired. + */ + for (cur_epoll_event = set->epoll_ret_events; + cur_epoll_event < (set->epoll_ret_events + rc) && + returned_events < nevents; + cur_epoll_event++) + { + /* epoll's data pointer is set to the associated WaitEvent */ + cur_event = (WaitEvent *) cur_epoll_event->data.ptr; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* Drain the signalfd. 
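+				 * Note that we still recheck latch->is_set afterwards; the
+				 * descriptor can be readable because of a wakeup meant for a
+				 * latch that has since been reset, or for one this set is
+				 * not watching.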
*/ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP)) + { + /* + * We expect an EPOLLHUP when the remote end is closed, but + * because we don't expect the pipe to become readable or to have + * any errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + Assert(cur_event->fd != PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_epoll_event->events & (EPOLLIN | EPOLLERR | EPOLLHUP))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_epoll_event->events & (EPOLLOUT | EPOLLERR | EPOLLHUP))) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_KQUEUE) + +/* + * Wait using kevent(2) on BSD-family systems and macOS. + * + * For now this mirrors the epoll code, but in future it could modify the fd + * set in the same call to kevent as it uses for waiting instead of doing that + * with separate system calls. + */ +static int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct kevent *cur_kqueue_event; + struct timespec timeout; + struct timespec *timeout_p; + + if (cur_timeout < 0) + timeout_p = NULL; + else + { + timeout.tv_sec = cur_timeout / 1000; + timeout.tv_nsec = (cur_timeout % 1000) * 1000000; + timeout_p = &timeout; + } + + /* + * Report postmaster events discovered by WaitEventAdjustKqueue() or an + * earlier call to WaitEventSetWait(). + */ + if (unlikely(set->report_postmaster_not_running)) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + return 1; + } + + /* Sleep */ + rc = kevent(set->kqueue_fd, NULL, 0, + set->kqueue_ret_events, nevents, + timeout_p); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "kevent"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + /* + * At least one event occurred, iterate over the returned kqueue events + * until they're either all processed, or we've returned all the events + * the caller desired. 
+ */ + for (cur_kqueue_event = set->kqueue_ret_events; + cur_kqueue_event < (set->kqueue_ret_events + rc) && + returned_events < nevents; + cur_kqueue_event++) + { + /* kevent's udata points to the associated WaitEvent */ + cur_event = AccessWaitEvent(cur_kqueue_event); + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + cur_kqueue_event->filter == EVFILT_SIGNAL) + { + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + cur_kqueue_event->filter == EVFILT_PROC && + (cur_kqueue_event->fflags & NOTE_EXIT) != 0) + { + /* + * The kernel will tell this kqueue object only once about the + * exit of the postmaster, so let's remember that for next time so + * that we provide level-triggered semantics. + */ + set->report_postmaster_not_running = true; + + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + Assert(cur_event->fd >= 0); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_kqueue_event->filter == EVFILT_READ)) + { + /* readable, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_kqueue_event->filter == EVFILT_WRITE)) + { + /* writable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + + return returned_events; +} + +#elif defined(WAIT_USE_POLL) + +/* + * Wait using poll(2). + * + * This allows to receive readiness notifications for several events at once, + * but requires iterating through all of set->pollfds. + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + int rc; + WaitEvent *cur_event; + struct pollfd *cur_pollfd; + + /* Sleep */ + rc = poll(set->pollfds, set->nevents, (int) cur_timeout); + + /* Check return code */ + if (rc < 0) + { + /* EINTR is okay, otherwise complain */ + if (errno != EINTR) + { + waiting = false; + ereport(ERROR, + (errcode_for_socket_access(), + errmsg("%s() failed: %m", + "poll"))); + } + return 0; + } + else if (rc == 0) + { + /* timeout exceeded */ + return -1; + } + + for (cur_event = set->events, cur_pollfd = set->pollfds; + cur_event < (set->events + set->nevents) && + returned_events < nevents; + cur_event++, cur_pollfd++) + { + /* no activity on this FD, skip */ + if (cur_pollfd->revents == 0) + continue; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* There's data in the self-pipe, clear it. 
*/ + drain(); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH && + (cur_pollfd->revents & (POLLIN | POLLHUP | POLLERR | POLLNVAL))) + { + /* + * We expect an POLLHUP when the remote end is closed, but because + * we don't expect the pipe to become readable or to have any + * errors either, treat those cases as postmaster death, too. + * + * Be paranoid about a spurious event signaling the postmaster as + * being dead. There have been reports about that happening with + * older primitives (select(2) to be specific), and a spurious + * WL_POSTMASTER_DEATH event would be painful. Re-checking doesn't + * cost much. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & (WL_SOCKET_READABLE | WL_SOCKET_WRITEABLE)) + { + int errflags = POLLHUP | POLLERR | POLLNVAL; + + Assert(cur_event->fd >= PGINVALID_SOCKET); + + if ((cur_event->events & WL_SOCKET_READABLE) && + (cur_pollfd->revents & (POLLIN | errflags))) + { + /* data available in socket, or EOF */ + occurred_events->events |= WL_SOCKET_READABLE; + } + + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (cur_pollfd->revents & (POLLOUT | errflags))) + { + /* writeable, or EOF */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + + if (occurred_events->events != 0) + { + occurred_events->fd = cur_event->fd; + occurred_events++; + returned_events++; + } + } + } + return returned_events; +} + +#elif defined(WAIT_USE_WIN32) + +/* + * Wait using Windows' WaitForMultipleObjects(). + * + * Unfortunately this will only ever return a single readiness notification at + * a time. Note that while the official documentation for + * WaitForMultipleObjects is ambiguous about multiple events being "consumed" + * with a single bWaitAll = FALSE call, + * https://blogs.msdn.microsoft.com/oldnewthing/20150409-00/?p=44273 confirms + * that only one event is "consumed". + */ +static inline int +WaitEventSetWaitBlock(WaitEventSet *set, int cur_timeout, + WaitEvent *occurred_events, int nevents) +{ + int returned_events = 0; + DWORD rc; + WaitEvent *cur_event; + + /* Reset any wait events that need it */ + for (cur_event = set->events; + cur_event < (set->events + set->nevents); + cur_event++) + { + if (cur_event->reset) + { + WaitEventAdjustWin32(set, cur_event); + cur_event->reset = false; + } + + /* + * Windows does not guarantee to log an FD_WRITE network event + * indicating that more data can be sent unless the previous send() + * failed with WSAEWOULDBLOCK. While our caller might well have made + * such a call, we cannot assume that here. Therefore, if waiting for + * write-ready, force the issue by doing a dummy send(). If the dummy + * send() succeeds, assume that the socket is in fact write-ready, and + * return immediately. Also, if it fails with something other than + * WSAEWOULDBLOCK, return a write-ready indication to let our caller + * deal with the error condition. 
+ */ + if (cur_event->events & WL_SOCKET_WRITEABLE) + { + char c; + WSABUF buf; + DWORD sent; + int r; + + buf.buf = &c; + buf.len = 0; + + r = WSASend(cur_event->fd, &buf, 1, &sent, 0, NULL, NULL); + if (r == 0 || WSAGetLastError() != WSAEWOULDBLOCK) + { + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = WL_SOCKET_WRITEABLE; + occurred_events->fd = cur_event->fd; + return 1; + } + } + } + + /* + * Sleep. + * + * Need to wait for ->nevents + 1, because signal handle is in [0]. + */ + rc = WaitForMultipleObjects(set->nevents + 1, set->handles, FALSE, + cur_timeout); + + /* Check return code */ + if (rc == WAIT_FAILED) + elog(ERROR, "WaitForMultipleObjects() failed: error code %lu", + GetLastError()); + else if (rc == WAIT_TIMEOUT) + { + /* timeout exceeded */ + return -1; + } + + if (rc == WAIT_OBJECT_0) + { + /* Service newly-arrived signals */ + pgwin32_dispatch_queued_signals(); + return 0; /* retry */ + } + + /* + * With an offset of one, due to the always present pgwin32_signal_event, + * the handle offset directly corresponds to a wait event. + */ + cur_event = (WaitEvent *) &set->events[rc - WAIT_OBJECT_0 - 1]; + + occurred_events->pos = cur_event->pos; + occurred_events->user_data = cur_event->user_data; + occurred_events->events = 0; + + if (cur_event->events == WL_LATCH_SET) + { + /* + * We cannot use set->latch->event to reset the fired event if we + * aren't waiting on this latch now. + */ + if (!ResetEvent(set->handles[cur_event->pos + 1])) + elog(ERROR, "ResetEvent failed: error code %lu", GetLastError()); + + if (set->latch && set->latch->is_set) + { + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_LATCH_SET; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events == WL_POSTMASTER_DEATH) + { + /* + * Postmaster apparently died. Since the consequences of falsely + * returning WL_POSTMASTER_DEATH could be pretty unpleasant, we take + * the trouble to positively verify this with PostmasterIsAlive(), + * even though there is no known reason to think that the event could + * be falsely set on Windows. + */ + if (!PostmasterIsAliveInternal()) + { + if (set->exit_on_postmaster_death) + proc_exit(1); + occurred_events->fd = PGINVALID_SOCKET; + occurred_events->events = WL_POSTMASTER_DEATH; + occurred_events++; + returned_events++; + } + } + else if (cur_event->events & WL_SOCKET_MASK) + { + WSANETWORKEVENTS resEvents; + HANDLE handle = set->handles[cur_event->pos + 1]; + + Assert(cur_event->fd); + + occurred_events->fd = cur_event->fd; + + ZeroMemory(&resEvents, sizeof(resEvents)); + if (WSAEnumNetworkEvents(cur_event->fd, handle, &resEvents) != 0) + elog(ERROR, "failed to enumerate network events: error code %d", + WSAGetLastError()); + if ((cur_event->events & WL_SOCKET_READABLE) && + (resEvents.lNetworkEvents & FD_READ)) + { + /* data available in socket */ + occurred_events->events |= WL_SOCKET_READABLE; + + /*------ + * WaitForMultipleObjects doesn't guarantee that a read event will + * be returned if the latch is set at the same time. Even if it + * did, the caller might drop that event expecting it to reoccur + * on next call. So, we must force the event to be reset if this + * WaitEventSet is used again in order to avoid an indefinite + * hang. Refer https://msdn.microsoft.com/en-us/library/windows/desktop/ms741576(v=vs.85).aspx + * for the behavior of socket events. 
+ *------ + */ + cur_event->reset = true; + } + if ((cur_event->events & WL_SOCKET_WRITEABLE) && + (resEvents.lNetworkEvents & FD_WRITE)) + { + /* writeable */ + occurred_events->events |= WL_SOCKET_WRITEABLE; + } + if ((cur_event->events & WL_SOCKET_CONNECTED) && + (resEvents.lNetworkEvents & FD_CONNECT)) + { + /* connected */ + occurred_events->events |= WL_SOCKET_CONNECTED; + } + if (resEvents.lNetworkEvents & FD_CLOSE) + { + /* EOF/error, so signal all caller-requested socket flags */ + occurred_events->events |= (cur_event->events & WL_SOCKET_MASK); + } + + if (occurred_events->events != 0) + { + occurred_events++; + returned_events++; + } + } + + return returned_events; +} +#endif + +/* + * Get the number of wait events registered in a given WaitEventSet. + */ +int +GetNumRegisteredWaitEvents(WaitEventSet *set) +{ + return set->nevents; +} + +#if defined(WAIT_USE_SELF_PIPE) + +/* + * SetLatch uses SIGURG to wake up the process waiting on the latch. + * + * Wake up WaitLatch, if we're waiting. + */ +static void +latch_sigurg_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (waiting) + sendSelfPipeByte(); + + errno = save_errno; +} + +/* Send one byte to the self-pipe, to wake up WaitLatch */ +static void +sendSelfPipeByte(void) +{ + int rc; + char dummy = 0; + +retry: + rc = write(selfpipe_writefd, &dummy, 1); + if (rc < 0) + { + /* If interrupted by signal, just retry */ + if (errno == EINTR) + goto retry; + + /* + * If the pipe is full, we don't need to retry, the data that's there + * already is enough to wake up WaitLatch. + */ + if (errno == EAGAIN || errno == EWOULDBLOCK) + return; + + /* + * Oops, the write() failed for some other reason. We might be in a + * signal handler, so it's not safe to elog(). We have no choice but + * silently ignore the error. + */ + return; + } +} + +#endif + +#if defined(WAIT_USE_SELF_PIPE) || defined(WAIT_USE_SIGNALFD) + +/* + * Read all available data from self-pipe or signalfd. + * + * Note: this is only called when waiting = true. If it fails and doesn't + * return, it must reset that flag first (though ideally, this will never + * happen). 
+ */ +static void +drain(void) +{ + char buf[1024]; + int rc; + int fd; + +#ifdef WAIT_USE_SELF_PIPE + fd = selfpipe_readfd; +#else + fd = signal_fd; +#endif + + for (;;) + { + rc = read(fd, buf, sizeof(buf)); + if (rc < 0) + { + if (errno == EAGAIN || errno == EWOULDBLOCK) + break; /* the descriptor is empty */ + else if (errno == EINTR) + continue; /* retry */ + else + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "read() on self-pipe failed: %m"); +#else + elog(ERROR, "read() on signalfd failed: %m"); +#endif + } + } + else if (rc == 0) + { + waiting = false; +#ifdef WAIT_USE_SELF_PIPE + elog(ERROR, "unexpected EOF on self-pipe"); +#else + elog(ERROR, "unexpected EOF on signalfd"); +#endif + } + else if (rc < sizeof(buf)) + { + /* we successfully drained the pipe; no need to read() again */ + break; + } + /* else buffer wasn't big enough, so read again */ + } +} + +#endif diff --git a/src/backend/storage/ipc/pmsignal.c b/src/backend/storage/ipc/pmsignal.c new file mode 100644 index 0000000..280c239 --- /dev/null +++ b/src/backend/storage/ipc/pmsignal.c @@ -0,0 +1,430 @@ +/*------------------------------------------------------------------------- + * + * pmsignal.c + * routines for signaling between the postmaster and its child processes + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/pmsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#ifdef HAVE_SYS_PRCTL_H +#include <sys/prctl.h> +#endif + +#include "miscadmin.h" +#include "postmaster/postmaster.h" +#include "replication/walsender.h" +#include "storage/pmsignal.h" +#include "storage/shmem.h" + + +/* + * The postmaster is signaled by its children by sending SIGUSR1. The + * specific reason is communicated via flags in shared memory. We keep + * a boolean flag for each possible "reason", so that different reasons + * can be signaled by different backends at the same time. (However, + * if the same reason is signaled more than once simultaneously, the + * postmaster will observe it only once.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. + * + * In addition to the per-reason flags, we store a set of per-child-process + * flags that are currently used only for detecting whether a backend has + * exited without performing proper shutdown. The per-child-process flags + * have three possible states: UNUSED, ASSIGNED, ACTIVE. An UNUSED slot is + * available for assignment. An ASSIGNED slot is associated with a postmaster + * child process, but either the process has not touched shared memory yet, + * or it has successfully cleaned up after itself. A ACTIVE slot means the + * process is actively using shared memory. The slots are assigned to + * child processes at random, and postmaster.c is responsible for tracking + * which one goes with which PID. + * + * Actually there is a fourth state, WALSENDER. This is just like ACTIVE, + * but carries the extra information that the child is a WAL sender. + * WAL senders too start in ACTIVE state, but switch to WALSENDER once they + * start streaming the WAL (and they never go back to ACTIVE after that). 
+ * + * We also have a shared-memory field that is used for communication in + * the opposite direction, from postmaster to children: it tells why the + * postmaster has broadcasted SIGQUIT signals, if indeed it has done so. + */ + +#define PM_CHILD_UNUSED 0 /* these values must fit in sig_atomic_t */ +#define PM_CHILD_ASSIGNED 1 +#define PM_CHILD_ACTIVE 2 +#define PM_CHILD_WALSENDER 3 + +/* "typedef struct PMSignalData PMSignalData" appears in pmsignal.h */ +struct PMSignalData +{ + /* per-reason flags for signaling the postmaster */ + sig_atomic_t PMSignalFlags[NUM_PMSIGNALS]; + /* global flags for signals from postmaster to children */ + QuitSignalReason sigquit_reason; /* why SIGQUIT was sent */ + /* per-child-process flags */ + int num_child_flags; /* # of entries in PMChildFlags[] */ + int next_child_flag; /* next slot to try to assign */ + sig_atomic_t PMChildFlags[FLEXIBLE_ARRAY_MEMBER]; +}; + +NON_EXEC_STATIC volatile PMSignalData *PMSignalState = NULL; + +/* + * Signal handler to be notified if postmaster dies. + */ +#ifdef USE_POSTMASTER_DEATH_SIGNAL +volatile sig_atomic_t postmaster_possibly_dead = false; + +static void +postmaster_death_handler(int signo) +{ + postmaster_possibly_dead = true; +} + +/* + * The available signals depend on the OS. SIGUSR1 and SIGUSR2 are already + * used for other things, so choose another one. + * + * Currently, we assume that we can always find a signal to use. That + * seems like a reasonable assumption for all platforms that are modern + * enough to have a parent-death signaling mechanism. + */ +#if defined(SIGINFO) +#define POSTMASTER_DEATH_SIGNAL SIGINFO +#elif defined(SIGPWR) +#define POSTMASTER_DEATH_SIGNAL SIGPWR +#else +#error "cannot find a signal to use for postmaster death" +#endif + +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ + +/* + * PMSignalShmemSize + * Compute space needed for pmsignal.c's shared memory + */ +Size +PMSignalShmemSize(void) +{ + Size size; + + size = offsetof(PMSignalData, PMChildFlags); + size = add_size(size, mul_size(MaxLivePostmasterChildren(), + sizeof(sig_atomic_t))); + + return size; +} + +/* + * PMSignalShmemInit - initialize during shared-memory creation + */ +void +PMSignalShmemInit(void) +{ + bool found; + + PMSignalState = (PMSignalData *) + ShmemInitStruct("PMSignalState", PMSignalShmemSize(), &found); + + if (!found) + { + /* initialize all flags to zeroes */ + MemSet(unvolatize(PMSignalData *, PMSignalState), 0, PMSignalShmemSize()); + PMSignalState->num_child_flags = MaxLivePostmasterChildren(); + } +} + +/* + * SendPostmasterSignal - signal the postmaster from a child process + */ +void +SendPostmasterSignal(PMSignalReason reason) +{ + /* If called in a standalone backend, do nothing */ + if (!IsUnderPostmaster) + return; + /* Atomically set the proper flag */ + PMSignalState->PMSignalFlags[reason] = true; + /* Send signal to postmaster */ + kill(PostmasterPid, SIGUSR1); +} + +/* + * CheckPostmasterSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. Should be called by postmaster + * after receiving SIGUSR1. + */ +bool +CheckPostmasterSignal(PMSignalReason reason) +{ + /* Careful here --- don't clear flag if we haven't seen it set */ + if (PMSignalState->PMSignalFlags[reason]) + { + PMSignalState->PMSignalFlags[reason] = false; + return true; + } + return false; +} + +/* + * SetQuitSignalReason - broadcast the reason for a system shutdown. + * Should be called by postmaster before sending SIGQUIT to children. 
+ * + * Note: in a crash-and-restart scenario, the "reason" field gets cleared + * as a part of rebuilding shared memory; the postmaster need not do it + * explicitly. + */ +void +SetQuitSignalReason(QuitSignalReason reason) +{ + PMSignalState->sigquit_reason = reason; +} + +/* + * GetQuitSignalReason - obtain the reason for a system shutdown. + * Called by child processes when they receive SIGQUIT. + * If the postmaster hasn't actually sent SIGQUIT, will return PMQUIT_NOT_SENT. + */ +QuitSignalReason +GetQuitSignalReason(void) +{ + /* This is called in signal handlers, so be extra paranoid. */ + if (!IsUnderPostmaster || PMSignalState == NULL) + return PMQUIT_NOT_SENT; + return PMSignalState->sigquit_reason; +} + + +/* + * AssignPostmasterChildSlot - select an unused slot for a new postmaster + * child process, and set its state to ASSIGNED. Returns a slot number + * (one to N). + * + * Only the postmaster is allowed to execute this routine, so we need no + * special locking. + */ +int +AssignPostmasterChildSlot(void) +{ + int slot = PMSignalState->next_child_flag; + int n; + + /* + * Scan for a free slot. We track the last slot assigned so as not to + * waste time repeatedly rescanning low-numbered slots. + */ + for (n = PMSignalState->num_child_flags; n > 0; n--) + { + if (--slot < 0) + slot = PMSignalState->num_child_flags - 1; + if (PMSignalState->PMChildFlags[slot] == PM_CHILD_UNUSED) + { + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; + PMSignalState->next_child_flag = slot; + return slot + 1; + } + } + + /* Out of slots ... should never happen, else postmaster.c messed up */ + elog(FATAL, "no free slots in PMChildFlags array"); + return 0; /* keep compiler quiet */ +} + +/* + * ReleasePostmasterChildSlot - release a slot after death of a postmaster + * child process. This must be called in the postmaster process. + * + * Returns true if the slot had been in ASSIGNED state (the expected case), + * false otherwise (implying that the child failed to clean itself up). + */ +bool +ReleasePostmasterChildSlot(int slot) +{ + bool result; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + + /* + * Note: the slot state might already be unused, because the logic in + * postmaster.c is such that this might get called twice when a child + * crashes. So we don't try to Assert anything about the state. + */ + result = (PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_UNUSED; + return result; +} + +/* + * IsPostmasterChildWalSender - check if given slot is in use by a + * walsender process. + */ +bool +IsPostmasterChildWalSender(int slot) +{ + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + + if (PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER) + return true; + else + return false; +} + +/* + * MarkPostmasterChildActive - mark a postmaster child as about to begin + * actively using shared memory. This is called in the child process. + */ +void +MarkPostmasterChildActive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ASSIGNED); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ACTIVE; +} + +/* + * MarkPostmasterChildWalSender - mark a postmaster child as a WAL sender + * process. This is called in the child process, sometime after marking the + * child as active. 
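+ * The postmaster uses this distinction to treat walsenders differently from
+ * regular backends, for example when deciding which children still need to
+ * be signaled during shutdown.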
+ */ +void +MarkPostmasterChildWalSender(void) +{ + int slot = MyPMChildSlot; + + Assert(am_walsender); + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE); + PMSignalState->PMChildFlags[slot] = PM_CHILD_WALSENDER; +} + +/* + * MarkPostmasterChildInactive - mark a postmaster child as done using + * shared memory. This is called in the child process. + */ +void +MarkPostmasterChildInactive(void) +{ + int slot = MyPMChildSlot; + + Assert(slot > 0 && slot <= PMSignalState->num_child_flags); + slot--; + Assert(PMSignalState->PMChildFlags[slot] == PM_CHILD_ACTIVE || + PMSignalState->PMChildFlags[slot] == PM_CHILD_WALSENDER); + PMSignalState->PMChildFlags[slot] = PM_CHILD_ASSIGNED; +} + + +/* + * PostmasterIsAliveInternal - check whether postmaster process is still alive + * + * This is the slow path of PostmasterIsAlive(), where the caller has already + * checked 'postmaster_possibly_dead'. (On platforms that don't support + * a signal for parent death, PostmasterIsAlive() is just an alias for this.) + */ +bool +PostmasterIsAliveInternal(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + /* + * Reset the flag before checking, so that we don't miss a signal if + * postmaster dies right after the check. If postmaster was indeed dead, + * we'll re-arm it before returning to caller. + */ + postmaster_possibly_dead = false; +#endif + +#ifndef WIN32 + { + char c; + ssize_t rc; + + rc = read(postmaster_alive_fds[POSTMASTER_FD_WATCH], &c, 1); + + /* + * In the usual case, the postmaster is still alive, and there is no + * data in the pipe. + */ + if (rc < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) + return true; + else + { + /* + * Postmaster is dead, or something went wrong with the read() + * call. + */ + +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + + if (rc < 0) + elog(FATAL, "read on postmaster death monitoring pipe failed: %m"); + else if (rc > 0) + elog(FATAL, "unexpected data in postmaster death monitoring pipe"); + + return false; + } + } + +#else /* WIN32 */ + if (WaitForSingleObject(PostmasterHandle, 0) == WAIT_TIMEOUT) + return true; + else + { +#ifdef USE_POSTMASTER_DEATH_SIGNAL + postmaster_possibly_dead = true; +#endif + return false; + } +#endif /* WIN32 */ +} + +/* + * PostmasterDeathSignalInit - request signal on postmaster death if possible + */ +void +PostmasterDeathSignalInit(void) +{ +#ifdef USE_POSTMASTER_DEATH_SIGNAL + int signum = POSTMASTER_DEATH_SIGNAL; + + /* Register our signal handler. */ + pqsignal(signum, postmaster_death_handler); + + /* Request a signal on parent exit. */ +#if defined(PR_SET_PDEATHSIG) + if (prctl(PR_SET_PDEATHSIG, signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#elif defined(PROC_PDEATHSIG_CTL) + if (procctl(P_PID, 0, PROC_PDEATHSIG_CTL, &signum) < 0) + elog(ERROR, "could not request parent death signal: %m"); +#else +#error "USE_POSTMASTER_DEATH_SIGNAL set, but there is no mechanism to request the signal" +#endif + + /* + * Just in case the parent was gone already and we missed it, we'd better + * check the slow way on the first call. 
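+ *
+ * Setting the flag below forces exactly that: the next PostmasterIsAlive()
+ * call will go through PostmasterIsAliveInternal(). A caller's loop might
+ * look like this (a sketch only; real callers differ in how they bail out):
+ *
+ *    if (!PostmasterIsAlive())
+ *        exit(1);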
+ */ + postmaster_possibly_dead = true; +#endif /* USE_POSTMASTER_DEATH_SIGNAL */ +} diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c new file mode 100644 index 0000000..755f842 --- /dev/null +++ b/src/backend/storage/ipc/procarray.c @@ -0,0 +1,5220 @@ +/*------------------------------------------------------------------------- + * + * procarray.c + * POSTGRES process array code. + * + * + * This module maintains arrays of PGPROC substructures, as well as associated + * arrays in ProcGlobal, for all active backends. Although there are several + * uses for this, the principal one is as a means of determining the set of + * currently running transactions. + * + * Because of various subtle race conditions it is critical that a backend + * hold the correct locks while setting or clearing its xid (in + * ProcGlobal->xids[]/MyProc->xid). See notes in + * src/backend/access/transam/README. + * + * The process arrays now also include structures representing prepared + * transactions. The xid and subxids fields of these are valid, as are the + * myProcLocks lists. They can be distinguished from regular backend PGPROCs + * at need by checking for pid == 0. + * + * During hot standby, we also keep a list of XIDs representing transactions + * that are known to be running on the primary (or more precisely, were running + * as of the current point in the WAL stream). This list is kept in the + * KnownAssignedXids array, and is updated by watching the sequence of + * arriving XIDs. This is necessary because if we leave those XIDs out of + * snapshots taken for standby queries, then they will appear to be already + * complete, leading to MVCC failures. Note that in hot standby, the PGPROC + * array represents standby processes, which by definition are not running + * transactions that have XIDs. + * + * It is perhaps possible for a backend on the primary to terminate without + * writing an abort record for its transaction. While that shouldn't really + * happen, it would tie up KnownAssignedXids indefinitely, so we protect + * ourselves by pruning the array when a valid list of running XIDs arrives. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/procarray.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> + +#include "access/clog.h" +#include "access/subtrans.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/catalog.h" +#include "catalog/pg_authid.h" +#include "commands/dbcommands.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/spin.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + +/* Our shared memory area */ +typedef struct ProcArrayStruct +{ + int numProcs; /* number of valid procs entries */ + int maxProcs; /* allocated size of procs array */ + + /* + * Known assigned XIDs handling + */ + int maxKnownAssignedXids; /* allocated size of array */ + int numKnownAssignedXids; /* current # of valid entries */ + int tailKnownAssignedXids; /* index of oldest valid element */ + int headKnownAssignedXids; /* index of newest element, + 1 */ + slock_t known_assigned_xids_lck; /* protects head/tail pointers */ + + /* + * Highest subxid that has been removed from KnownAssignedXids array to + * prevent overflow; or InvalidTransactionId if none. We track this for + * similar reasons to tracking overflowing cached subxids in PGPROC + * entries. Must hold exclusive ProcArrayLock to change this, and shared + * lock to read it. + */ + TransactionId lastOverflowedXid; + + /* oldest xmin of any replication slot */ + TransactionId replication_slot_xmin; + /* oldest catalog xmin of any replication slot */ + TransactionId replication_slot_catalog_xmin; + + /* indexes into allProcs[], has PROCARRAY_MAXPROCS entries */ + int pgprocnos[FLEXIBLE_ARRAY_MEMBER]; +} ProcArrayStruct; + +/* + * State for the GlobalVisTest* family of functions. Those functions can + * e.g. be used to decide if a deleted row can be removed without violating + * MVCC semantics: If the deleted row's xmax is not considered to be running + * by anyone, the row can be removed. + * + * To avoid slowing down GetSnapshotData(), we don't calculate a precise + * cutoff XID while building a snapshot (looking at the frequently changing + * xmins scales badly). Instead we compute two boundaries while building the + * snapshot: + * + * 1) definitely_needed, indicating that rows deleted by XIDs >= + * definitely_needed are definitely still visible. + * + * 2) maybe_needed, indicating that rows deleted by XIDs < maybe_needed can + * definitely be removed + * + * When testing an XID that falls in between the two (i.e. XID >= maybe_needed + * && XID < definitely_needed), the boundaries can be recomputed (using + * ComputeXidHorizons()) to get a more accurate answer. This is cheaper than + * maintaining an accurate value all the time. + * + * As it is not cheap to compute accurate boundaries, we limit the number of + * times that happens in short succession. See GlobalVisTestShouldUpdate(). + * + * + * There are three backend lifetime instances of this struct, optimized for + * different types of relations. As e.g. 
a normal user defined table in one + * database is inaccessible to backends connected to another database, a test + * specific to a relation can be more aggressive than a test for a shared + * relation. Currently we track four different states: + * + * 1) GlobalVisSharedRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in any database, nor a + * replication slot's xmin, nor a replication slot's catalog_xmin might + * still consider XID as running. + * + * 2) GlobalVisCatalogRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin, nor a replication slot's + * catalog_xmin might still consider XID as running. + * + * I.e. the difference to GlobalVisSharedRels is that + * snapshot in other databases are ignored. + * + * 3) GlobalVisDataRels, which only considers an XID's + * effects visible-to-everyone if neither snapshots in the current + * database, nor a replication slot's xmin consider XID as running. + * + * I.e. the difference to GlobalVisCatalogRels is that + * replication slot's catalog_xmin is not taken into account. + * + * 4) GlobalVisTempRels, which only considers the current session, as temp + * tables are not visible to other sessions. + * + * GlobalVisTestFor(relation) returns the appropriate state + * for the relation. + * + * The boundaries are FullTransactionIds instead of TransactionIds to avoid + * wraparound dangers. There e.g. would otherwise exist no procarray state to + * prevent maybe_needed to become old enough after the GetSnapshotData() + * call. + * + * The typedef is in the header. + */ +struct GlobalVisState +{ + /* XIDs >= are considered running by some backend */ + FullTransactionId definitely_needed; + + /* XIDs < are not considered to be running by any backend */ + FullTransactionId maybe_needed; +}; + +/* + * Result of ComputeXidHorizons(). + */ +typedef struct ComputeXidHorizonsResult +{ + /* + * The value of ShmemVariableCache->latestCompletedXid when + * ComputeXidHorizons() held ProcArrayLock. + */ + FullTransactionId latest_completed; + + /* + * The same for procArray->replication_slot_xmin and. + * procArray->replication_slot_catalog_xmin. + */ + TransactionId slot_xmin; + TransactionId slot_catalog_xmin; + + /* + * Oldest xid that any backend might still consider running. This needs to + * include processes running VACUUM, in contrast to the normal visibility + * cutoffs, as vacuum needs to be able to perform pg_subtrans lookups when + * determining visibility, but doesn't care about rows above its xmin to + * be removed. + * + * This likely should only be needed to determine whether pg_subtrans can + * be truncated. It currently includes the effects of replication slots, + * for historical reasons. But that could likely be changed. + */ + TransactionId oldest_considered_running; + + /* + * Oldest xid for which deleted tuples need to be retained in shared + * tables. + * + * This includes the effects of replication slots. If that's not desired, + * look at shared_oldest_nonremovable_raw; + */ + TransactionId shared_oldest_nonremovable; + + /* + * Oldest xid that may be necessary to retain in shared tables. This is + * the same as shared_oldest_nonremovable, except that is not affected by + * replication slot's catalog_xmin. 
+ * + * This is mainly useful to be able to send the catalog_xmin to upstream + * streaming replication servers via hot_standby_feedback, so they can + * apply the limit only when accessing catalog tables. + */ + TransactionId shared_oldest_nonremovable_raw; + + /* + * Oldest xid for which deleted tuples need to be retained in non-shared + * catalog tables. + */ + TransactionId catalog_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in normal user + * defined tables. + */ + TransactionId data_oldest_nonremovable; + + /* + * Oldest xid for which deleted tuples need to be retained in this + * session's temporary tables. + */ + TransactionId temp_oldest_nonremovable; + +} ComputeXidHorizonsResult; + +/* + * Return value for GlobalVisHorizonKindForRel(). + */ +typedef enum GlobalVisHorizonKind +{ + VISHORIZON_SHARED, + VISHORIZON_CATALOG, + VISHORIZON_DATA, + VISHORIZON_TEMP +} GlobalVisHorizonKind; + + +static ProcArrayStruct *procArray; + +static PGPROC *allProcs; + +/* + * Cache to reduce overhead of repeated calls to TransactionIdIsInProgress() + */ +static TransactionId cachedXidIsNotInProgress = InvalidTransactionId; + +/* + * Bookkeeping for tracking emulated transactions in recovery + */ +static TransactionId *KnownAssignedXids; +static bool *KnownAssignedXidsValid; +static TransactionId latestObservedXid = InvalidTransactionId; + +/* + * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is + * the highest xid that might still be running that we don't have in + * KnownAssignedXids. + */ +static TransactionId standbySnapshotPendingXmin; + +/* + * State for visibility checks on different types of relations. See struct + * GlobalVisState for details. As shared, catalog, normal and temporary + * relations can have different horizons, one such state exists for each. + */ +static GlobalVisState GlobalVisSharedRels; +static GlobalVisState GlobalVisCatalogRels; +static GlobalVisState GlobalVisDataRels; +static GlobalVisState GlobalVisTempRels; + +/* + * This backend's RecentXmin at the last time the accurate xmin horizon was + * recomputed, or InvalidTransactionId if it has not. Used to limit how many + * times accurate horizons are recomputed. See GlobalVisTestShouldUpdate(). 
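+ *
+ * For context, a typical consumer of a GlobalVisState looks roughly like
+ * this (a sketch; the real pruning callers live in the heap AM code):
+ *
+ *    GlobalVisState *vistest = GlobalVisTestFor(relation);
+ *
+ *    if (GlobalVisTestIsRemovableXid(vistest, dead_xmax))
+ *        ... no one can still see the deleting xact, so prune ...
+ *
+ * GlobalVisTestIsRemovableXid() answers from the cached boundaries when it
+ * can, and only recomputes horizons, subject to GlobalVisTestShouldUpdate(),
+ * for XIDs that fall between maybe_needed and definitely_needed.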
+ */ +static TransactionId ComputeXidHorizonsResultLastXmin; + +#ifdef XIDCACHE_DEBUG + +/* counters for XidCache measurement */ +static long xc_by_recent_xmin = 0; +static long xc_by_known_xact = 0; +static long xc_by_my_xact = 0; +static long xc_by_latest_xid = 0; +static long xc_by_main_xid = 0; +static long xc_by_child_xid = 0; +static long xc_by_known_assigned = 0; +static long xc_no_overflow = 0; +static long xc_slow_answer = 0; + +#define xc_by_recent_xmin_inc() (xc_by_recent_xmin++) +#define xc_by_known_xact_inc() (xc_by_known_xact++) +#define xc_by_my_xact_inc() (xc_by_my_xact++) +#define xc_by_latest_xid_inc() (xc_by_latest_xid++) +#define xc_by_main_xid_inc() (xc_by_main_xid++) +#define xc_by_child_xid_inc() (xc_by_child_xid++) +#define xc_by_known_assigned_inc() (xc_by_known_assigned++) +#define xc_no_overflow_inc() (xc_no_overflow++) +#define xc_slow_answer_inc() (xc_slow_answer++) + +static void DisplayXidCache(void); +#else /* !XIDCACHE_DEBUG */ + +#define xc_by_recent_xmin_inc() ((void) 0) +#define xc_by_known_xact_inc() ((void) 0) +#define xc_by_my_xact_inc() ((void) 0) +#define xc_by_latest_xid_inc() ((void) 0) +#define xc_by_main_xid_inc() ((void) 0) +#define xc_by_child_xid_inc() ((void) 0) +#define xc_by_known_assigned_inc() ((void) 0) +#define xc_no_overflow_inc() ((void) 0) +#define xc_slow_answer_inc() ((void) 0) +#endif /* XIDCACHE_DEBUG */ + +static VirtualTransactionId *GetVirtualXIDsDelayingChkptGuts(int *nvxids, + int type); +static bool HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, + int nvxids, int type); + +/* Primitives for KnownAssignedXids array handling for standby */ +static void KnownAssignedXidsCompress(bool force); +static void KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock); +static bool KnownAssignedXidsSearch(TransactionId xid, bool remove); +static bool KnownAssignedXidExists(TransactionId xid); +static void KnownAssignedXidsRemove(TransactionId xid); +static void KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids); +static void KnownAssignedXidsRemovePreceding(TransactionId xid); +static int KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax); +static int KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, + TransactionId *xmin, + TransactionId xmax); +static TransactionId KnownAssignedXidsGetOldestXmin(void); +static void KnownAssignedXidsDisplay(int trace_level); +static void KnownAssignedXidsReset(void); +static inline void ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid); +static void ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid); +static void MaintainLatestCompletedXid(TransactionId latestXid); +static void MaintainLatestCompletedXidRecovery(TransactionId latestXid); + +static inline FullTransactionId FullXidRelativeTo(FullTransactionId rel, + TransactionId xid); +static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); + +/* + * Report shared-memory space needed by CreateSharedProcArray. + */ +Size +ProcArrayShmemSize(void) +{ + Size size; + + /* Size of the ProcArray structure itself */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) + + size = offsetof(ProcArrayStruct, pgprocnos); + size = add_size(size, mul_size(sizeof(int), PROCARRAY_MAXPROCS)); + + /* + * During Hot Standby processing we have a data structure called + * KnownAssignedXids, created in shared memory. 
Local data structures are + * also created in various backends during GetSnapshotData(), + * TransactionIdIsInProgress() and GetRunningTransactionData(). All of the + * main structures created in those functions must be identically sized, + * since we may at times copy the whole of the data structures around. We + * refer to this size as TOTAL_MAX_CACHED_SUBXIDS. + * + * Ideally we'd only create this structure if we were actually doing hot + * standby in the current run, but we don't know that yet at the time + * shared memory is being set up. + */ +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + + if (EnableHotStandby) + { + size = add_size(size, + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS)); + size = add_size(size, + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); + } + + return size; +} + +/* + * Initialize the shared PGPROC array during postmaster startup. + */ +void +CreateSharedProcArray(void) +{ + bool found; + + /* Create or attach to the ProcArray shared structure */ + procArray = (ProcArrayStruct *) + ShmemInitStruct("Proc Array", + add_size(offsetof(ProcArrayStruct, pgprocnos), + mul_size(sizeof(int), + PROCARRAY_MAXPROCS)), + &found); + + if (!found) + { + /* + * We're the first - initialize. + */ + procArray->numProcs = 0; + procArray->maxProcs = PROCARRAY_MAXPROCS; + procArray->maxKnownAssignedXids = TOTAL_MAX_CACHED_SUBXIDS; + procArray->numKnownAssignedXids = 0; + procArray->tailKnownAssignedXids = 0; + procArray->headKnownAssignedXids = 0; + SpinLockInit(&procArray->known_assigned_xids_lck); + procArray->lastOverflowedXid = InvalidTransactionId; + procArray->replication_slot_xmin = InvalidTransactionId; + procArray->replication_slot_catalog_xmin = InvalidTransactionId; + ShmemVariableCache->xactCompletionCount = 1; + } + + allProcs = ProcGlobal->allProcs; + + /* Create or attach to the KnownAssignedXids arrays too, if needed */ + if (EnableHotStandby) + { + KnownAssignedXids = (TransactionId *) + ShmemInitStruct("KnownAssignedXids", + mul_size(sizeof(TransactionId), + TOTAL_MAX_CACHED_SUBXIDS), + &found); + KnownAssignedXidsValid = (bool *) + ShmemInitStruct("KnownAssignedXidsValid", + mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), + &found); + } +} + +/* + * Add the specified PGPROC to the shared array. + */ +void +ProcArrayAdd(PGPROC *proc) +{ + ProcArrayStruct *arrayP = procArray; + int index; + int movecount; + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + if (arrayP->numProcs >= arrayP->maxProcs) + { + /* + * Oops, no room. (This really shouldn't happen, since there is a + * fixed supply of PGPROC structs too, and so we should have failed + * earlier.) + */ + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + + /* + * Keep the procs array sorted by (PGPROC *) so that we can utilize + * locality of references much better. This is useful while traversing the + * ProcArray because there is an increased likelihood of finding the next + * PGPROC structure in the cache. 
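+	 *
+	 * (What is physically stored is each proc's pgprocno; since all PGPROCs
+	 * live in the single allProcs array, ascending pgprocno order is the
+	 * same thing as ascending PGPROC pointer order.)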
+ * + * Since the occurrence of adding/removing a proc is much lower than the + * access to the ProcArray itself, the overhead should be marginal + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int procno PG_USED_FOR_ASSERTS_ONLY = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index); + + /* If we have found our right position in the array, break */ + if (arrayP->pgprocnos[index] > proc->pgprocno) + break; + } + + movecount = arrayP->numProcs - index; + memmove(&arrayP->pgprocnos[index + 1], + &arrayP->pgprocnos[index], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[index + 1], + &ProcGlobal->xids[index], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[index + 1], + &ProcGlobal->subxidStates[index], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[index + 1], + &ProcGlobal->statusFlags[index], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[index] = proc->pgprocno; + proc->pgxactoff = index; + ProcGlobal->xids[index] = proc->xid; + ProcGlobal->subxidStates[index] = proc->subxidStatus; + ProcGlobal->statusFlags[index] = proc->statusFlags; + + arrayP->numProcs++; + + /* adjust pgxactoff for all following PGPROCs */ + index++; + for (; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff == index - 1); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + +/* + * Remove the specified PGPROC from the shared array. + * + * When latestXid is a valid XID, we are removing a live 2PC gxact from the + * array, and thus causing it to appear as "not running" anymore. In this + * case we must advance latestCompletedXid. (This is essentially the same + * as ProcArrayEndTransaction followed by removal of the PGPROC, but we take + * the ProcArrayLock only once, and don't damage the content of the PGPROC; + * twophase.c depends on the latter.) 
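+ *
+ * For example (a sketch, not a verbatim quote): proc.c detaches an ordinary
+ * backend with
+ *
+ *    ProcArrayRemove(MyProc, InvalidTransactionId);
+ *
+ * because the backend's XID has already been cleared by then, while
+ * twophase.c passes the gxact's latest XID when a prepared transaction
+ * finishes.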
+ */ +void +ProcArrayRemove(PGPROC *proc, TransactionId latestXid) +{ + ProcArrayStruct *arrayP = procArray; + int myoff; + int movecount; + +#ifdef XIDCACHE_DEBUG + /* dump stats at backend shutdown, but not prepared-xact end */ + if (proc->pid != 0) + DisplayXidCache(); +#endif + + /* See ProcGlobal comment explaining why both locks are held */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + LWLockAcquire(XidGenLock, LW_EXCLUSIVE); + + myoff = proc->pgxactoff; + + Assert(myoff >= 0 && myoff < arrayP->numProcs); + Assert(ProcGlobal->allProcs[arrayP->pgprocnos[myoff]].pgxactoff == myoff); + + if (TransactionIdIsValid(latestXid)) + { + Assert(TransactionIdIsValid(ProcGlobal->xids[myoff])); + + /* Advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + ProcGlobal->xids[myoff] = InvalidTransactionId; + ProcGlobal->subxidStates[myoff].overflowed = false; + ProcGlobal->subxidStates[myoff].count = 0; + } + else + { + /* Shouldn't be trying to remove a live transaction here */ + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + } + + Assert(!TransactionIdIsValid(ProcGlobal->xids[myoff])); + Assert(ProcGlobal->subxidStates[myoff].count == 0); + Assert(ProcGlobal->subxidStates[myoff].overflowed == false); + + ProcGlobal->statusFlags[myoff] = 0; + + /* Keep the PGPROC array sorted. See notes above */ + movecount = arrayP->numProcs - myoff - 1; + memmove(&arrayP->pgprocnos[myoff], + &arrayP->pgprocnos[myoff + 1], + movecount * sizeof(*arrayP->pgprocnos)); + memmove(&ProcGlobal->xids[myoff], + &ProcGlobal->xids[myoff + 1], + movecount * sizeof(*ProcGlobal->xids)); + memmove(&ProcGlobal->subxidStates[myoff], + &ProcGlobal->subxidStates[myoff + 1], + movecount * sizeof(*ProcGlobal->subxidStates)); + memmove(&ProcGlobal->statusFlags[myoff], + &ProcGlobal->statusFlags[myoff + 1], + movecount * sizeof(*ProcGlobal->statusFlags)); + + arrayP->pgprocnos[arrayP->numProcs - 1] = -1; /* for debugging */ + arrayP->numProcs--; + + /* + * Adjust pgxactoff of following procs for removed PGPROC (note that + * numProcs already has been decremented). + */ + for (int index = myoff; index < arrayP->numProcs; index++) + { + int procno = arrayP->pgprocnos[index]; + + Assert(procno >= 0 && procno < (arrayP->maxProcs + NUM_AUXILIARY_PROCS)); + Assert(allProcs[procno].pgxactoff - 1 == index); + + allProcs[procno].pgxactoff = index; + } + + /* + * Release in reversed acquisition order, to reduce frequency of having to + * wait for XidGenLock while holding ProcArrayLock. + */ + LWLockRelease(XidGenLock); + LWLockRelease(ProcArrayLock); +} + + +/* + * ProcArrayEndTransaction -- mark a transaction as no longer running + * + * This is used interchangeably for commit and abort cases. The transaction + * commit/abort must already be reported to WAL and pg_xact. + * + * proc is currently always MyProc, but we pass it explicitly for flexibility. + * latestXid is the latest Xid among the transaction's main XID and + * subtransactions, or InvalidTransactionId if it has no XID. (We must ask + * the caller to pass latestXid, instead of computing it from the PGPROC's + * contents, because the subxid information in the PGPROC might be + * incomplete.) 
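+ *
+ * For reference, the commit path uses this roughly as follows (a sketch,
+ * simplified from xact.c):
+ *
+ *    latestXid = RecordTransactionCommit();
+ *    ...
+ *    ProcArrayEndTransaction(MyProc, latestXid);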
+ */ +void +ProcArrayEndTransaction(PGPROC *proc, TransactionId latestXid) +{ + if (TransactionIdIsValid(latestXid)) + { + /* + * We must lock ProcArrayLock while clearing our advertised XID, so + * that we do not exit the set of "running" transactions while someone + * else is taking a snapshot. See discussion in + * src/backend/access/transam/README. + */ + Assert(TransactionIdIsValid(proc->xid)); + + /* + * If we can immediately acquire ProcArrayLock, we clear our own XID + * and release the lock. If not, use group XID clearing to improve + * efficiency. + */ + if (LWLockConditionalAcquire(ProcArrayLock, LW_EXCLUSIVE)) + { + ProcArrayEndTransactionInternal(proc, latestXid); + LWLockRelease(ProcArrayLock); + } + else + ProcArrayGroupClearXid(proc, latestXid); + } + else + { + /* + * If we have no XID, we don't need to lock, since we won't affect + * anyone else's calculation of a snapshot. We might change their + * estimate of global xmin, but that's OK. + */ + Assert(!TransactionIdIsValid(proc->xid)); + Assert(proc->subxidStatus.count == 0); + Assert(!proc->subxidStatus.overflowed); + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure these are cleared in abort */ + proc->delayChkpt = false; + proc->delayChkptEnd = false; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + Assert(!LWLockHeldByMe(ProcArrayLock)); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + Assert(proc->statusFlags == ProcGlobal->statusFlags[proc->pgxactoff]); + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + LWLockRelease(ProcArrayLock); + } + } +} + +/* + * Mark a write transaction as no longer running. + * + * We don't do any locking here; caller must handle that. + */ +static inline void +ProcArrayEndTransactionInternal(PGPROC *proc, TransactionId latestXid) +{ + int pgxactoff = proc->pgxactoff; + + /* + * Note: we need exclusive lock here because we're going to change other + * processes' PGPROC entries. 
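+	 *
+	 * (This matters because ProcArrayGroupClearXid() below has the group
+	 * leader call this function for other backends' PGPROCs, not only for
+	 * its own.)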
+ */ + Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE)); + Assert(TransactionIdIsValid(ProcGlobal->xids[pgxactoff])); + Assert(ProcGlobal->xids[pgxactoff] == proc->xid); + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + + /* be sure these are cleared in abort */ + proc->delayChkpt = false; + proc->delayChkptEnd = false; + + proc->recoveryConflictPending = false; + + /* must be cleared with xid/xmin: */ + /* avoid unnecessarily dirtying shared cachelines */ + if (proc->statusFlags & PROC_VACUUM_STATE_MASK) + { + proc->statusFlags &= ~PROC_VACUUM_STATE_MASK; + ProcGlobal->statusFlags[proc->pgxactoff] = proc->statusFlags; + } + + /* Clear the subtransaction-XID cache too while holding the lock */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* Same with xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; +} + +/* + * ProcArrayGroupClearXid -- group XID clearing + * + * When we cannot immediately acquire ProcArrayLock in exclusive mode at + * commit time, add ourselves to a list of processes that need their XIDs + * cleared. The first process to add itself to the list will acquire + * ProcArrayLock in exclusive mode and perform ProcArrayEndTransactionInternal + * on behalf of all group members. This avoids a great deal of contention + * around ProcArrayLock when many processes are trying to commit at once, + * since the lock need not be repeatedly handed off from one committing + * process to the next. + */ +static void +ProcArrayGroupClearXid(PGPROC *proc, TransactionId latestXid) +{ + PROC_HDR *procglobal = ProcGlobal; + uint32 nextidx; + uint32 wakeidx; + + /* We should definitely have an XID to clear. */ + Assert(TransactionIdIsValid(proc->xid)); + + /* Add ourselves to the list of processes needing a group XID clear. */ + proc->procArrayGroupMember = true; + proc->procArrayGroupMemberXid = latestXid; + nextidx = pg_atomic_read_u32(&procglobal->procArrayGroupFirst); + while (true) + { + pg_atomic_write_u32(&proc->procArrayGroupNext, nextidx); + + if (pg_atomic_compare_exchange_u32(&procglobal->procArrayGroupFirst, + &nextidx, + (uint32) proc->pgprocno)) + break; + } + + /* + * If the list was not empty, the leader will clear our XID. It is + * impossible to have followers without a leader because the first process + * that has added itself to the list will always have nextidx as + * INVALID_PGPROCNO. + */ + if (nextidx != INVALID_PGPROCNO) + { + int extraWaits = 0; + + /* Sleep until the leader clears our XID. 
*/ + pgstat_report_wait_start(WAIT_EVENT_PROCARRAY_GROUP_UPDATE); + for (;;) + { + /* acts as a read barrier */ + PGSemaphoreLock(proc->sem); + if (!proc->procArrayGroupMember) + break; + extraWaits++; + } + pgstat_report_wait_end(); + + Assert(pg_atomic_read_u32(&proc->procArrayGroupNext) == INVALID_PGPROCNO); + + /* Fix semaphore count for any absorbed wakeups */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + return; + } + + /* We are the leader. Acquire the lock on behalf of everyone. */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Now that we've got the lock, clear the list of processes waiting for + * group XID clearing, saving a pointer to the head of the list. Trying + * to pop elements one at a time could lead to an ABA problem. + */ + nextidx = pg_atomic_exchange_u32(&procglobal->procArrayGroupFirst, + INVALID_PGPROCNO); + + /* Remember head of list so we can perform wakeups after dropping lock. */ + wakeidx = nextidx; + + /* Walk the list and clear all XIDs. */ + while (nextidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[nextidx]; + + ProcArrayEndTransactionInternal(nextproc, nextproc->procArrayGroupMemberXid); + + /* Move to next proc in list. */ + nextidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + } + + /* We're done with the lock now. */ + LWLockRelease(ProcArrayLock); + + /* + * Now that we've released the lock, go back and wake everybody up. We + * don't do this under the lock so as to keep lock hold times to a + * minimum. The system calls we need to perform to wake other processes + * up are probably much slower than the simple memory writes we did while + * holding the lock. + */ + while (wakeidx != INVALID_PGPROCNO) + { + PGPROC *nextproc = &allProcs[wakeidx]; + + wakeidx = pg_atomic_read_u32(&nextproc->procArrayGroupNext); + pg_atomic_write_u32(&nextproc->procArrayGroupNext, INVALID_PGPROCNO); + + /* ensure all previous writes are visible before follower continues. */ + pg_write_barrier(); + + nextproc->procArrayGroupMember = false; + + if (nextproc != MyProc) + PGSemaphoreUnlock(nextproc->sem); + } +} + +/* + * ProcArrayClearTransaction -- clear the transaction fields + * + * This is used after successfully preparing a 2-phase transaction. We are + * not actually reporting the transaction's XID as no longer running --- it + * will still appear as running because the 2PC's gxact is in the ProcArray + * too. We just have to clear out our own PGPROC. + */ +void +ProcArrayClearTransaction(PGPROC *proc) +{ + int pgxactoff; + + /* + * Currently we need to lock ProcArrayLock exclusively here, as we + * increment xactCompletionCount below. We also need it at least in shared + * mode for pgproc->pgxactoff to stay the same below. + * + * We could however, as this action does not actually change anyone's view + * of the set of running XIDs (our entry is duplicate with the gxact that + * has already been inserted into the ProcArray), lower the lock level to + * shared if we were to make xactCompletionCount an atomic variable. But + * that doesn't seem worth it currently, as a 2PC commit is heavyweight + * enough for this not to be the bottleneck. 
If it ever becomes a + * bottleneck it may also be worth considering to combine this with the + * subsequent ProcArrayRemove() + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pgxactoff = proc->pgxactoff; + + ProcGlobal->xids[pgxactoff] = InvalidTransactionId; + proc->xid = InvalidTransactionId; + + proc->lxid = InvalidLocalTransactionId; + proc->xmin = InvalidTransactionId; + proc->recoveryConflictPending = false; + + Assert(!(proc->statusFlags & PROC_VACUUM_STATE_MASK)); + Assert(!proc->delayChkpt); + + /* + * Need to increment completion count even though transaction hasn't + * really committed yet. The reason for that is that GetSnapshotData() + * omits the xid of the current transaction, thus without the increment we + * otherwise could end up reusing the snapshot later. Which would be bad, + * because it might not count the prepared transaction as running. + */ + ShmemVariableCache->xactCompletionCount++; + + /* Clear the subtransaction-XID cache too */ + Assert(ProcGlobal->subxidStates[pgxactoff].count == proc->subxidStatus.count && + ProcGlobal->subxidStates[pgxactoff].overflowed == proc->subxidStatus.overflowed); + if (proc->subxidStatus.count > 0 || proc->subxidStatus.overflowed) + { + ProcGlobal->subxidStates[pgxactoff].count = 0; + ProcGlobal->subxidStates[pgxactoff].overflowed = false; + proc->subxidStatus.count = 0; + proc->subxidStatus.overflowed = false; + } + + LWLockRelease(ProcArrayLock); +} + +/* + * Update ShmemVariableCache->latestCompletedXid to point to latestXid if + * currently older. + */ +static void +MaintainLatestCompletedXid(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + + Assert(FullTransactionIdIsValid(cur_latest)); + Assert(!RecoveryInProgress()); + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(cur_latest, latestXid); + } + + Assert(IsBootstrapProcessingMode() || + FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * Same as MaintainLatestCompletedXid, except for use during WAL replay. + */ +static void +MaintainLatestCompletedXidRecovery(TransactionId latestXid) +{ + FullTransactionId cur_latest = ShmemVariableCache->latestCompletedXid; + FullTransactionId rel; + + Assert(AmStartupProcess() || !IsUnderPostmaster); + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Need a FullTransactionId to compare latestXid with. Can't rely on + * latestCompletedXid to be initialized in recovery. But in recovery it's + * safe to access nextXid without a lock for the startup process. + */ + rel = ShmemVariableCache->nextXid; + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + if (!FullTransactionIdIsValid(cur_latest) || + TransactionIdPrecedes(XidFromFullTransactionId(cur_latest), latestXid)) + { + ShmemVariableCache->latestCompletedXid = + FullXidRelativeTo(rel, latestXid); + } + + Assert(FullTransactionIdIsNormal(ShmemVariableCache->latestCompletedXid)); +} + +/* + * ProcArrayInitRecovery -- initialize recovery xid mgmt environment + * + * Remember up to where the startup process initialized the CLOG and subtrans + * so we can ensure it's initialized gaplessly up to the point where necessary + * while in recovery. 
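+ *
+ * The startup process is expected to call this once, before any WAL is
+ * replayed, roughly like (a sketch, simplified from xlog.c):
+ *
+ *    ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));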
+ */ +void +ProcArrayInitRecovery(TransactionId initializedUptoXID) +{ + Assert(standbyState == STANDBY_INITIALIZED); + Assert(TransactionIdIsNormal(initializedUptoXID)); + + /* + * we set latestObservedXid to the xid SUBTRANS has been initialized up + * to, so we can extend it from that point onwards in + * RecordKnownAssignedTransactionIds, and when we get consistent in + * ProcArrayApplyRecoveryInfo(). + */ + latestObservedXid = initializedUptoXID; + TransactionIdRetreat(latestObservedXid); +} + +/* + * ProcArrayApplyRecoveryInfo -- apply recovery info about xids + * + * Takes us through 3 states: Initialized, Pending and Ready. + * Normal case is to go all the way to Ready straight away, though there + * are atypical cases where we need to take it in steps. + * + * Use the data about running transactions on the primary to create the initial + * state of KnownAssignedXids. We also use these records to regularly prune + * KnownAssignedXids because we know it is possible that some transactions + * with FATAL errors fail to write abort records, which could cause eventual + * overflow. + * + * See comments for LogStandbySnapshot(). + */ +void +ProcArrayApplyRecoveryInfo(RunningTransactions running) +{ + TransactionId *xids; + int nxids; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(running->nextXid)); + Assert(TransactionIdIsValid(running->oldestRunningXid)); + Assert(TransactionIdIsNormal(running->latestCompletedXid)); + + /* + * Remove stale transactions, if any. + */ + ExpireOldKnownAssignedTransactionIds(running->oldestRunningXid); + + /* + * Remove stale locks, if any. + */ + StandbyReleaseOldLocks(running->oldestRunningXid); + + /* + * If our snapshot is already valid, nothing else to do... + */ + if (standbyState == STANDBY_SNAPSHOT_READY) + return; + + /* + * If our initial RunningTransactionsData had an overflowed snapshot then + * we knew we were missing some subxids from our snapshot. If we continue + * to see overflowed snapshots then we might never be able to start up, so + * we make another test to see if our snapshot is now valid. We know that + * the missing subxids are equal to or earlier than nextXid. After we + * initialise we continue to apply changes during recovery, so once the + * oldestRunningXid is later than the nextXid from the initial snapshot we + * know that we no longer have missing information and can mark the + * snapshot as valid. + */ + if (standbyState == STANDBY_SNAPSHOT_PENDING) + { + /* + * If the snapshot isn't overflowed or if its empty we can reset our + * pending state and use this snapshot instead. + */ + if (!running->subxid_overflow || running->xcnt == 0) + { + /* + * If we have already collected known assigned xids, we need to + * throw them away before we apply the recovery snapshot. + */ + KnownAssignedXidsReset(); + standbyState = STANDBY_INITIALIZED; + } + else + { + if (TransactionIdPrecedes(standbySnapshotPendingXmin, + running->oldestRunningXid)) + { + standbyState = STANDBY_SNAPSHOT_READY; + elog(trace_recovery(DEBUG1), + "recovery snapshots are now enabled"); + } + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); + return; + } + } + + Assert(standbyState == STANDBY_INITIALIZED); + + /* + * NB: this can be reached at least twice, so make sure new code can deal + * with that. 
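+	 *
+	 * (For example, if an earlier overflowed snapshot put us into
+	 * STANDBY_SNAPSHOT_PENDING, a later non-overflowed running-xacts record
+	 * resets standbyState to STANDBY_INITIALIZED above, and this
+	 * initialization runs again.)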
+ */ + + /* + * Nobody else is running yet, but take locks anyhow + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * KnownAssignedXids is sorted so we cannot just add the xids, we have to + * sort them first. + * + * Some of the new xids are top-level xids and some are subtransactions. + * We don't call SubTransSetParent because it doesn't matter yet. If we + * aren't overflowed then all xids will fit in snapshot and so we don't + * need subtrans. If we later overflow, an xid assignment record will add + * xids to subtrans. If RunningTransactionsData is overflowed then we + * don't have enough information to correctly update subtrans anyway. + */ + + /* + * Allocate a temporary array to avoid modifying the array passed as + * argument. + */ + xids = palloc(sizeof(TransactionId) * (running->xcnt + running->subxcnt)); + + /* + * Add to the temp array any xids which have not already completed. + */ + nxids = 0; + for (i = 0; i < running->xcnt + running->subxcnt; i++) + { + TransactionId xid = running->xids[i]; + + /* + * The running-xacts snapshot can contain xids that were still visible + * in the procarray when the snapshot was taken, but were already + * WAL-logged as completed. They're not running anymore, so ignore + * them. + */ + if (TransactionIdDidCommit(xid) || TransactionIdDidAbort(xid)) + continue; + + xids[nxids++] = xid; + } + + if (nxids > 0) + { + if (procArray->numKnownAssignedXids != 0) + { + LWLockRelease(ProcArrayLock); + elog(ERROR, "KnownAssignedXids is not empty"); + } + + /* + * Sort the array so that we can add them safely into + * KnownAssignedXids. + * + * We have to sort them logically, because in KnownAssignedXidsAdd we + * call TransactionIdFollowsOrEquals and so on. But we know these XIDs + * come from RUNNING_XACTS, which means there are only normal XIDs from + * the same epoch, so this is safe. + */ + qsort(xids, nxids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Add the sorted snapshot into KnownAssignedXids. The running-xacts + * snapshot may include duplicated xids because of prepared + * transactions, so ignore them. + */ + for (i = 0; i < nxids; i++) + { + if (i > 0 && TransactionIdEquals(xids[i - 1], xids[i])) + { + elog(DEBUG1, + "found duplicated transaction %u for KnownAssignedXids insertion", + xids[i]); + continue; + } + KnownAssignedXidsAdd(xids[i], xids[i], true); + } + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + } + + pfree(xids); + + /* + * latestObservedXid is at least set to the point where SUBTRANS was + * started up to (cf. ProcArrayInitRecovery()) or to the biggest xid + * RecordKnownAssignedTransactionIds() was called for. Initialize + * subtrans from thereon, up to nextXid - 1. + * + * We need to duplicate parts of RecordKnownAssignedTransactionId() here, + * because we've just added xids to the known assigned xids machinery that + * haven't gone through RecordKnownAssignedTransactionId(). + */ + Assert(TransactionIdIsNormal(latestObservedXid)); + TransactionIdAdvance(latestObservedXid); + while (TransactionIdPrecedes(latestObservedXid, running->nextXid)) + { + ExtendSUBTRANS(latestObservedXid); + TransactionIdAdvance(latestObservedXid); + } + TransactionIdRetreat(latestObservedXid); /* = running->nextXid - 1 */ + + /* ---------- + * Now we've got the running xids we need to set the global values that + * are used to track snapshots as they evolve further. 
+ * + * - latestCompletedXid which will be the xmax for snapshots + * - lastOverflowedXid which shows whether snapshots overflow + * - nextXid + * + * If the snapshot overflowed, then we still initialise with what we know, + * but the recovery snapshot isn't fully valid yet because we know there + * are some subxids missing. We don't know the specific subxids that are + * missing, so conservatively assume the last one is latestObservedXid. + * ---------- + */ + if (running->subxid_overflow) + { + standbyState = STANDBY_SNAPSHOT_PENDING; + + standbySnapshotPendingXmin = latestObservedXid; + procArray->lastOverflowedXid = latestObservedXid; + } + else + { + standbyState = STANDBY_SNAPSHOT_READY; + + standbySnapshotPendingXmin = InvalidTransactionId; + } + + /* + * If a transaction wrote a commit record in the gap between taking and + * logging the snapshot then latestCompletedXid may already be higher than + * the value from the snapshot, so check before we use the incoming value. + * It also might not yet be set at all. + */ + MaintainLatestCompletedXidRecovery(running->latestCompletedXid); + + /* + * NB: No need to increment ShmemVariableCache->xactCompletionCount here, + * nobody can see it yet. + */ + + LWLockRelease(ProcArrayLock); + + /* ShmemVariableCache->nextXid must be beyond any observed xid. */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + + Assert(FullTransactionIdIsValid(ShmemVariableCache->nextXid)); + + KnownAssignedXidsDisplay(trace_recovery(DEBUG3)); + if (standbyState == STANDBY_SNAPSHOT_READY) + elog(trace_recovery(DEBUG1), "recovery snapshots are now enabled"); + else + elog(trace_recovery(DEBUG1), + "recovery snapshot waiting for non-overflowed snapshot or " + "until oldest active xid on standby is at least %u (now %u)", + standbySnapshotPendingXmin, + running->oldestRunningXid); +} + +/* + * ProcArrayApplyXidAssignment + * Process an XLOG_XACT_ASSIGNMENT WAL record + */ +void +ProcArrayApplyXidAssignment(TransactionId topxid, + int nsubxids, TransactionId *subxids) +{ + TransactionId max_xid; + int i; + + Assert(standbyState >= STANDBY_INITIALIZED); + + max_xid = TransactionIdLatest(topxid, nsubxids, subxids); + + /* + * Mark all the subtransactions as observed. + * + * NOTE: This will fail if the subxid contains too many previously + * unobserved xids to fit into known-assigned-xids. That shouldn't happen + * as the code stands, because xid-assignment records should never contain + * more than PGPROC_MAX_CACHED_SUBXIDS entries. + */ + RecordKnownAssignedTransactionIds(max_xid); + + /* + * Notice that we update pg_subtrans with the top-level xid, rather than + * the parent xid. This is a difference between normal processing and + * recovery, yet is still correct in all cases. The reason is that + * subtransaction commit is not marked in clog until commit processing, so + * all aborted subtransactions have already been clearly marked in clog. + * As a result we are able to refer directly to the top-level + * transaction's state rather than skipping through all the intermediate + * states in the subtransaction tree. This should be the first time we + * have attempted to SubTransSetParent(). + */ + for (i = 0; i < nsubxids; i++) + SubTransSetParent(subxids[i], topxid); + + /* KnownAssignedXids isn't maintained yet, so we're done for now */ + if (standbyState == STANDBY_INITIALIZED) + return; + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Remove subxids from known-assigned-xacts. 
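+	 * Only the subtransaction XIDs are removed here; the top-level XID
+	 * stays in KnownAssignedXids until its commit or abort record is
+	 * replayed.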
+ */ + KnownAssignedXidsRemoveTree(InvalidTransactionId, nsubxids, subxids); + + /* + * Advance lastOverflowedXid to be at least the last of these subxids. + */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, max_xid)) + procArray->lastOverflowedXid = max_xid; + + LWLockRelease(ProcArrayLock); +} + +/* + * TransactionIdIsInProgress -- is given transaction running in some backend + * + * Aside from some shortcuts such as checking RecentXmin and our own Xid, + * there are four possibilities for finding a running transaction: + * + * 1. The given Xid is a main transaction Id. We will find this out cheaply + * by looking at ProcGlobal->xids. + * + * 2. The given Xid is one of the cached subxact Xids in the PGPROC array. + * We can find this out cheaply too. + * + * 3. In Hot Standby mode, we must search the KnownAssignedXids list to see + * if the Xid is running on the primary. + * + * 4. Search the SubTrans tree to find the Xid's topmost parent, and then see + * if that is running according to ProcGlobal->xids[] or KnownAssignedXids. + * This is the slowest way, but sadly it has to be done always if the others + * failed, unless we see that the cached subxact sets are complete (none have + * overflowed). + * + * ProcArrayLock has to be held while we do 1, 2, 3. If we save the top Xids + * while doing 1 and 3, we can release the ProcArrayLock while we do 4. + * This buys back some concurrency (and we can't retrieve the main Xids from + * ProcGlobal->xids[] again anyway; see GetNewTransactionId). + */ +bool +TransactionIdIsInProgress(TransactionId xid) +{ + static TransactionId *xids = NULL; + static TransactionId *other_xids; + XidCacheStatus *other_subxidstates; + int nxids = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId topxid; + TransactionId latestCompletedXid; + int mypgxactoff; + int numProcs; + int j; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. (Note: in particular, this guarantees that + * we reject InvalidTransactionId, FrozenTransactionId, etc as not + * running.) + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + { + xc_by_recent_xmin_inc(); + return false; + } + + /* + * We may have just checked the status of this transaction, so if it is + * already known to be completed, we can fall out without any access to + * shared memory. + */ + if (TransactionIdEquals(cachedXidIsNotInProgress, xid)) + { + xc_by_known_xact_inc(); + return false; + } + + /* + * Also, we can handle our own transaction (and subtransactions) without + * any access to shared memory. + */ + if (TransactionIdIsCurrentTransactionId(xid)) + { + xc_by_my_xact_inc(); + return true; + } + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. + */ + if (xids == NULL) + { + /* + * In hot standby mode, reserve enough space to hold all xids in the + * known-assigned list. If we later finish recovery, we no longer need + * the bigger array, but we don't bother to shrink it. + */ + int maxxids = RecoveryInProgress() ? 
TOTAL_MAX_CACHED_SUBXIDS : arrayP->maxProcs; + + xids = (TransactionId *) malloc(maxxids * sizeof(TransactionId)); + if (xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + other_xids = ProcGlobal->xids; + other_subxidstates = ProcGlobal->subxidStates; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + /* + * Now that we have the lock, we can check latestCompletedXid; if the + * target Xid is after that, it's surely still running. + */ + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + if (TransactionIdPrecedes(latestCompletedXid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_latest_xid_inc(); + return true; + } + + /* No shortcuts, gotta grovel through the array */ + mypgxactoff = MyProc->pgxactoff; + numProcs = arrayP->numProcs; + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + int pgprocno; + PGPROC *proc; + TransactionId pxid; + int pxids; + + /* Ignore ourselves --- dealt with it above */ + if (pgxactoff == mypgxactoff) + continue; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + + if (!TransactionIdIsValid(pxid)) + continue; + + /* + * Step 1: check the main Xid + */ + if (TransactionIdEquals(pxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_main_xid_inc(); + return true; + } + + /* + * We can ignore main Xids that are younger than the target Xid, since + * the target could not possibly be their child. + */ + if (TransactionIdPrecedes(xid, pxid)) + continue; + + /* + * Step 2: check the cached child-Xids arrays + */ + pxids = other_subxidstates[pgxactoff].count; + pg_read_barrier(); /* pairs with barrier in GetNewTransactionId() */ + pgprocno = arrayP->pgprocnos[pgxactoff]; + proc = &allProcs[pgprocno]; + for (j = pxids - 1; j >= 0; j--) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId cxid = UINT32_ACCESS_ONCE(proc->subxids.xids[j]); + + if (TransactionIdEquals(cxid, xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_child_xid_inc(); + return true; + } + } + + /* + * Save the main Xid for step 4. We only need to remember main Xids + * that have uncached children. (Note: there is no race condition + * here because the overflowed flag cannot be cleared, only set, while + * we hold ProcArrayLock. So we can't miss an Xid that we need to + * worry about.) + */ + if (other_subxidstates[pgxactoff].overflowed) + xids[nxids++] = pxid; + } + + /* + * Step 3: in hot standby mode, check the known-assigned-xids list. XIDs + * in the list must be treated as running. + */ + if (RecoveryInProgress()) + { + /* none of the PGPROC entries should have XIDs in hot standby mode */ + Assert(nxids == 0); + + if (KnownAssignedXidExists(xid)) + { + LWLockRelease(ProcArrayLock); + xc_by_known_assigned_inc(); + return true; + } + + /* + * If the KnownAssignedXids overflowed, we have to check pg_subtrans + * too. Fetch all xids from KnownAssignedXids that are lower than + * xid, since if xid is a subtransaction its parent will always have a + * lower value. Note we will collect both main and subXIDs here, but + * there's no help for it. + */ + if (TransactionIdPrecedesOrEquals(xid, procArray->lastOverflowedXid)) + nxids = KnownAssignedXidsGet(xids, xid); + } + + LWLockRelease(ProcArrayLock); + + /* + * If none of the relevant caches overflowed, we know the Xid is not + * running without even looking at pg_subtrans. 
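+	 *
+	 * (nxids == 0 means we collected no candidate parent XIDs above: no
+	 * subxid cache had overflowed and, in hot standby, xid is newer than
+	 * lastOverflowedXid.)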
+ */ + if (nxids == 0) + { + xc_no_overflow_inc(); + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * Step 4: have to check pg_subtrans. + * + * At this point, we know it's either a subtransaction of one of the Xids + * in xids[], or it's not running. If it's an already-failed + * subtransaction, we want to say "not running" even though its parent may + * still be running. So first, check pg_xact to see if it's been aborted. + */ + xc_slow_answer_inc(); + + if (TransactionIdDidAbort(xid)) + { + cachedXidIsNotInProgress = xid; + return false; + } + + /* + * It isn't aborted, so check whether the transaction tree it belongs to + * is still running (or, more precisely, whether it was running when we + * held ProcArrayLock). + */ + topxid = SubTransGetTopmostTransaction(xid); + Assert(TransactionIdIsValid(topxid)); + if (!TransactionIdEquals(topxid, xid)) + { + for (int i = 0; i < nxids; i++) + { + if (TransactionIdEquals(xids[i], topxid)) + return true; + } + } + + cachedXidIsNotInProgress = xid; + return false; +} + +/* + * TransactionIdIsActive -- is xid the top-level XID of an active backend? + * + * This differs from TransactionIdIsInProgress in that it ignores prepared + * transactions, as well as transactions running on the primary if we're in + * hot standby. Also, we ignore subtransactions since that's not needed + * for current uses. + */ +bool +TransactionIdIsActive(TransactionId xid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int i; + + /* + * Don't bother checking a transaction older than RecentXmin; it could not + * possibly still be running. + */ + if (TransactionIdPrecedes(xid, RecentXmin)) + return false; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < arrayP->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + TransactionId pxid; + + /* Fetch xid just once - see GetNewTransactionId */ + pxid = UINT32_ACCESS_ONCE(other_xids[i]); + + if (!TransactionIdIsValid(pxid)) + continue; + + if (proc->pid == 0) + continue; /* ignore prepared transactions */ + + if (TransactionIdEquals(pxid, xid)) + { + result = true; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + + +/* + * Determine XID horizons. + * + * This is used by wrapper functions like GetOldestNonRemovableTransactionId() + * (for VACUUM), GetReplicationHorizons() (for hot_standby_feedback), etc as + * well as "internally" by GlobalVisUpdate() (see comment above struct + * GlobalVisState). + * + * See the definition of ComputeXidHorizonsResult for the various computed + * horizons. + * + * For VACUUM separate horizons (used to decide which deleted tuples must + * be preserved), for shared and non-shared tables are computed. For shared + * relations backends in all databases must be considered, but for non-shared + * relations that's not required, since only backends in my own database could + * ever see the tuples in them. Also, we can ignore concurrently running lazy + * VACUUMs because (a) they must be working on other tables, and (b) they + * don't need to do snapshot-based lookups. + * + * This also computes a horizon used to truncate pg_subtrans. For that + * backends in all databases have to be considered, and concurrently running + * lazy VACUUMs cannot be ignored, as they still may perform pg_subtrans + * accesses. + * + * Note: we include all currently running xids in the set of considered xids. 
+ * This ensures that if a just-started xact has not yet set its snapshot, + * when it does set the snapshot it cannot set xmin less than what we compute. + * See notes in src/backend/access/transam/README. + * + * Note: despite the above, it's possible for the calculated values to move + * backwards on repeated calls. The calculated values are conservative, so + * that anything older is definitely not considered as running by anyone + * anymore, but the exact values calculated depend on a number of things. For + * example, if there are no transactions running in the current database, the + * horizon for normal tables will be latestCompletedXid. If a transaction + * begins after that, its xmin will include in-progress transactions in other + * databases that started earlier, so another call will return a lower value. + * Nonetheless it is safe to vacuum a table in the current database with the + * first result. There are also replication-related effects: a walsender + * process can set its xmin based on transactions that are no longer running + * on the primary but are still being replayed on the standby, thus possibly + * making the values go backwards. In this case there is a possibility that + * we lose data that the standby would like to have, but unless the standby + * uses a replication slot to make its xmin persistent there is little we can + * do about that --- data is only protected if the walsender runs continuously + * while queries are executed on the standby. (The Hot Standby code deals + * with such cases by failing standby queries that needed to access + * already-removed data, so there's no integrity bug.) The computed values + * are also adjusted with vacuum_defer_cleanup_age, so increasing that setting + * on the fly is another easy way to make horizons move backwards, with no + * consequences for data integrity. + * + * Note: the approximate horizons (see definition of GlobalVisState) are + * updated by the computations done here. That's currently required for + * correctness and a small optimization. Without doing so it's possible that + * heap vacuum's call to heap_page_prune() uses a more conservative horizon + * than later when deciding which tuples can be removed - which the code + * doesn't expect (breaking HOT). + */ +static void +ComputeXidHorizons(ComputeXidHorizonsResult *h) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId kaxmin; + bool in_recovery = RecoveryInProgress(); + TransactionId *other_xids = ProcGlobal->xids; + + /* inferred after ProcArrayLock is released */ + h->catalog_oldest_nonremovable = InvalidTransactionId; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + h->latest_completed = ShmemVariableCache->latestCompletedXid; + + /* + * We initialize the MIN() calculation with latestCompletedXid + 1. This + * is a lower bound for the XIDs that might appear in the ProcArray later, + * and so protects us against overestimating the result due to future + * additions. + */ + { + TransactionId initial; + + initial = XidFromFullTransactionId(h->latest_completed); + Assert(TransactionIdIsValid(initial)); + TransactionIdAdvance(initial); + + h->oldest_considered_running = initial; + h->shared_oldest_nonremovable = initial; + h->data_oldest_nonremovable = initial; + + /* + * Only modifications made by this backend affect the horizon for + * temporary relations. Instead of a check in each iteration of the + * loop over all PGPROCs it is cheaper to just initialize to the + * current top-level xid any. 
+ * + * Without an assigned xid we could use a horizon as aggressive as + * ReadNewTransactionid(), but we can get away with the much cheaper + * latestCompletedXid + 1: If this backend has no xid there, by + * definition, can't be any newer changes in the temp table than + * latestCompletedXid. + */ + if (TransactionIdIsValid(MyProc->xid)) + h->temp_oldest_nonremovable = MyProc->xid; + else + h->temp_oldest_nonremovable = initial; + } + + /* + * Fetch slot horizons while ProcArrayLock is held - the + * LWLockAcquire/LWLockRelease are a barrier, ensuring this happens inside + * the lock. + */ + h->slot_xmin = procArray->replication_slot_xmin; + h->slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + for (int index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int8 statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + TransactionId xmin; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + xmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * Consider both the transaction's Xmin, and its Xid. + * + * We must check both because a transaction might have an Xmin but not + * (yet) an Xid; conversely, if it has an Xid, that could determine + * some not-yet-set Xmin. + */ + xmin = TransactionIdOlder(xmin, xid); + + /* if neither is set, this proc doesn't influence the horizon */ + if (!TransactionIdIsValid(xmin)) + continue; + + /* + * Don't ignore any procs when determining which transactions might be + * considered running. While slots should ensure logical decoding + * backends are protected even without this check, it can't hurt to + * include them here as well.. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, xmin); + + /* + * Skip over backends either vacuuming (which is ok with rows being + * removed, as long as pg_subtrans is not truncated) or doing logical + * decoding (which manages xmin separately, check below). + */ + if (statusFlags & (PROC_IN_VACUUM | PROC_IN_LOGICAL_DECODING)) + continue; + + /* shared tables need to take backends in all databases into account */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, xmin); + + /* + * Normally queries in other databases are ignored for anything but + * the shared horizon. But in recovery we cannot compute an accurate + * per-database horizon as all xids are managed via the + * KnownAssignedXids machinery. + * + * Be careful to compute a pessimistic value when MyDatabaseId is not + * set. If this is a backend in the process of starting up, we may not + * use a "too aggressive" horizon (otherwise we could end up using it + * to prune still needed data away). If the current backend never + * connects to a database that is harmless, because + * data_oldest_nonremovable will never be utilized. + */ + if (in_recovery || + MyDatabaseId == InvalidOid || proc->databaseId == MyDatabaseId || + proc->databaseId == 0) /* always include WalSender */ + { + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, xmin); + } + } + + /* + * If in recovery fetch oldest xid in KnownAssignedXids, will be applied + * after lock is released. + */ + if (in_recovery) + kaxmin = KnownAssignedXidsGetOldestXmin(); + + /* + * No other information from shared state is needed, release the lock + * immediately. The rest of the computations can be done without a lock. 
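+ *
+ * The adjustments below lean on two small helpers defined elsewhere in
+ * this file; as a rough sketch of their behavior:
+ *
+ *     TransactionIdOlder(a, b)         - return the older of a and b,
+ *                                        treating an invalid xid as
+ *                                        "no constraint"
+ *     TransactionIdRetreatedBy(xid, n) - return xid - n, stepping over
+ *                                        the special (non-normal) xids
+ *                                        if the subtraction wraps around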
+ */ + LWLockRelease(ProcArrayLock); + + if (in_recovery) + { + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, kaxmin); + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, kaxmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, kaxmin); + /* temp relations cannot be accessed in recovery */ + } + else + { + /* + * Compute the cutoff XID by subtracting vacuum_defer_cleanup_age. + * + * vacuum_defer_cleanup_age provides some additional "slop" for the + * benefit of hot standby queries on standby servers. This is quick + * and dirty, and perhaps not all that useful unless the primary has a + * predictable transaction rate, but it offers some protection when + * there's no walsender connection. Note that we are assuming + * vacuum_defer_cleanup_age isn't large enough to cause wraparound --- + * so guc.c should limit it to no more than the xidStopLimit threshold + * in varsup.c. Also note that we intentionally don't apply + * vacuum_defer_cleanup_age on standby servers. + */ + h->oldest_considered_running = + TransactionIdRetreatedBy(h->oldest_considered_running, + vacuum_defer_cleanup_age); + h->shared_oldest_nonremovable = + TransactionIdRetreatedBy(h->shared_oldest_nonremovable, + vacuum_defer_cleanup_age); + h->data_oldest_nonremovable = + TransactionIdRetreatedBy(h->data_oldest_nonremovable, + vacuum_defer_cleanup_age); + /* defer doesn't apply to temp relations */ + } + + /* + * Check whether there are replication slots requiring an older xmin. + */ + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, h->slot_xmin); + h->data_oldest_nonremovable = + TransactionIdOlder(h->data_oldest_nonremovable, h->slot_xmin); + + /* + * The only difference between catalog / data horizons is that the slot's + * catalog xmin is applied to the catalog one (so catalogs can be accessed + * for logical decoding). Initialize with data horizon, and then back up + * further if necessary. Have to back up the shared horizon as well, since + * that also can contain catalogs. + */ + h->shared_oldest_nonremovable_raw = h->shared_oldest_nonremovable; + h->shared_oldest_nonremovable = + TransactionIdOlder(h->shared_oldest_nonremovable, + h->slot_catalog_xmin); + h->catalog_oldest_nonremovable = h->data_oldest_nonremovable; + h->catalog_oldest_nonremovable = + TransactionIdOlder(h->catalog_oldest_nonremovable, + h->slot_catalog_xmin); + + /* + * It's possible that slots / vacuum_defer_cleanup_age backed up the + * horizons further than oldest_considered_running. Fix. + */ + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->shared_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->catalog_oldest_nonremovable); + h->oldest_considered_running = + TransactionIdOlder(h->oldest_considered_running, + h->data_oldest_nonremovable); + + /* + * shared horizons have to be at least as old as the oldest visible in + * current db + */ + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->shared_oldest_nonremovable, + h->catalog_oldest_nonremovable)); + + /* + * Horizons need to ensure that pg_subtrans access is still possible for + * the relevant backends. 
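+ *
+ * Concretely, oldest_considered_running may not be newer than any of
+ * the other horizons (nor than a valid slot xmin), which is what the
+ * assertions below verify:
+ *
+ *     oldest_considered_running <= shared_oldest_nonremovable
+ *     oldest_considered_running <= catalog_oldest_nonremovable
+ *     oldest_considered_running <= data_oldest_nonremovable
+ *     oldest_considered_running <= temp_oldest_nonremovable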
+ */ + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->shared_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->catalog_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->data_oldest_nonremovable)); + Assert(TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->temp_oldest_nonremovable)); + Assert(!TransactionIdIsValid(h->slot_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_xmin)); + Assert(!TransactionIdIsValid(h->slot_catalog_xmin) || + TransactionIdPrecedesOrEquals(h->oldest_considered_running, + h->slot_catalog_xmin)); + + /* update approximate horizons with the computed horizons */ + GlobalVisUpdateApply(h); +} + +/* + * Determine what kind of visibility horizon needs to be used for a + * relation. If rel is NULL, the most conservative horizon is used. + */ +static inline GlobalVisHorizonKind +GlobalVisHorizonKindForRel(Relation rel) +{ + /* + * Other relkkinds currently don't contain xids, nor always the necessary + * logical decoding markers. + */ + Assert(!rel || + rel->rd_rel->relkind == RELKIND_RELATION || + rel->rd_rel->relkind == RELKIND_MATVIEW || + rel->rd_rel->relkind == RELKIND_TOASTVALUE); + + if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress()) + return VISHORIZON_SHARED; + else if (IsCatalogRelation(rel) || + RelationIsAccessibleInLogicalDecoding(rel)) + return VISHORIZON_CATALOG; + else if (!RELATION_IS_LOCAL(rel)) + return VISHORIZON_DATA; + else + return VISHORIZON_TEMP; +} + +/* + * Return the oldest XID for which deleted tuples must be preserved in the + * passed table. + * + * If rel is not NULL the horizon may be considerably more recent than + * otherwise (i.e. fewer tuples will be removable). In the NULL case a horizon + * that is correct (but not optimal) for all relations will be returned. + * + * This is used by VACUUM to decide which deleted tuples must be preserved in + * the passed in table. + */ +TransactionId +GetOldestNonRemovableTransactionId(Relation rel) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + return horizons.shared_oldest_nonremovable; + case VISHORIZON_CATALOG: + return horizons.catalog_oldest_nonremovable; + case VISHORIZON_DATA: + return horizons.data_oldest_nonremovable; + case VISHORIZON_TEMP: + return horizons.temp_oldest_nonremovable; + } + + return InvalidTransactionId; +} + +/* + * Return the oldest transaction id any currently running backend might still + * consider running. This should not be used for visibility / pruning + * determinations (see GetOldestNonRemovableTransactionId()), but for + * decisions like up to where pg_subtrans can be truncated. + */ +TransactionId +GetOldestTransactionIdConsideredRunning(void) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + return horizons.oldest_considered_running; +} + +/* + * Return the visibility horizons for a hot standby feedback message. + */ +void +GetReplicationHorizons(TransactionId *xmin, TransactionId *catalog_xmin) +{ + ComputeXidHorizonsResult horizons; + + ComputeXidHorizons(&horizons); + + /* + * Don't want to use shared_oldest_nonremovable here, as that contains the + * effect of replication slot's catalog_xmin. We want to send a separate + * feedback for the catalog horizon, so the primary can remove data table + * contents more aggressively. 
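+ *
+ * A minimal sketch of the expected caller - the hot_standby_feedback
+ * sender in walreceiver.c, which also attaches the xid epochs before
+ * putting the values on the wire:
+ *
+ *     TransactionId xmin;
+ *     TransactionId catalog_xmin;
+ *
+ *     GetReplicationHorizons(&xmin, &catalog_xmin);
+ *     ... send xmin and catalog_xmin in the feedback message ...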
+ */ + *xmin = horizons.shared_oldest_nonremovable_raw; + *catalog_xmin = horizons.slot_catalog_xmin; +} + +/* + * GetMaxSnapshotXidCount -- get max size for snapshot XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotXidCount(void) +{ + return procArray->maxProcs; +} + +/* + * GetMaxSnapshotSubxidCount -- get max size for snapshot sub-XID array + * + * We have to export this for use by snapmgr.c. + */ +int +GetMaxSnapshotSubxidCount(void) +{ + return TOTAL_MAX_CACHED_SUBXIDS; +} + +/* + * Initialize old_snapshot_threshold specific parts of a newly build snapshot. + */ +static void +GetSnapshotDataInitOldSnapshot(Snapshot snapshot) +{ + if (!OldSnapshotThresholdActive()) + { + /* + * If not using "snapshot too old" feature, fill related fields with + * dummy values that don't require any locking. + */ + snapshot->lsn = InvalidXLogRecPtr; + snapshot->whenTaken = 0; + } + else + { + /* + * Capture the current time and WAL stream location in case this + * snapshot becomes old enough to need to fall back on the special + * "old snapshot" logic. + */ + snapshot->lsn = GetXLogInsertRecPtr(); + snapshot->whenTaken = GetSnapshotCurrentTimestamp(); + MaintainOldSnapshotTimeMapping(snapshot->whenTaken, snapshot->xmin); + } +} + +/* + * Helper function for GetSnapshotData() that checks if the bulk of the + * visibility information in the snapshot is still valid. If so, it updates + * the fields that need to change and returns true. Otherwise it returns + * false. + * + * This very likely can be evolved to not need ProcArrayLock held (at very + * least in the case we already hold a snapshot), but that's for another day. + */ +static bool +GetSnapshotDataReuse(Snapshot snapshot) +{ + uint64 curXactCompletionCount; + + Assert(LWLockHeldByMe(ProcArrayLock)); + + if (unlikely(snapshot->snapXactCompletionCount == 0)) + return false; + + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + if (curXactCompletionCount != snapshot->snapXactCompletionCount) + return false; + + /* + * If the current xactCompletionCount is still the same as it was at the + * time the snapshot was built, we can be sure that rebuilding the + * contents of the snapshot the hard way would result in the same snapshot + * contents: + * + * As explained in transam/README, the set of xids considered running by + * GetSnapshotData() cannot change while ProcArrayLock is held. Snapshot + * contents only depend on transactions with xids and xactCompletionCount + * is incremented whenever a transaction with an xid finishes (while + * holding ProcArrayLock) exclusively). Thus the xactCompletionCount check + * ensures we would detect if the snapshot would have changed. + * + * As the snapshot contents are the same as it was before, it is safe to + * re-enter the snapshot's xmin into the PGPROC array. None of the rows + * visible under the snapshot could already have been removed (that'd + * require the set of running transactions to change) and it fulfills the + * requirement that concurrent GetSnapshotData() calls yield the same + * xmin. 
+ */ + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = snapshot->xmin; + + RecentXmin = snapshot->xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->curcid = GetCurrentCommandId(false); + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return true; +} + +/* + * GetSnapshotData -- returns information about running transactions. + * + * The returned snapshot includes xmin (lowest still-running xact ID), + * xmax (highest completed xact ID + 1), and a list of running xact IDs + * in the range xmin <= xid < xmax. It is used as follows: + * All xact IDs < xmin are considered finished. + * All xact IDs >= xmax are considered still running. + * For an xact ID xmin <= xid < xmax, consult list to see whether + * it is considered running or not. + * This ensures that the set of transactions seen as "running" by the + * current xact will not change after it takes the snapshot. + * + * All running top-level XIDs are included in the snapshot, except for lazy + * VACUUM processes. We also try to include running subtransaction XIDs, + * but since PGPROC has only a limited cache area for subxact XIDs, full + * information may not be available. If we find any overflowed subxid arrays, + * we have to mark the snapshot's subxid data as overflowed, and extra work + * *may* need to be done to determine what's running (see XidInMVCCSnapshot() + * in heapam_visibility.c). + * + * We also update the following backend-global variables: + * TransactionXmin: the oldest xmin of any snapshot in use in the + * current transaction (this is the same as MyProc->xmin). + * RecentXmin: the xmin computed for the most recent snapshot. XIDs + * older than this are known not running any more. + * + * And try to advance the bounds of GlobalVis{Shared,Catalog,Data,Temp}Rels + * for the benefit of the GlobalVisTest* family of functions. + * + * Note: this function should probably not be called with an argument that's + * not statically allocated (see xip allocation below). + */ +Snapshot +GetSnapshotData(Snapshot snapshot) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId xmin; + TransactionId xmax; + int count = 0; + int subcount = 0; + bool suboverflowed = false; + FullTransactionId latest_completed; + TransactionId oldestxid; + int mypgxactoff; + TransactionId myxid; + uint64 curXactCompletionCount; + + TransactionId replication_slot_xmin = InvalidTransactionId; + TransactionId replication_slot_catalog_xmin = InvalidTransactionId; + + Assert(snapshot != NULL); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * This does open a possibility for avoiding repeated malloc/free: since + * maxProcs does not change at runtime, we can simply reuse the previous + * xip arrays if any. (This relies on the fact that all callers pass + * static SnapshotData structs.) + */ + if (snapshot->xip == NULL) + { + /* + * First call for this snapshot. Snapshot is same size whether or not + * we are in recovery, see later comments. 
+ */ + snapshot->xip = (TransactionId *) + malloc(GetMaxSnapshotXidCount() * sizeof(TransactionId)); + if (snapshot->xip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + Assert(snapshot->subxip == NULL); + snapshot->subxip = (TransactionId *) + malloc(GetMaxSnapshotSubxidCount() * sizeof(TransactionId)); + if (snapshot->subxip == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + /* + * It is sufficient to get shared lock on ProcArrayLock, even if we are + * going to set MyProc->xmin. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (GetSnapshotDataReuse(snapshot)) + { + LWLockRelease(ProcArrayLock); + return snapshot; + } + + latest_completed = ShmemVariableCache->latestCompletedXid; + mypgxactoff = MyProc->pgxactoff; + myxid = other_xids[mypgxactoff]; + Assert(myxid == MyProc->xid); + + oldestxid = ShmemVariableCache->oldestXid; + curXactCompletionCount = ShmemVariableCache->xactCompletionCount; + + /* xmax is always latestCompletedXid + 1 */ + xmax = XidFromFullTransactionId(latest_completed); + TransactionIdAdvance(xmax); + Assert(TransactionIdIsNormal(xmax)); + + /* initialize xmin calculation with xmax */ + xmin = xmax; + + /* take own xid into account, saves a check inside the loop */ + if (TransactionIdIsNormal(myxid) && NormalTransactionIdPrecedes(myxid, xmin)) + xmin = myxid; + + snapshot->takenDuringRecovery = RecoveryInProgress(); + + if (!snapshot->takenDuringRecovery) + { + int numProcs = arrayP->numProcs; + TransactionId *xip = snapshot->xip; + int *pgprocnos = arrayP->pgprocnos; + XidCacheStatus *subxidStates = ProcGlobal->subxidStates; + uint8 *allStatusFlags = ProcGlobal->statusFlags; + + /* + * First collect set of pgxactoff/xids that need to be included in the + * snapshot. + */ + for (int pgxactoff = 0; pgxactoff < numProcs; pgxactoff++) + { + /* Fetch xid just once - see GetNewTransactionId */ + TransactionId xid = UINT32_ACCESS_ONCE(other_xids[pgxactoff]); + uint8 statusFlags; + + Assert(allProcs[arrayP->pgprocnos[pgxactoff]].pgxactoff == pgxactoff); + + /* + * If the transaction has no XID assigned, we can skip it; it + * won't have sub-XIDs either. + */ + if (likely(xid == InvalidTransactionId)) + continue; + + /* + * We don't include our own XIDs (if any) in the snapshot. It + * needs to be includeded in the xmin computation, but we did so + * outside the loop. + */ + if (pgxactoff == mypgxactoff) + continue; + + /* + * The only way we are able to get here with a non-normal xid is + * during bootstrap - with this backend using + * BootstrapTransactionId. But the above test should filter that + * out. + */ + Assert(TransactionIdIsNormal(xid)); + + /* + * If the XID is >= xmax, we can skip it; such transactions will + * be treated as running anyway (and any sub-XIDs will also be >= + * xmax). + */ + if (!NormalTransactionIdPrecedes(xid, xmax)) + continue; + + /* + * Skip over backends doing logical decoding which manages xmin + * separately (check below) and ones running LAZY VACUUM. + */ + statusFlags = allStatusFlags[pgxactoff]; + if (statusFlags & (PROC_IN_LOGICAL_DECODING | PROC_IN_VACUUM)) + continue; + + if (NormalTransactionIdPrecedes(xid, xmin)) + xmin = xid; + + /* Add XID to snapshot. */ + xip[count++] = xid; + + /* + * Save subtransaction XIDs if possible (if we've already + * overflowed, there's no point). Note that the subxact XIDs must + * be later than their parent, so no need to check them against + * xmin. 
We could filter against xmax, but it seems better not to + * do that much work while holding the ProcArrayLock. + * + * The other backend can add more subxids concurrently, but cannot + * remove any. Hence it's important to fetch nxids just once. + * Should be safe to use memcpy, though. (We needn't worry about + * missing any xids added concurrently, because they must postdate + * xmax.) + * + * Again, our own XIDs are not included in the snapshot. + */ + if (!suboverflowed) + { + + if (subxidStates[pgxactoff].overflowed) + suboverflowed = true; + else + { + int nsubxids = subxidStates[pgxactoff].count; + + if (nsubxids > 0) + { + int pgprocno = pgprocnos[pgxactoff]; + PGPROC *proc = &allProcs[pgprocno]; + + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(snapshot->subxip + subcount, + (void *) proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + subcount += nsubxids; + } + } + } + } + } + else + { + /* + * We're in hot standby, so get XIDs from KnownAssignedXids. + * + * We store all xids directly into subxip[]. Here's why: + * + * In recovery we don't know which xids are top-level and which are + * subxacts, a design choice that greatly simplifies xid processing. + * + * It seems like we would want to try to put xids into xip[] only, but + * that is fairly small. We would either need to make that bigger or + * to increase the rate at which we WAL-log xid assignment; neither is + * an appealing choice. + * + * We could try to store xids into xip[] first and then into subxip[] + * if there are too many xids. That only works if the snapshot doesn't + * overflow because we do not search subxip[] in that case. A simpler + * way is to just store all xids in the subxact array because this is + * by far the bigger array. We just leave the xip array empty. + * + * Either way we need to change the way XidInMVCCSnapshot() works + * depending upon when the snapshot was taken, or change normal + * snapshot processing so it matches. + * + * Note: It is possible for recovery to end before we finish taking + * the snapshot, and for newly assigned transaction ids to be added to + * the ProcArray. xmax cannot change while we hold ProcArrayLock, so + * those newly added transaction ids would be filtered away, so we + * need not be concerned about them. + */ + subcount = KnownAssignedXidsGetAndSetXmin(snapshot->subxip, &xmin, + xmax); + + if (TransactionIdPrecedesOrEquals(xmin, procArray->lastOverflowedXid)) + suboverflowed = true; + } + + + /* + * Fetch into local variable while ProcArrayLock is held - the + * LWLockRelease below is a barrier, ensuring this happens inside the + * lock. + */ + replication_slot_xmin = procArray->replication_slot_xmin; + replication_slot_catalog_xmin = procArray->replication_slot_catalog_xmin; + + if (!TransactionIdIsValid(MyProc->xmin)) + MyProc->xmin = TransactionXmin = xmin; + + LWLockRelease(ProcArrayLock); + + /* maintain state for GlobalVis* */ + { + TransactionId def_vis_xid; + TransactionId def_vis_xid_data; + FullTransactionId def_vis_fxid; + FullTransactionId def_vis_fxid_data; + FullTransactionId oldestfxid; + + /* + * Converting oldestXid is only safe when xid horizon cannot advance, + * i.e. holding locks. While we don't hold the lock anymore, all the + * necessary data has been gathered with lock held. 
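+ *
+ * FullXidRelativeTo(rel, xid) widens the 32-bit xid into a 64-bit
+ * FullTransactionId by anchoring it to rel's epoch, roughly
+ *
+ *     U64(rel) + (int32) (xid - XidFromFullTransactionId(rel))
+ *
+ * which is only meaningful while xid cannot be further than ~2^31
+ * transactions from rel - hence the requirement that the inputs were
+ * captured under the lock.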
+ */ + oldestfxid = FullXidRelativeTo(latest_completed, oldestxid); + + /* apply vacuum_defer_cleanup_age */ + def_vis_xid_data = + TransactionIdRetreatedBy(xmin, vacuum_defer_cleanup_age); + + /* Check whether there's a replication slot requiring an older xmin. */ + def_vis_xid_data = + TransactionIdOlder(def_vis_xid_data, replication_slot_xmin); + + /* + * Rows in non-shared, non-catalog tables possibly could be vacuumed + * if older than this xid. + */ + def_vis_xid = def_vis_xid_data; + + /* + * Check whether there's a replication slot requiring an older catalog + * xmin. + */ + def_vis_xid = + TransactionIdOlder(replication_slot_catalog_xmin, def_vis_xid); + + def_vis_fxid = FullXidRelativeTo(latest_completed, def_vis_xid); + def_vis_fxid_data = FullXidRelativeTo(latest_completed, def_vis_xid_data); + + /* + * Check if we can increase upper bound. As a previous + * GlobalVisUpdate() might have computed more aggressive values, don't + * overwrite them if so. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(def_vis_fxid_data, + GlobalVisDataRels.definitely_needed); + /* See temp_oldest_nonremovable computation in ComputeXidHorizons() */ + if (TransactionIdIsNormal(myxid)) + GlobalVisTempRels.definitely_needed = + FullXidRelativeTo(latest_completed, myxid); + else + { + GlobalVisTempRels.definitely_needed = latest_completed; + FullTransactionIdAdvance(&GlobalVisTempRels.definitely_needed); + } + + /* + * Check if we know that we can initialize or increase the lower + * bound. Currently the only cheap way to do so is to use + * ShmemVariableCache->oldestXid as input. + * + * We should definitely be able to do better. We could e.g. put a + * global lower bound value into ShmemVariableCache. + */ + GlobalVisSharedRels.maybe_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + oldestfxid); + GlobalVisCatalogRels.maybe_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + oldestfxid); + GlobalVisDataRels.maybe_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + oldestfxid); + /* accurate value known */ + GlobalVisTempRels.maybe_needed = GlobalVisTempRels.definitely_needed; + } + + RecentXmin = xmin; + Assert(TransactionIdPrecedesOrEquals(TransactionXmin, RecentXmin)); + + snapshot->xmin = xmin; + snapshot->xmax = xmax; + snapshot->xcnt = count; + snapshot->subxcnt = subcount; + snapshot->suboverflowed = suboverflowed; + snapshot->snapXactCompletionCount = curXactCompletionCount; + + snapshot->curcid = GetCurrentCommandId(false); + + /* + * This is a new snapshot, so set both refcounts are zero, and mark it as + * not copied in persistent memory. + */ + snapshot->active_count = 0; + snapshot->regd_count = 0; + snapshot->copied = false; + + GetSnapshotDataInitOldSnapshot(snapshot); + + return snapshot; +} + +/* + * ProcArrayInstallImportedXmin -- install imported xmin into MyProc->xmin + * + * This is called when installing a snapshot imported from another + * transaction. To ensure that OldestXmin doesn't go backwards, we must + * check that the source transaction is still running, and we'd better do + * that atomically with installing the new xmin. + * + * Returns true if successful, false if source xact is no longer running. 
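+ *
+ * A minimal sketch of the expected call pattern when importing a
+ * snapshot (the real checks live in snapmgr.c's import code):
+ *
+ *     if (!ProcArrayInstallImportedXmin(snapshot->xmin, &sourcevxid))
+ *         ereport(ERROR,
+ *                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+ *                  errmsg("could not import the requested snapshot"),
+ *                  errdetail("The source process ... is not running anymore.")));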
+ */ +bool +ProcArrayInstallImportedXmin(TransactionId xmin, + VirtualTransactionId *sourcevxid) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(TransactionIdIsNormal(xmin)); + if (!sourcevxid) + return false; + + /* Get lock so source xact can't end while we're doing this */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int statusFlags = ProcGlobal->statusFlags[index]; + TransactionId xid; + + /* Ignore procs running LAZY VACUUM */ + if (statusFlags & PROC_IN_VACUUM) + continue; + + /* We are only interested in the specific virtual transaction. */ + if (proc->backendId != sourcevxid->backendId) + continue; + if (proc->lxid != sourcevxid->localTransactionId) + continue; + + /* + * We check the transaction's database ID for paranoia's sake: if it's + * in another DB then its xmin does not cover us. Caller should have + * detected this already, so we just treat any funny cases as + * "transaction not found". + */ + if (proc->databaseId != MyDatabaseId) + continue; + + /* + * Likewise, let's just make real sure its xmin does cover us. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (!TransactionIdIsNormal(xid) || + !TransactionIdPrecedesOrEquals(xid, xmin)) + continue; + + /* + * We're good. Install the new xmin. As in GetSnapshotData, set + * TransactionXmin too. (Note that because snapmgr.c called + * GetSnapshotData first, we'll be overwriting a valid xmin here, so + * we don't check that.) + */ + MyProc->xmin = TransactionXmin = xmin; + + result = true; + break; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * ProcArrayInstallRestoredXmin -- install restored xmin into MyProc->xmin + * + * This is like ProcArrayInstallImportedXmin, but we have a pointer to the + * PGPROC of the transaction from which we imported the snapshot, rather than + * an XID. + * + * Note that this function also copies statusFlags from the source `proc` in + * order to avoid the case where MyProc's xmin needs to be skipped for + * computing xid horizon. + * + * Returns true if successful, false if source xact is no longer running. + */ +bool +ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) +{ + bool result = false; + TransactionId xid; + + Assert(TransactionIdIsNormal(xmin)); + Assert(proc != NULL); + + /* + * Get an exclusive lock so that we can copy statusFlags from source proc. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Be certain that the referenced PGPROC has an advertised xmin which is + * no later than the one we're installing, so that the system-wide xmin + * can't go backwards. Also, make sure it's running in the same database, + * so that the per-database xmin cannot go backwards. + */ + xid = UINT32_ACCESS_ONCE(proc->xmin); + if (proc->databaseId == MyDatabaseId && + TransactionIdIsNormal(xid) && + TransactionIdPrecedesOrEquals(xid, xmin)) + { + /* + * Install xmin and propagate the statusFlags that affect how the + * value is interpreted by vacuum. + */ + MyProc->xmin = TransactionXmin = xmin; + MyProc->statusFlags = (MyProc->statusFlags & ~PROC_XMIN_FLAGS) | + (proc->statusFlags & PROC_XMIN_FLAGS); + ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags; + + result = true; + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * GetRunningTransactionData -- returns information about running transactions. 
+ * + * Similar to GetSnapshotData but returns more information. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes and + * prepared transactions. + * + * We acquire XidGenLock and ProcArrayLock, but the caller is responsible for + * releasing them. Acquiring XidGenLock ensures that no new XIDs enter the proc + * array until the caller has WAL-logged this snapshot, and releases the + * lock. Acquiring ProcArrayLock ensures that no transactions commit until the + * lock is released. + * + * The returned data structure is statically allocated; caller should not + * modify it, and must not assume it is valid past the next call. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * Dummy PGPROCs from prepared transaction are included, meaning that this + * may return entries with duplicated TransactionId values coming from + * transaction finishing to prepare. Nothing is done about duplicated + * entries here to not hold on ProcArrayLock more than necessary. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. + * + * Note that if any transaction has overflowed its cached subtransactions + * then there is no real need include any subtransactions. + */ +RunningTransactions +GetRunningTransactionData(void) +{ + /* result workspace */ + static RunningTransactionsData CurrentRunningXactsData; + + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + RunningTransactions CurrentRunningXacts = &CurrentRunningXactsData; + TransactionId latestCompletedXid; + TransactionId oldestRunningXid; + TransactionId *xids; + int index; + int count; + int subcount; + bool suboverflowed; + + Assert(!RecoveryInProgress()); + + /* + * Allocating space for maxProcs xids is usually overkill; numProcs would + * be sufficient. But it seems better to do the malloc while not holding + * the lock, so we can't look at numProcs. Likewise, we allocate much + * more subxip storage than is probably needed. + * + * Should only be allocated in bgwriter, since only ever executed during + * checkpoints. + */ + if (CurrentRunningXacts->xids == NULL) + { + /* + * First call + */ + CurrentRunningXacts->xids = (TransactionId *) + malloc(TOTAL_MAX_CACHED_SUBXIDS * sizeof(TransactionId)); + if (CurrentRunningXacts->xids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + xids = CurrentRunningXacts->xids; + + count = subcount = 0; + suboverflowed = false; + + /* + * Ensure that no xids enter or leave the procarray while we obtain + * snapshot. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + LWLockAcquire(XidGenLock, LW_SHARED); + + latestCompletedXid = + XidFromFullTransactionId(ShmemVariableCache->latestCompletedXid); + oldestRunningXid = + XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * Spin over procArray collecting all xids + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + /* + * We don't need to store transactions that don't have a TransactionId + * yet because they will not show as running on a standby server. 
+ */ + if (!TransactionIdIsValid(xid)) + continue; + + /* + * Be careful not to exclude any xids before calculating the values of + * oldestRunningXid and suboverflowed, since these are used to clean + * up transaction information held on standbys. + */ + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + if (ProcGlobal->subxidStates[index].overflowed) + suboverflowed = true; + + /* + * If we wished to exclude xids this would be the right place for it. + * Procs with the PROC_IN_VACUUM flag set don't usually assign xids, + * but they do during truncation at the end when they get the lock and + * truncate, so it is not much of a problem to include them if they + * are seen and it is cleaner to include them. + */ + + xids[count++] = xid; + } + + /* + * Spin over procArray collecting all subxids, but only if there hasn't + * been a suboverflow. + */ + if (!suboverflowed) + { + XidCacheStatus *other_subxidstates = ProcGlobal->subxidStates; + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + int nsubxids; + + /* + * Save subtransaction XIDs. Other backends can't add or remove + * entries while we're holding XidGenLock. + */ + nsubxids = other_subxidstates[index].count; + if (nsubxids > 0) + { + /* barrier not really required, as XidGenLock is held, but ... */ + pg_read_barrier(); /* pairs with GetNewTransactionId */ + + memcpy(&xids[count], (void *) proc->subxids.xids, + nsubxids * sizeof(TransactionId)); + count += nsubxids; + subcount += nsubxids; + + /* + * Top-level XID of a transaction is always less than any of + * its subxids, so we don't need to check if any of the + * subxids are smaller than oldestRunningXid + */ + } + } + } + + /* + * It's important *not* to include the limits set by slots here because + * snapbuild.c uses oldestRunningXid to manage its xmin horizon. If those + * were to be included here the initial value could never increase because + * of a circular dependency where slots only increase their limits when + * running xacts increases oldestRunningXid and running xacts only + * increases if slots do. + */ + + CurrentRunningXacts->xcnt = count - subcount; + CurrentRunningXacts->subxcnt = subcount; + CurrentRunningXacts->subxid_overflow = suboverflowed; + CurrentRunningXacts->nextXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + CurrentRunningXacts->oldestRunningXid = oldestRunningXid; + CurrentRunningXacts->latestCompletedXid = latestCompletedXid; + + Assert(TransactionIdIsValid(CurrentRunningXacts->nextXid)); + Assert(TransactionIdIsValid(CurrentRunningXacts->oldestRunningXid)); + Assert(TransactionIdIsNormal(CurrentRunningXacts->latestCompletedXid)); + + /* We don't release the locks here, the caller is responsible for that */ + + return CurrentRunningXacts; +} + +/* + * GetOldestActiveTransactionId() + * + * Similar to GetSnapshotData but returns just oldestActiveXid. We include + * all PGPROCs with an assigned TransactionId, even VACUUM processes. + * We look at all databases, though there is no need to include WALSender + * since this has no effect on hot standby conflicts. + * + * This is never executed during recovery so there is no need to look at + * KnownAssignedXids. + * + * We don't worry about updating other counters, we want to keep this as + * simple as possible and leave GetSnapshotData() as the primary code for + * that bookkeeping. 
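+ *
+ * Sketch of the typical caller, assuming the usual checkpoint code in
+ * xlog.c: the value is recorded in the checkpoint record so that a
+ * standby can initialize its transaction tracking, roughly
+ *
+ *     checkPoint.oldestActiveXid = GetOldestActiveTransactionId();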
+ */ +TransactionId +GetOldestActiveTransactionId(void) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + TransactionId oldestRunningXid; + int index; + + Assert(!RecoveryInProgress()); + + /* + * Read nextXid, as the upper bound of what's still active. + * + * Reading a TransactionId is atomic, but we must grab the lock to make + * sure that all XIDs < nextXid are already present in the proc array (or + * have already completed), when we spin over it. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestRunningXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + LWLockRelease(XidGenLock); + + /* + * Spin over procArray collecting all xids and subxids. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestRunningXid)) + oldestRunningXid = xid; + + /* + * Top-level XID of a transaction is always less than any of its + * subxids, so we don't need to check if any of the subxids are + * smaller than oldestRunningXid + */ + } + LWLockRelease(ProcArrayLock); + + return oldestRunningXid; +} + +/* + * GetOldestSafeDecodingTransactionId -- lowest xid not affected by vacuum + * + * Returns the oldest xid that we can guarantee not to have been affected by + * vacuum, i.e. no rows >= that xid have been vacuumed away unless the + * transaction aborted. Note that the value can (and most of the time will) be + * much more conservative than what really has been affected by vacuum, but we + * currently don't have better data available. + * + * This is useful to initialize the cutoff xid after which a new changeset + * extraction replication slot can start decoding changes. + * + * Must be called with ProcArrayLock held either shared or exclusively, + * although most callers will want to use exclusive mode since it is expected + * that the caller will immediately use the xid to peg the xmin horizon. + */ +TransactionId +GetOldestSafeDecodingTransactionId(bool catalogOnly) +{ + ProcArrayStruct *arrayP = procArray; + TransactionId oldestSafeXid; + int index; + bool recovery_in_progress = RecoveryInProgress(); + + Assert(LWLockHeldByMe(ProcArrayLock)); + + /* + * Acquire XidGenLock, so no transactions can acquire an xid while we're + * running. If no transaction with xid were running concurrently a new xid + * could influence the RecentXmin et al. + * + * We initialize the computation to nextXid since that's guaranteed to be + * a safe, albeit pessimal, value. + */ + LWLockAcquire(XidGenLock, LW_SHARED); + oldestSafeXid = XidFromFullTransactionId(ShmemVariableCache->nextXid); + + /* + * If there's already a slot pegging the xmin horizon, we can start with + * that value, it's guaranteed to be safe since it's computed by this + * routine initially and has been enforced since. We can always use the + * slot's general xmin horizon, but the catalog horizon is only usable + * when only catalog data is going to be looked at. 
+ */ + if (TransactionIdIsValid(procArray->replication_slot_xmin) && + TransactionIdPrecedes(procArray->replication_slot_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_xmin; + + if (catalogOnly && + TransactionIdIsValid(procArray->replication_slot_catalog_xmin) && + TransactionIdPrecedes(procArray->replication_slot_catalog_xmin, + oldestSafeXid)) + oldestSafeXid = procArray->replication_slot_catalog_xmin; + + /* + * If we're not in recovery, we walk over the procarray and collect the + * lowest xid. Since we're called with ProcArrayLock held and have + * acquired XidGenLock, no entries can vanish concurrently, since + * ProcGlobal->xids[i] is only set with XidGenLock held and only cleared + * with ProcArrayLock held. + * + * In recovery we can't lower the safe value besides what we've computed + * above, so we'll have to wait a bit longer there. We unfortunately can + * *not* use KnownAssignedXidsGetOldestXmin() since the KnownAssignedXids + * machinery can miss values and return an older value than is safe. + */ + if (!recovery_in_progress) + { + TransactionId *other_xids = ProcGlobal->xids; + + /* + * Spin over procArray collecting min(ProcGlobal->xids[i]) + */ + for (index = 0; index < arrayP->numProcs; index++) + { + TransactionId xid; + + /* Fetch xid just once - see GetNewTransactionId */ + xid = UINT32_ACCESS_ONCE(other_xids[index]); + + if (!TransactionIdIsNormal(xid)) + continue; + + if (TransactionIdPrecedes(xid, oldestSafeXid)) + oldestSafeXid = xid; + } + } + + LWLockRelease(XidGenLock); + + return oldestSafeXid; +} + +/* + * GetVirtualXIDsDelayingChkptGuts -- Get the VXIDs of transactions that are + * delaying the start or end of a checkpoint because they have critical + * actions in progress. + * + * Constructs an array of VXIDs of transactions that are currently in commit + * critical sections, as shown by having delayChkpt or delayChkptEnd set in + * their PGPROC. + * + * Returns a palloc'd array that should be freed by the caller. + * *nvxids is the number of valid entries. + * + * Note that because backends set or clear delayChkpt and delayChkptEnd + * without holding any lock, the result is somewhat indeterminate, but we + * don't really care. Even in a multiprocessor with delayed writes to + * shared memory, it should be certain that setting of delayChkpt will + * propagate to shared memory when the backend takes a lock, so we cannot + * fail to see a virtual xact as delayChkpt if it's already inserted its + * commit record. Whether it takes a little while for clearing of + * delayChkpt to propagate is unimportant for correctness. 
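+ *
+ * A rough sketch of how the checkpointer is expected to use the
+ * wrappers below (see the checkpoint code in xlog.c for the real
+ * thing):
+ *
+ *     vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
+ *     if (nvxids > 0)
+ *     {
+ *         do
+ *         {
+ *             pg_usleep(10000L);
+ *         } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
+ *     }
+ *     pfree(vxids);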
+ */ +static VirtualTransactionId * +GetVirtualXIDsDelayingChkptGuts(int *nvxids, int type) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + Assert(type != 0); + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (((type & DELAY_CHKPT_START) && proc->delayChkpt) || + ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * GetVirtualXIDsDelayingChkpt - Get the VXIDs of transactions that are + * delaying the start of a checkpoint. + */ +VirtualTransactionId * +GetVirtualXIDsDelayingChkpt(int *nvxids) +{ + return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_START); +} + +/* + * GetVirtualXIDsDelayingChkptEnd - Get the VXIDs of transactions that are + * delaying the end of a checkpoint. + */ +VirtualTransactionId * +GetVirtualXIDsDelayingChkptEnd(int *nvxids) +{ + return GetVirtualXIDsDelayingChkptGuts(nvxids, DELAY_CHKPT_COMPLETE); +} + +/* + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying? + * + * This is used with the results of GetVirtualXIDsDelayingChkpt to see if any + * of the specified VXIDs are still in critical sections of code. + * + * Note: this is O(N^2) in the number of vxacts that are/were delaying, but + * those numbers should be small enough for it not to be a problem. + */ +static bool +HaveVirtualXIDsDelayingChkptGuts(VirtualTransactionId *vxids, int nvxids, + int type) +{ + bool result = false; + ProcArrayStruct *arrayP = procArray; + int index; + + Assert(type != 0); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + + if ((((type & DELAY_CHKPT_START) && proc->delayChkpt) || + ((type & DELAY_CHKPT_COMPLETE) && proc->delayChkptEnd)) && + VirtualTransactionIdIsValid(vxid)) + { + int i; + + for (i = 0; i < nvxids; i++) + { + if (VirtualTransactionIdEquals(vxid, vxids[i])) + { + result = true; + break; + } + } + if (result) + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * HaveVirtualXIDsDelayingChkpt -- Are any of the specified VXIDs delaying + * the start of a checkpoint? + */ +bool +HaveVirtualXIDsDelayingChkpt(VirtualTransactionId *vxids, int nvxids) +{ + return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids, + DELAY_CHKPT_START); +} + +/* + * HaveVirtualXIDsDelayingChkptEnd -- Are any of the specified VXIDs delaying + * the end of a checkpoint? + */ +bool +HaveVirtualXIDsDelayingChkptEnd(VirtualTransactionId *vxids, int nvxids) +{ + return HaveVirtualXIDsDelayingChkptGuts(vxids, nvxids, + DELAY_CHKPT_COMPLETE); +} + +/* + * BackendPidGetProc -- get a backend's PGPROC given its PID + * + * Returns NULL if not found. Note that it is up to the caller to be + * sure that the question remains meaningful for long enough for the + * answer to be used ... 
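+ *
+ * For example (a sketch of the usual signal-sending caller, see
+ * signalfuncs.c), a pid gets validated before being signaled:
+ *
+ *     PGPROC *proc = BackendPidGetProc(pid);
+ *
+ *     if (proc == NULL)
+ *         ereport(WARNING,
+ *                 (errmsg("PID %d is not a PostgreSQL server process", pid)));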
+ */ +PGPROC * +BackendPidGetProc(int pid) +{ + PGPROC *result; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + result = BackendPidGetProcWithLock(pid); + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * BackendPidGetProcWithLock -- get a backend's PGPROC given its PID + * + * Same as above, except caller must be holding ProcArrayLock. The found + * entry, if any, can be assumed to be valid as long as the lock remains held. + */ +PGPROC * +BackendPidGetProcWithLock(int pid) +{ + PGPROC *result = NULL; + ProcArrayStruct *arrayP = procArray; + int index; + + if (pid == 0) /* never match dummy PGPROCs */ + return NULL; + + for (index = 0; index < arrayP->numProcs; index++) + { + PGPROC *proc = &allProcs[arrayP->pgprocnos[index]]; + + if (proc->pid == pid) + { + result = proc; + break; + } + } + + return result; +} + +/* + * BackendXidGetPid -- get a backend's pid given its XID + * + * Returns 0 if not found or it's a prepared transaction. Note that + * it is up to the caller to be sure that the question remains + * meaningful for long enough for the answer to be used ... + * + * Only main transaction Ids are considered. This function is mainly + * useful for determining what backend owns a lock. + * + * Beware that not every xact has an XID assigned. However, as long as you + * only call this using an XID found on disk, you're safe. + */ +int +BackendXidGetPid(TransactionId xid) +{ + int result = 0; + ProcArrayStruct *arrayP = procArray; + TransactionId *other_xids = ProcGlobal->xids; + int index; + + if (xid == InvalidTransactionId) /* never match invalid xid */ + return 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (other_xids[index] == xid) + { + result = proc->pid; + break; + } + } + + LWLockRelease(ProcArrayLock); + + return result; +} + +/* + * IsBackendPid -- is a given pid a running backend + * + * This is not called by the backend, but is called by external modules. + */ +bool +IsBackendPid(int pid) +{ + return (BackendPidGetProc(pid) != NULL); +} + + +/* + * GetCurrentVirtualXIDs -- returns an array of currently active VXIDs. + * + * The array is palloc'd. The number of valid entries is returned into *nvxids. + * + * The arguments allow filtering the set of VXIDs returned. Our own process + * is always skipped. In addition: + * If limitXmin is not InvalidTransactionId, skip processes with + * xmin > limitXmin. + * If excludeXmin0 is true, skip processes with xmin = 0. + * If allDbs is false, skip processes attached to other databases. + * If excludeVacuum isn't zero, skip processes for which + * (statusFlags & excludeVacuum) is not zero. + * + * Note: the purpose of the limitXmin and excludeXmin0 parameters is to + * allow skipping backends whose oldest live snapshot is no older than + * some snapshot we have. Since we examine the procarray with only shared + * lock, there are race conditions: a backend could set its xmin just after + * we look. Indeed, on multiprocessors with weak memory ordering, the + * other backend could have set its xmin *before* we look. We know however + * that such a backend must have held shared ProcArrayLock overlapping our + * own hold of ProcArrayLock, else we would see its xmin update. 
Therefore, + * any snapshot the other backend is taking concurrently with our scan cannot + * consider any transactions as still running that we think are committed + * (since backends must hold ProcArrayLock exclusive to commit). + */ +VirtualTransactionId * +GetCurrentVirtualXIDs(TransactionId limitXmin, bool excludeXmin0, + bool allDbs, int excludeVacuum, + int *nvxids) +{ + VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* allocate what's certainly enough result space */ + vxids = (VirtualTransactionId *) + palloc(sizeof(VirtualTransactionId) * arrayP->maxProcs); + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc == MyProc) + continue; + + if (excludeVacuum & statusFlags) + continue; + + if (allDbs || proc->databaseId == MyDatabaseId) + { + /* Fetch xmin just once - might change on us */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + if (excludeXmin0 && !TransactionIdIsValid(pxmin)) + continue; + + /* + * InvalidTransactionId precedes all other XIDs, so a proc that + * hasn't set xmin yet will not be rejected by this test. + */ + if (!TransactionIdIsValid(limitXmin) || + TransactionIdPrecedesOrEquals(pxmin, limitXmin)) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + *nvxids = count; + return vxids; +} + +/* + * GetConflictingVirtualXIDs -- returns an array of currently active VXIDs. + * + * Usage is limited to conflict resolution during recovery on standby servers. + * limitXmin is supplied as either latestRemovedXid, or InvalidTransactionId + * in cases where we cannot accurately determine a value for latestRemovedXid. + * + * If limitXmin is InvalidTransactionId then we want to kill everybody, + * so we're not worried if they have a snapshot or not, nor does it really + * matter what type of lock we hold. + * + * All callers that are checking xmins always now supply a valid and useful + * value for limitXmin. The limitXmin is always lower than the lowest + * numbered KnownAssignedXid that is not already a FATAL error. This is + * because we only care about cleanup records that are cleaning up tuple + * versions from committed transactions. In that case they will only occur + * at the point where the record is less than the lowest running xid. That + * allows us to say that if any backend takes a snapshot concurrently with + * us then the conflict assessment made here would never include the snapshot + * that is being derived. So we take LW_SHARED on the ProcArray and allow + * concurrent snapshots when limitXmin is valid. We might think about adding + * Assert(limitXmin < lowest(KnownAssignedXids)) + * but that would not be true in the case of FATAL errors lagging in array, + * but we already know those are bogus anyway, so we skip that test. + * + * If dbOid is valid we skip backends attached to other databases. + * + * Be careful to *not* pfree the result from this function. We reuse + * this array sufficiently often that we use malloc for the result. 
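+ *
+ * A minimal sketch of the recovery-conflict caller (the real code in
+ * standby.c then waits for, or cancels, each conflicting backend):
+ *
+ *     backends = GetConflictingVirtualXIDs(latestRemovedXid, node.dbNode);
+ *     ResolveRecoveryConflictWithVirtualXIDs(backends,
+ *                                            PROCSIG_RECOVERY_CONFLICT_SNAPSHOT,
+ *                                            ...);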
+ */ +VirtualTransactionId * +GetConflictingVirtualXIDs(TransactionId limitXmin, Oid dbOid) +{ + static VirtualTransactionId *vxids; + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* + * If first time through, get workspace to remember main XIDs in. We + * malloc it permanently to avoid repeated palloc/pfree overhead. Allow + * result space, remembering room for a terminator. + */ + if (vxids == NULL) + { + vxids = (VirtualTransactionId *) + malloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1)); + if (vxids == NULL) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* Exclude prepared transactions */ + if (proc->pid == 0) + continue; + + if (!OidIsValid(dbOid) || + proc->databaseId == dbOid) + { + /* Fetch xmin just once - can't change on us, but good coding */ + TransactionId pxmin = UINT32_ACCESS_ONCE(proc->xmin); + + /* + * We ignore an invalid pxmin because this means that backend has + * no snapshot currently. We hold a Share lock to avoid contention + * with users taking snapshots. That is not a problem because the + * current xmin is always at least one higher than the latest + * removed xid, so any new snapshot would never conflict with the + * test here. + */ + if (!TransactionIdIsValid(limitXmin) || + (TransactionIdIsValid(pxmin) && !TransactionIdFollows(pxmin, limitXmin))) + { + VirtualTransactionId vxid; + + GET_VXID_FROM_PGPROC(vxid, *proc); + if (VirtualTransactionIdIsValid(vxid)) + vxids[count++] = vxid; + } + } + } + + LWLockRelease(ProcArrayLock); + + /* add the terminator */ + vxids[count].backendId = InvalidBackendId; + vxids[count].localTransactionId = InvalidLocalTransactionId; + + return vxids; +} + +/* + * CancelVirtualTransaction - used in recovery conflict processing + * + * Returns pid of the process signaled, or 0 if not found. + */ +pid_t +CancelVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode) +{ + return SignalVirtualTransaction(vxid, sigmode, true); +} + +pid_t +SignalVirtualTransaction(VirtualTransactionId vxid, ProcSignalReason sigmode, + bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + pid_t pid = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + VirtualTransactionId procvxid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + if (procvxid.backendId == vxid.backendId && + procvxid.localTransactionId == vxid.localTransactionId) + { + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, vxid.backendId); + } + break; + } + } + + LWLockRelease(ProcArrayLock); + + return pid; +} + +/* + * MinimumActiveBackends --- count backends (other than myself) that are + * in active transactions. Return true if the count exceeds the + * minimum threshold passed. This is used as a heuristic to decide if + * a pre-XLOG-flush delay is worthwhile during commit. + * + * Do not count backends that are blocked waiting for locks, since they are + * not going to get to run until someone else commits. 
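+ *
+ * For context, a hedged sketch of the sort of call site this heuristic
+ * serves: the WAL-flush path delays briefly before flushing when group
+ * commit looks worthwhile. Treat the condition below as an approximation
+ * of the real check rather than a quotation of it; CommitDelay and
+ * CommitSiblings correspond to the commit_delay and commit_siblings GUCs.
+ *
+ *     if (CommitDelay > 0 && enableFsync &&
+ *         MinimumActiveBackends(CommitSiblings))
+ *         pg_usleep(CommitDelay);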
+ */ +bool +MinimumActiveBackends(int min) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + /* Quick short-circuit if no minimum is specified */ + if (min == 0) + return true; + + /* + * Note: for speed, we don't acquire ProcArrayLock. This is a little bit + * bogus, but since we are only testing fields for zero or nonzero, it + * should be OK. The result is only used for heuristic purposes anyway... + */ + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + /* + * Since we're not holding a lock, need to be prepared to deal with + * garbage, as someone could have incremented numProcs but not yet + * filled the structure. + * + * If someone just decremented numProcs, 'proc' could also point to a + * PGPROC entry that's no longer in the array. It still points to a + * PGPROC struct, though, because freed PGPROC entries just go to the + * free list and are recycled. Its contents are nonsense in that case, + * but that's acceptable for this function. + */ + if (pgprocno == -1) + continue; /* do not count deleted entries */ + if (proc == MyProc) + continue; /* do not count myself */ + if (proc->xid == InvalidTransactionId) + continue; /* do not count if no XID assigned */ + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->waitLock != NULL) + continue; /* do not count if blocked on a lock */ + count++; + if (count >= min) + break; + } + + return count >= min; +} + +/* + * CountDBBackends --- count backends that are using specified database + */ +int +CountDBBackends(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountDBConnections --- counts database backends ignoring any background + * worker processes + */ +int +CountDBConnections(Oid databaseid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (!OidIsValid(databaseid) || + proc->databaseId == databaseid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CancelDBBackends --- cancel backends that are using specified database + */ +void +CancelDBBackends(Oid databaseid, ProcSignalReason sigmode, bool conflictPending) +{ + ProcArrayStruct *arrayP = procArray; + int index; + + /* tell all backends to die */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (databaseid == InvalidOid || proc->databaseId == databaseid) + { + VirtualTransactionId procvxid; + pid_t pid; + + GET_VXID_FROM_PGPROC(procvxid, *proc); + + proc->recoveryConflictPending = conflictPending; + pid = proc->pid; + if (pid != 0) + { + /* + * Kill the pid if it's still here. 
If not, that's what we + * wanted so ignore any errors. + */ + (void) SendProcSignal(pid, sigmode, procvxid.backendId); + } + } + } + + LWLockRelease(ProcArrayLock); +} + +/* + * CountUserBackends --- count backends that are used by specified user + */ +int +CountUserBackends(Oid roleid) +{ + ProcArrayStruct *arrayP = procArray; + int count = 0; + int index; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->pid == 0) + continue; /* do not count prepared xacts */ + if (proc->isBackgroundWorker) + continue; /* do not count background workers */ + if (proc->roleId == roleid) + count++; + } + + LWLockRelease(ProcArrayLock); + + return count; +} + +/* + * CountOtherDBBackends -- check for other backends running in the given DB + * + * If there are other backends in the DB, we will wait a maximum of 5 seconds + * for them to exit. Autovacuum backends are encouraged to exit early by + * sending them SIGTERM, but normal user backends are just waited for. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * Returns true if there are (still) other backends in the DB, false if not. + * Also, *nbackends and *nprepared are set to the number of other backends + * and prepared transactions in the DB, respectively. + * + * This function is used to interlock DROP DATABASE and related commands + * against there being any active backends in the target DB --- dropping the + * DB while active backends remain would be a Bad Thing. Note that we cannot + * detect here the possibility of a newly-started backend that is trying to + * connect to the doomed database, so additional interlocking is needed during + * backend startup. The caller should normally hold an exclusive lock on the + * target DB before calling this, which is one reason we mustn't wait + * indefinitely. + */ +bool +CountOtherDBBackends(Oid databaseId, int *nbackends, int *nprepared) +{ + ProcArrayStruct *arrayP = procArray; + +#define MAXAUTOVACPIDS 10 /* max autovacs to SIGTERM per iteration */ + int autovac_pids[MAXAUTOVACPIDS]; + int tries; + + /* 50 tries with 100ms sleep between tries makes 5 sec total wait */ + for (tries = 0; tries < 50; tries++) + { + int nautovacs = 0; + bool found = false; + int index; + + CHECK_FOR_INTERRUPTS(); + + *nbackends = *nprepared = 0; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (index = 0; index < arrayP->numProcs; index++) + { + int pgprocno = arrayP->pgprocnos[index]; + PGPROC *proc = &allProcs[pgprocno]; + uint8 statusFlags = ProcGlobal->statusFlags[index]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + found = true; + + if (proc->pid == 0) + (*nprepared)++; + else + { + (*nbackends)++; + if ((statusFlags & PROC_IS_AUTOVACUUM) && + nautovacs < MAXAUTOVACPIDS) + autovac_pids[nautovacs++] = proc->pid; + } + } + + LWLockRelease(ProcArrayLock); + + if (!found) + return false; /* no conflicting backends, so done */ + + /* + * Send SIGTERM to any conflicting autovacuums before sleeping. We + * postpone this step until after the loop because we don't want to + * hold ProcArrayLock while issuing kill(). We have no idea what might + * block kill() inside the kernel... 
+ */ + for (index = 0; index < nautovacs; index++) + (void) kill(autovac_pids[index], SIGTERM); /* ignore any error */ + + /* sleep, then try again */ + pg_usleep(100 * 1000L); /* 100ms */ + } + + return true; /* timed out, still conflicts */ +} + +/* + * Terminate existing connections to the specified database. This routine + * is used by the DROP DATABASE command when user has asked to forcefully + * drop the database. + * + * The current backend is always ignored; it is caller's responsibility to + * check whether the current backend uses the given DB, if it's important. + * + * It doesn't allow to terminate the connections even if there is a one + * backend with the prepared transaction in the target database. + */ +void +TerminateOtherDBBackends(Oid databaseId) +{ + ProcArrayStruct *arrayP = procArray; + List *pids = NIL; + int nprepared = 0; + int i; + + LWLockAcquire(ProcArrayLock, LW_SHARED); + + for (i = 0; i < procArray->numProcs; i++) + { + int pgprocno = arrayP->pgprocnos[i]; + PGPROC *proc = &allProcs[pgprocno]; + + if (proc->databaseId != databaseId) + continue; + if (proc == MyProc) + continue; + + if (proc->pid != 0) + pids = lappend_int(pids, proc->pid); + else + nprepared++; + } + + LWLockRelease(ProcArrayLock); + + if (nprepared > 0) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_IN_USE), + errmsg("database \"%s\" is being used by prepared transactions", + get_database_name(databaseId)), + errdetail_plural("There is %d prepared transaction using the database.", + "There are %d prepared transactions using the database.", + nprepared, + nprepared))); + + if (pids) + { + ListCell *lc; + + /* + * Check whether we have the necessary rights to terminate other + * sessions. We don't terminate any session until we ensure that we + * have rights on all the sessions to be terminated. These checks are + * the same as we do in pg_terminate_backend. + * + * In this case we don't raise some warnings - like "PID %d is not a + * PostgreSQL server process", because for us already finished session + * is not a problem. + */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* Only allow superusers to signal superuser-owned backends. */ + if (superuser_arg(proc->roleId) && !superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to terminate superuser process"))); + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend"))); + } + } + + /* + * There's a race condition here: once we release the ProcArrayLock, + * it's possible for the session to exit before we issue kill. That + * race condition possibility seems too unlikely to worry about. See + * pg_signal_backend. 
+ */ + foreach(lc, pids) + { + int pid = lfirst_int(lc); + PGPROC *proc = BackendPidGetProc(pid); + + if (proc != NULL) + { + /* + * If we have setsid(), signal the backend's whole process + * group + */ +#ifdef HAVE_SETSID + (void) kill(-pid, SIGTERM); +#else + (void) kill(pid, SIGTERM); +#endif + } + } + } +} + +/* + * ProcArraySetReplicationSlotXmin + * + * Install limits to future computations of the xmin horizon to prevent vacuum + * and HOT pruning from removing affected rows still needed by clients with + * replication slots. + */ +void +ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, + bool already_locked) +{ + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); + + if (!already_locked) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + procArray->replication_slot_xmin = xmin; + procArray->replication_slot_catalog_xmin = catalog_xmin; + + if (!already_locked) + LWLockRelease(ProcArrayLock); +} + +/* + * ProcArrayGetReplicationSlotXmin + * + * Return the current slot xmin limits. That's useful to be able to remove + * data that's older than those limits. + */ +void +ProcArrayGetReplicationSlotXmin(TransactionId *xmin, + TransactionId *catalog_xmin) +{ + LWLockAcquire(ProcArrayLock, LW_SHARED); + + if (xmin != NULL) + *xmin = procArray->replication_slot_xmin; + + if (catalog_xmin != NULL) + *catalog_xmin = procArray->replication_slot_catalog_xmin; + + LWLockRelease(ProcArrayLock); +} + +/* + * XidCacheRemoveRunningXids + * + * Remove a bunch of TransactionIds from the list of known-running + * subtransactions for my backend. Both the specified xid and those in + * the xids[] array (of length nxids) are removed from the subxids cache. + * latestXid must be the latest XID among the group. + */ +void +XidCacheRemoveRunningXids(TransactionId xid, + int nxids, const TransactionId *xids, + TransactionId latestXid) +{ + int i, + j; + XidCacheStatus *mysubxidstat; + + Assert(TransactionIdIsValid(xid)); + + /* + * We must hold ProcArrayLock exclusively in order to remove transactions + * from the PGPROC array. (See src/backend/access/transam/README.) It's + * possible this could be relaxed since we know this routine is only used + * to abort subtransactions, but pending closer analysis we'd best be + * conservative. + * + * Note that we do not have to be careful about memory ordering of our own + * reads wrt. GetNewTransactionId() here - only this process can modify + * relevant fields of MyProc/ProcGlobal->xids[]. But we do have to be + * careful about our own writes being well ordered. + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + mysubxidstat = &ProcGlobal->subxidStates[MyProc->pgxactoff]; + + /* + * Under normal circumstances xid and xids[] will be in increasing order, + * as will be the entries in subxids. Scan backwards to avoid O(N^2) + * behavior when removing a lot of xids. + */ + for (i = nxids - 1; i >= 0; i--) + { + TransactionId anxid = xids[i]; + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], anxid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + + /* + * Ordinarily we should have found it, unless the cache has + * overflowed. However it's also possible for this routine to be + * invoked multiple times for the same subtransaction, in case of an + * error during AbortSubTransaction. So instead of Assert, emit a + * debug warning. 
+ */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", anxid); + } + + for (j = MyProc->subxidStatus.count - 1; j >= 0; j--) + { + if (TransactionIdEquals(MyProc->subxids.xids[j], xid)) + { + MyProc->subxids.xids[j] = MyProc->subxids.xids[MyProc->subxidStatus.count - 1]; + pg_write_barrier(); + mysubxidstat->count--; + MyProc->subxidStatus.count--; + break; + } + } + /* Ordinarily we should have found it, unless the cache has overflowed */ + if (j < 0 && !MyProc->subxidStatus.overflowed) + elog(WARNING, "did not find subXID %u in MyProc", xid); + + /* Also advance global latestCompletedXid while holding the lock */ + MaintainLatestCompletedXid(latestXid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +#ifdef XIDCACHE_DEBUG + +/* + * Print stats about effectiveness of XID cache + */ +static void +DisplayXidCache(void) +{ + fprintf(stderr, + "XidCache: xmin: %ld, known: %ld, myxact: %ld, latest: %ld, mainxid: %ld, childxid: %ld, knownassigned: %ld, nooflo: %ld, slow: %ld\n", + xc_by_recent_xmin, + xc_by_known_xact, + xc_by_my_xact, + xc_by_latest_xid, + xc_by_main_xid, + xc_by_child_xid, + xc_by_known_assigned, + xc_no_overflow, + xc_slow_answer); +} +#endif /* XIDCACHE_DEBUG */ + +/* + * If rel != NULL, return test state appropriate for relation, otherwise + * return state usable for all relations. The latter may consider XIDs as + * not-yet-visible-to-everyone that a state for a specific relation would + * already consider visible-to-everyone. + * + * This needs to be called while a snapshot is active or registered, otherwise + * there are wraparound and other dangers. + * + * See comment for GlobalVisState for details. + */ +GlobalVisState * +GlobalVisTestFor(Relation rel) +{ + GlobalVisState *state = NULL; + + /* XXX: we should assert that a snapshot is pushed or registered */ + Assert(RecentXmin); + + switch (GlobalVisHorizonKindForRel(rel)) + { + case VISHORIZON_SHARED: + state = &GlobalVisSharedRels; + break; + case VISHORIZON_CATALOG: + state = &GlobalVisCatalogRels; + break; + case VISHORIZON_DATA: + state = &GlobalVisDataRels; + break; + case VISHORIZON_TEMP: + state = &GlobalVisTempRels; + break; + } + + Assert(FullTransactionIdIsValid(state->definitely_needed) && + FullTransactionIdIsValid(state->maybe_needed)); + + return state; +} + +/* + * Return true if it's worth updating the accurate maybe_needed boundary. + * + * As it is somewhat expensive to determine xmin horizons, we don't want to + * repeatedly do so when there is a low likelihood of it being beneficial. + * + * The current heuristic is that we update only if RecentXmin has changed + * since the last update. If the oldest currently running transaction has not + * finished, it is unlikely that recomputing the horizon would be useful. + */ +static bool +GlobalVisTestShouldUpdate(GlobalVisState *state) +{ + /* hasn't been updated yet */ + if (!TransactionIdIsValid(ComputeXidHorizonsResultLastXmin)) + return true; + + /* + * If the maybe_needed/definitely_needed boundaries are the same, it's + * unlikely to be beneficial to refresh boundaries. + */ + if (FullTransactionIdFollowsOrEquals(state->maybe_needed, + state->definitely_needed)) + return false; + + /* does the last snapshot built have a different xmin? 
*/ + return RecentXmin != ComputeXidHorizonsResultLastXmin; +} + +static void +GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons) +{ + GlobalVisSharedRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->shared_oldest_nonremovable); + GlobalVisCatalogRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->catalog_oldest_nonremovable); + GlobalVisDataRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->data_oldest_nonremovable); + GlobalVisTempRels.maybe_needed = + FullXidRelativeTo(horizons->latest_completed, + horizons->temp_oldest_nonremovable); + + /* + * In longer running transactions it's possible that transactions we + * previously needed to treat as running aren't around anymore. So update + * definitely_needed to not be earlier than maybe_needed. + */ + GlobalVisSharedRels.definitely_needed = + FullTransactionIdNewer(GlobalVisSharedRels.maybe_needed, + GlobalVisSharedRels.definitely_needed); + GlobalVisCatalogRels.definitely_needed = + FullTransactionIdNewer(GlobalVisCatalogRels.maybe_needed, + GlobalVisCatalogRels.definitely_needed); + GlobalVisDataRels.definitely_needed = + FullTransactionIdNewer(GlobalVisDataRels.maybe_needed, + GlobalVisDataRels.definitely_needed); + GlobalVisTempRels.definitely_needed = GlobalVisTempRels.maybe_needed; + + ComputeXidHorizonsResultLastXmin = RecentXmin; +} + +/* + * Update boundaries in GlobalVis{Shared,Catalog, Data}Rels + * using ComputeXidHorizons(). + */ +static void +GlobalVisUpdate(void) +{ + ComputeXidHorizonsResult horizons; + + /* updates the horizons as a side-effect */ + ComputeXidHorizons(&horizons); +} + +/* + * Return true if no snapshot still considers fxid to be running. + * + * The state passed needs to have been initialized for the relation fxid is + * from (NULL is also OK), otherwise the result may not be correct. + * + * See comment for GlobalVisState for details. + */ +bool +GlobalVisTestIsRemovableFullXid(GlobalVisState *state, + FullTransactionId fxid) +{ + /* + * If fxid is older than maybe_needed bound, it definitely is visible to + * everyone. + */ + if (FullTransactionIdPrecedes(fxid, state->maybe_needed)) + return true; + + /* + * If fxid is >= definitely_needed bound, it is very likely to still be + * considered running. + */ + if (FullTransactionIdFollowsOrEquals(fxid, state->definitely_needed)) + return false; + + /* + * fxid is between maybe_needed and definitely_needed, i.e. there might or + * might not exist a snapshot considering fxid running. If it makes sense, + * update boundaries and recheck. + */ + if (GlobalVisTestShouldUpdate(state)) + { + GlobalVisUpdate(); + + Assert(FullTransactionIdPrecedes(fxid, state->definitely_needed)); + + return FullTransactionIdPrecedes(fxid, state->maybe_needed); + } + else + return false; +} + +/* + * Wrapper around GlobalVisTestIsRemovableFullXid() for 32bit xids. + * + * It is crucial that this only gets called for xids from a source that + * protects against xid wraparounds (e.g. from a table and thus protected by + * relfrozenxid). + */ +bool +GlobalVisTestIsRemovableXid(GlobalVisState *state, TransactionId xid) +{ + FullTransactionId fxid; + + /* + * Convert 32 bit argument to FullTransactionId. We can do so safely + * because we know the xid has to, at the very least, be between + * [oldestXid, nextFullXid), i.e. within 2 billion of xid. 
To avoid taking + * a lock to determine either, we can just compare with + * state->definitely_needed, which was based on those value at the time + * the current snapshot was built. + */ + fxid = FullXidRelativeTo(state->definitely_needed, xid); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Return FullTransactionId below which all transactions are not considered + * running anymore. + * + * Note: This is less efficient than testing with + * GlobalVisTestIsRemovableFullXid as it likely requires building an accurate + * cutoff, even in the case all the XIDs compared with the cutoff are outside + * [maybe_needed, definitely_needed). + */ +FullTransactionId +GlobalVisTestNonRemovableFullHorizon(GlobalVisState *state) +{ + /* acquire accurate horizon if not already done */ + if (GlobalVisTestShouldUpdate(state)) + GlobalVisUpdate(); + + return state->maybe_needed; +} + +/* Convenience wrapper around GlobalVisTestNonRemovableFullHorizon */ +TransactionId +GlobalVisTestNonRemovableHorizon(GlobalVisState *state) +{ + FullTransactionId cutoff; + + cutoff = GlobalVisTestNonRemovableFullHorizon(state); + + return XidFromFullTransactionId(cutoff); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableFullXid(), see their comments. + */ +bool +GlobalVisCheckRemovableFullXid(Relation rel, FullTransactionId fxid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableFullXid(state, fxid); +} + +/* + * Convenience wrapper around GlobalVisTestFor() and + * GlobalVisTestIsRemovableXid(), see their comments. + */ +bool +GlobalVisCheckRemovableXid(Relation rel, TransactionId xid) +{ + GlobalVisState *state; + + state = GlobalVisTestFor(rel); + + return GlobalVisTestIsRemovableXid(state, xid); +} + +/* + * Convert a 32 bit transaction id into 64 bit transaction id, by assuming it + * is within MaxTransactionId / 2 of XidFromFullTransactionId(rel). + * + * Be very careful about when to use this function. It can only safely be used + * when there is a guarantee that xid is within MaxTransactionId / 2 xids of + * rel. That e.g. can be guaranteed if the caller assures a snapshot is + * held by the backend and xid is from a table (where vacuum/freezing ensures + * the xid has to be within that range), or if xid is from the procarray and + * prevents xid wraparound that way. + */ +static inline FullTransactionId +FullXidRelativeTo(FullTransactionId rel, TransactionId xid) +{ + TransactionId rel_xid = XidFromFullTransactionId(rel); + + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(rel_xid)); + + /* not guaranteed to find issues, but likely to catch mistakes */ + AssertTransactionIdInAllowableRange(xid); + + return FullTransactionIdFromU64(U64FromFullTransactionId(rel) + + (int32) (xid - rel_xid)); +} + + +/* ---------------------------------------------- + * KnownAssignedTransactionIds sub-module + * ---------------------------------------------- + */ + +/* + * In Hot Standby mode, we maintain a list of transactions that are (or were) + * running on the primary at the current point in WAL. These XIDs must be + * treated as running by standby transactions, even though they are not in + * the standby server's PGPROC array. + * + * We record all XIDs that we know have been assigned. That includes all the + * XIDs seen in WAL records, plus all unobserved XIDs that we can deduce have + * been assigned. 
We can deduce the existence of unobserved XIDs because we + * know XIDs are assigned in sequence, with no gaps. The KnownAssignedXids + * list expands as new XIDs are observed or inferred, and contracts when + * transaction completion records arrive. + * + * During hot standby we do not fret too much about the distinction between + * top-level XIDs and subtransaction XIDs. We store both together in the + * KnownAssignedXids list. In backends, this is copied into snapshots in + * GetSnapshotData(), taking advantage of the fact that XidInMVCCSnapshot() + * doesn't care about the distinction either. Subtransaction XIDs are + * effectively treated as top-level XIDs and in the typical case pg_subtrans + * links are *not* maintained (which does not affect visibility). + * + * We have room in KnownAssignedXids and in snapshots to hold maxProcs * + * (1 + PGPROC_MAX_CACHED_SUBXIDS) XIDs, so every primary transaction must + * report its subtransaction XIDs in a WAL XLOG_XACT_ASSIGNMENT record at + * least every PGPROC_MAX_CACHED_SUBXIDS. When we receive one of these + * records, we mark the subXIDs as children of the top XID in pg_subtrans, + * and then remove them from KnownAssignedXids. This prevents overflow of + * KnownAssignedXids and snapshots, at the cost that status checks for these + * subXIDs will take a slower path through TransactionIdIsInProgress(). + * This means that KnownAssignedXids is not necessarily complete for subXIDs, + * though it should be complete for top-level XIDs; this is the same situation + * that holds with respect to the PGPROC entries in normal running. + * + * When we throw away subXIDs from KnownAssignedXids, we need to keep track of + * that, similarly to tracking overflow of a PGPROC's subxids array. We do + * that by remembering the lastOverflowedXid, ie the last thrown-away subXID. + * As long as that is within the range of interesting XIDs, we have to assume + * that subXIDs are missing from snapshots. (Note that subXID overflow occurs + * on primary when 65th subXID arrives, whereas on standby it occurs when 64th + * subXID arrives - that is not an error.) + * + * Should a backend on primary somehow disappear before it can write an abort + * record, then we just leave those XIDs in KnownAssignedXids. They actually + * aborted but we think they were running; the distinction is irrelevant + * because either way any changes done by the transaction are not visible to + * backends in the standby. We prune KnownAssignedXids when + * XLOG_RUNNING_XACTS arrives, to forestall possible overflow of the + * array due to such dead XIDs. + */ + +/* + * RecordKnownAssignedTransactionIds + * Record the given XID in KnownAssignedXids, as well as any preceding + * unobserved XIDs. + * + * RecordKnownAssignedTransactionIds() should be run for *every* WAL record + * associated with a transaction. Must be called for each record after we + * have executed StartupCLOG() et al, since we must ExtendCLOG() etc.. + * + * Called during recovery in analogy with and in place of GetNewTransactionId() + */ +void +RecordKnownAssignedTransactionIds(TransactionId xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + Assert(TransactionIdIsValid(xid)); + Assert(TransactionIdIsValid(latestObservedXid)); + + elog(trace_recovery(DEBUG4), "record known xact %u latestObservedXid %u", + xid, latestObservedXid); + + /* + * When a newly observed xid arrives, it is frequently the case that it is + * *not* the next xid in sequence. 
When this occurs, we must treat the + * intervening xids as running also. + */ + if (TransactionIdFollows(xid, latestObservedXid)) + { + TransactionId next_expected_xid; + + /* + * Extend subtrans like we do in GetNewTransactionId() during normal + * operation using individual extend steps. Note that we do not need + * to extend clog since its extensions are WAL logged. + * + * This part has to be done regardless of standbyState since we + * immediately start assigning subtransactions to their toplevel + * transactions. + */ + next_expected_xid = latestObservedXid; + while (TransactionIdPrecedes(next_expected_xid, xid)) + { + TransactionIdAdvance(next_expected_xid); + ExtendSUBTRANS(next_expected_xid); + } + Assert(next_expected_xid == xid); + + /* + * If the KnownAssignedXids machinery isn't up yet, there's nothing + * more to do since we don't track assigned xids yet. + */ + if (standbyState <= STANDBY_INITIALIZED) + { + latestObservedXid = xid; + return; + } + + /* + * Add (latestObservedXid, xid] onto the KnownAssignedXids array. + */ + next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + KnownAssignedXidsAdd(next_expected_xid, xid, false); + + /* + * Now we can advance latestObservedXid + */ + latestObservedXid = xid; + + /* ShmemVariableCache->nextXid must be beyond any observed xid */ + AdvanceNextFullTransactionIdPastXid(latestObservedXid); + next_expected_xid = latestObservedXid; + TransactionIdAdvance(next_expected_xid); + } +} + +/* + * ExpireTreeKnownAssignedTransactionIds + * Remove the given XIDs from KnownAssignedXids. + * + * Called during recovery in analogy with and in place of ProcArrayEndTransaction() + */ +void +ExpireTreeKnownAssignedTransactionIds(TransactionId xid, int nsubxids, + TransactionId *subxids, TransactionId max_xid) +{ + Assert(standbyState >= STANDBY_INITIALIZED); + + /* + * Uses same locking as transaction commit + */ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + KnownAssignedXidsRemoveTree(xid, nsubxids, subxids); + + /* As in ProcArrayEndTransaction, advance latestCompletedXid */ + MaintainLatestCompletedXidRecovery(max_xid); + + /* ... and xactCompletionCount */ + ShmemVariableCache->xactCompletionCount++; + + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireAllKnownAssignedTransactionIds + * Remove all entries in KnownAssignedXids and reset lastOverflowedXid. + */ +void +ExpireAllKnownAssignedTransactionIds(void) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + KnownAssignedXidsRemovePreceding(InvalidTransactionId); + + /* + * Reset lastOverflowedXid. Currently, lastOverflowedXid has no use after + * the call of this function. But do this for unification with what + * ExpireOldKnownAssignedTransactionIds() do. + */ + procArray->lastOverflowedXid = InvalidTransactionId; + LWLockRelease(ProcArrayLock); +} + +/* + * ExpireOldKnownAssignedTransactionIds + * Remove KnownAssignedXids entries preceding the given XID and + * potentially reset lastOverflowedXid. + */ +void +ExpireOldKnownAssignedTransactionIds(TransactionId xid) +{ + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + /* + * Reset lastOverflowedXid if we know all transactions that have been + * possibly running are being gone. Not doing so could cause an incorrect + * lastOverflowedXid value, which makes extra snapshots be marked as + * suboverflowed. 
+ */ + if (TransactionIdPrecedes(procArray->lastOverflowedXid, xid)) + procArray->lastOverflowedXid = InvalidTransactionId; + KnownAssignedXidsRemovePreceding(xid); + LWLockRelease(ProcArrayLock); +} + + +/* + * Private module functions to manipulate KnownAssignedXids + * + * There are 5 main uses of the KnownAssignedXids data structure: + * + * * backends taking snapshots - all valid XIDs need to be copied out + * * backends seeking to determine presence of a specific XID + * * startup process adding new known-assigned XIDs + * * startup process removing specific XIDs as transactions end + * * startup process pruning array when special WAL records arrive + * + * This data structure is known to be a hot spot during Hot Standby, so we + * go to some lengths to make these operations as efficient and as concurrent + * as possible. + * + * The XIDs are stored in an array in sorted order --- TransactionIdPrecedes + * order, to be exact --- to allow binary search for specific XIDs. Note: + * in general TransactionIdPrecedes would not provide a total order, but + * we know that the entries present at any instant should not extend across + * a large enough fraction of XID space to wrap around (the primary would + * shut down for fear of XID wrap long before that happens). So it's OK to + * use TransactionIdPrecedes as a binary-search comparator. + * + * It's cheap to maintain the sortedness during insertions, since new known + * XIDs are always reported in XID order; we just append them at the right. + * + * To keep individual deletions cheap, we need to allow gaps in the array. + * This is implemented by marking array elements as valid or invalid using + * the parallel boolean array KnownAssignedXidsValid[]. A deletion is done + * by setting KnownAssignedXidsValid[i] to false, *without* clearing the + * XID entry itself. This preserves the property that the XID entries are + * sorted, so we can do binary searches easily. Periodically we compress + * out the unused entries; that's much cheaper than having to compress the + * array immediately on every deletion. + * + * The actually valid items in KnownAssignedXids[] and KnownAssignedXidsValid[] + * are those with indexes tail <= i < head; items outside this subscript range + * have unspecified contents. When head reaches the end of the array, we + * force compression of unused entries rather than wrapping around, since + * allowing wraparound would greatly complicate the search logic. We maintain + * an explicit tail pointer so that pruning of old XIDs can be done without + * immediately moving the array contents. In most cases only a small fraction + * of the array contains valid entries at any instant. + * + * Although only the startup process can ever change the KnownAssignedXids + * data structure, we still need interlocking so that standby backends will + * not observe invalid intermediate states. The convention is that backends + * must hold shared ProcArrayLock to examine the array. To remove XIDs from + * the array, the startup process must hold ProcArrayLock exclusively, for + * the usual transactional reasons (compare commit/abort of a transaction + * during normal running). Compressing unused entries out of the array + * likewise requires exclusive lock. To add XIDs to the array, we just insert + * them into slots to the right of the head pointer and then advance the head + * pointer. 
This wouldn't require any lock at all, except that on machines + * with weak memory ordering we need to be careful that other processors + * see the array element changes before they see the head pointer change. + * We handle this by using a spinlock to protect reads and writes of the + * head/tail pointers. (We could dispense with the spinlock if we were to + * create suitable memory access barrier primitives and use those instead.) + * The spinlock must be taken to read or write the head/tail pointers unless + * the caller holds ProcArrayLock exclusively. + * + * Algorithmic analysis: + * + * If we have a maximum of M slots, with N XIDs currently spread across + * S elements then we have N <= S <= M always. + * + * * Adding a new XID is O(1) and needs little locking (unless compression + * must happen) + * * Compressing the array is O(S) and requires exclusive lock + * * Removing an XID is O(logS) and requires exclusive lock + * * Taking a snapshot is O(S) and requires shared lock + * * Checking for an XID is O(logS) and requires shared lock + * + * In comparison, using a hash table for KnownAssignedXids would mean that + * taking snapshots would be O(M). If we can maintain S << M then the + * sorted array technique will deliver significantly faster snapshots. + * If we try to keep S too small then we will spend too much time compressing, + * so there is an optimal point for any workload mix. We use a heuristic to + * decide when to compress the array, though trimming also helps reduce + * frequency of compressing. The heuristic requires us to track the number of + * currently valid XIDs in the array. + */ + + +/* + * Compress KnownAssignedXids by shifting valid data down to the start of the + * array, removing any gaps. + * + * A compression step is forced if "force" is true, otherwise we do it + * only if a heuristic indicates it's a good time to do it. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsCompress(bool force) +{ + ProcArrayStruct *pArray = procArray; + int head, + tail; + int compress_index; + int i; + + /* no spinlock required since we hold ProcArrayLock exclusively */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + + if (!force) + { + /* + * If we can choose how much to compress, use a heuristic to avoid + * compressing too often or not often enough. + * + * Heuristic is if we have a large enough current spread and less than + * 50% of the elements are currently in use, then compress. This + * should ensure we compress fairly infrequently. We could compress + * less often though the virtual array would spread out more and + * snapshots would become more expensive. + */ + int nelements = head - tail; + + if (nelements < 4 * PROCARRAY_MAXPROCS || + nelements < 2 * pArray->numKnownAssignedXids) + return; + } + + /* + * We compress the array by reading the valid values from tail to head, + * re-aligning data to 0th element. + */ + compress_index = 0; + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + KnownAssignedXids[compress_index] = KnownAssignedXids[i]; + KnownAssignedXidsValid[compress_index] = true; + compress_index++; + } + } + + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = compress_index; +} + +/* + * Add xids into KnownAssignedXids at the head of the array. + * + * xids from from_xid to to_xid, inclusive, are added to the array. 
+ * + * If exclusive_lock is true then caller already holds ProcArrayLock in + * exclusive mode, so we need no extra locking here. Else caller holds no + * lock, so we need to be sure we maintain sufficient interlocks against + * concurrent readers. (Only the startup process ever calls this, so no need + * to worry about concurrent writers.) + */ +static void +KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, + bool exclusive_lock) +{ + ProcArrayStruct *pArray = procArray; + TransactionId next_xid; + int head, + tail; + int nxids; + int i; + + Assert(TransactionIdPrecedesOrEquals(from_xid, to_xid)); + + /* + * Calculate how many array slots we'll need. Normally this is cheap; in + * the unusual case where the XIDs cross the wrap point, we do it the hard + * way. + */ + if (to_xid >= from_xid) + nxids = to_xid - from_xid + 1; + else + { + nxids = 1; + next_xid = from_xid; + while (TransactionIdPrecedes(next_xid, to_xid)) + { + nxids++; + TransactionIdAdvance(next_xid); + } + } + + /* + * Since only the startup process modifies the head/tail pointers, we + * don't need a lock to read them here. + */ + head = pArray->headKnownAssignedXids; + tail = pArray->tailKnownAssignedXids; + + Assert(head >= 0 && head <= pArray->maxKnownAssignedXids); + Assert(tail >= 0 && tail < pArray->maxKnownAssignedXids); + + /* + * Verify that insertions occur in TransactionId sequence. Note that even + * if the last existing element is marked invalid, it must still have a + * correctly sequenced XID value. + */ + if (head > tail && + TransactionIdFollowsOrEquals(KnownAssignedXids[head - 1], from_xid)) + { + KnownAssignedXidsDisplay(LOG); + elog(ERROR, "out-of-order XID insertion in KnownAssignedXids"); + } + + /* + * If our xids won't fit in the remaining space, compress out free space + */ + if (head + nxids > pArray->maxKnownAssignedXids) + { + /* must hold lock to compress */ + if (!exclusive_lock) + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + KnownAssignedXidsCompress(true); + + head = pArray->headKnownAssignedXids; + /* note: we no longer care about the tail pointer */ + + if (!exclusive_lock) + LWLockRelease(ProcArrayLock); + + /* + * If it still won't fit then we're out of memory + */ + if (head + nxids > pArray->maxKnownAssignedXids) + elog(ERROR, "too many KnownAssignedXids"); + } + + /* Now we can insert the xids into the space starting at head */ + next_xid = from_xid; + for (i = 0; i < nxids; i++) + { + KnownAssignedXids[head] = next_xid; + KnownAssignedXidsValid[head] = true; + TransactionIdAdvance(next_xid); + head++; + } + + /* Adjust count of number of valid entries */ + pArray->numKnownAssignedXids += nxids; + + /* + * Now update the head pointer. We use a spinlock to protect this + * pointer, not because the update is likely to be non-atomic, but to + * ensure that other processors see the above array updates before they + * see the head pointer change. + * + * If we're holding ProcArrayLock exclusively, there's no need to take the + * spinlock. + */ + if (exclusive_lock) + pArray->headKnownAssignedXids = head; + else + { + SpinLockAcquire(&pArray->known_assigned_xids_lck); + pArray->headKnownAssignedXids = head; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } +} + +/* + * KnownAssignedXidsSearch + * + * Searches KnownAssignedXids for a specific xid and optionally removes it. + * Returns true if it was found, false if not. + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + * Exclusive lock must be held for remove = true. 
+ */ +static bool +KnownAssignedXidsSearch(TransactionId xid, bool remove) +{ + ProcArrayStruct *pArray = procArray; + int first, + last; + int head; + int tail; + int result_index = -1; + + if (remove) + { + /* we hold ProcArrayLock exclusively, so no need for spinlock */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + } + else + { + /* take spinlock to ensure we see up-to-date array contents */ + SpinLockAcquire(&pArray->known_assigned_xids_lck); + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + SpinLockRelease(&pArray->known_assigned_xids_lck); + } + + /* + * Standard binary search. Note we can ignore the KnownAssignedXidsValid + * array here, since even invalid entries will contain sorted XIDs. + */ + first = tail; + last = head - 1; + while (first <= last) + { + int mid_index; + TransactionId mid_xid; + + mid_index = (first + last) / 2; + mid_xid = KnownAssignedXids[mid_index]; + + if (xid == mid_xid) + { + result_index = mid_index; + break; + } + else if (TransactionIdPrecedes(xid, mid_xid)) + last = mid_index - 1; + else + first = mid_index + 1; + } + + if (result_index < 0) + return false; /* not in array */ + + if (!KnownAssignedXidsValid[result_index]) + return false; /* in array, but invalid */ + + if (remove) + { + KnownAssignedXidsValid[result_index] = false; + + pArray->numKnownAssignedXids--; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * If we're removing the tail element then advance tail pointer over + * any invalid elements. This will speed future searches. + */ + if (result_index == tail) + { + tail++; + while (tail < head && !KnownAssignedXidsValid[tail]) + tail++; + if (tail >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = tail; + } + } + } + + return true; +} + +/* + * Is the specified XID present in KnownAssignedXids[]? + * + * Caller must hold ProcArrayLock in shared or exclusive mode. + */ +static bool +KnownAssignedXidExists(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + return KnownAssignedXidsSearch(xid, false); +} + +/* + * Remove the specified XID from KnownAssignedXids[]. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemove(TransactionId xid) +{ + Assert(TransactionIdIsValid(xid)); + + elog(trace_recovery(DEBUG4), "remove KnownAssignedXid %u", xid); + + /* + * Note: we cannot consider it an error to remove an XID that's not + * present. We intentionally remove subxact IDs while processing + * XLOG_XACT_ASSIGNMENT, to avoid array overflow. Then those XIDs will be + * removed again when the top-level xact commits or aborts. + * + * It might be possible to track such XIDs to distinguish this case from + * actual errors, but it would be complicated and probably not worth it. + * So, just ignore the search result. + */ + (void) KnownAssignedXidsSearch(xid, true); +} + +/* + * KnownAssignedXidsRemoveTree + * Remove xid (if it's not InvalidTransactionId) and all the subxids. + * + * Caller must hold ProcArrayLock in exclusive mode. 
+ */ +static void +KnownAssignedXidsRemoveTree(TransactionId xid, int nsubxids, + TransactionId *subxids) +{ + int i; + + if (TransactionIdIsValid(xid)) + KnownAssignedXidsRemove(xid); + + for (i = 0; i < nsubxids; i++) + KnownAssignedXidsRemove(subxids[i]); + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(false); +} + +/* + * Prune KnownAssignedXids up to, but *not* including xid. If xid is invalid + * then clear the whole table. + * + * Caller must hold ProcArrayLock in exclusive mode. + */ +static void +KnownAssignedXidsRemovePreceding(TransactionId removeXid) +{ + ProcArrayStruct *pArray = procArray; + int count = 0; + int head, + tail, + i; + + if (!TransactionIdIsValid(removeXid)) + { + elog(trace_recovery(DEBUG4), "removing all KnownAssignedXids"); + pArray->numKnownAssignedXids = 0; + pArray->headKnownAssignedXids = pArray->tailKnownAssignedXids = 0; + return; + } + + elog(trace_recovery(DEBUG4), "prune KnownAssignedXids to %u", removeXid); + + /* + * Mark entries invalid starting at the tail. Since array is sorted, we + * can stop as soon as we reach an entry >= removeXid. + */ + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + if (TransactionIdFollowsOrEquals(knownXid, removeXid)) + break; + + if (!StandbyTransactionIdIsPrepared(knownXid)) + { + KnownAssignedXidsValid[i] = false; + count++; + } + } + } + + pArray->numKnownAssignedXids -= count; + Assert(pArray->numKnownAssignedXids >= 0); + + /* + * Advance the tail pointer if we've marked the tail item invalid. + */ + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + break; + } + if (i >= head) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = i; + } + + /* Opportunistically compress the array */ + KnownAssignedXidsCompress(false); +} + +/* + * KnownAssignedXidsGet - Get an array of xids by scanning KnownAssignedXids. + * We filter out anything >= xmax. + * + * Returns the number of XIDs stored into xarray[]. Caller is responsible + * that array is large enough. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGet(TransactionId *xarray, TransactionId xmax) +{ + TransactionId xtmp = InvalidTransactionId; + + return KnownAssignedXidsGetAndSetXmin(xarray, &xtmp, xmax); +} + +/* + * KnownAssignedXidsGetAndSetXmin - as KnownAssignedXidsGet, plus + * we reduce *xmin to the lowest xid value seen if not already lower. + * + * Caller must hold ProcArrayLock in (at least) shared mode. + */ +static int +KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, + TransactionId xmax) +{ + int count = 0; + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. We can stop + * once we reach the initially seen head, since we are certain that an xid + * cannot enter and then leave the array while we hold ProcArrayLock. We + * might miss newly-added xids, but they should be >= xmax so irrelevant + * anyway. + * + * Must take spinlock to ensure we see up-to-date array contents. 
+ */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + { + TransactionId knownXid = KnownAssignedXids[i]; + + /* + * Update xmin if required. Only the first XID need be checked, + * since the array is sorted. + */ + if (count == 0 && + TransactionIdPrecedes(knownXid, *xmin)) + *xmin = knownXid; + + /* + * Filter out anything >= xmax, again relying on sorted property + * of array. + */ + if (TransactionIdIsValid(xmax) && + TransactionIdFollowsOrEquals(knownXid, xmax)) + break; + + /* Add knownXid into output array */ + xarray[count++] = knownXid; + } + } + + return count; +} + +/* + * Get oldest XID in the KnownAssignedXids array, or InvalidTransactionId + * if nothing there. + */ +static TransactionId +KnownAssignedXidsGetOldestXmin(void) +{ + int head, + tail; + int i; + + /* + * Fetch head just once, since it may change while we loop. + */ + SpinLockAcquire(&procArray->known_assigned_xids_lck); + tail = procArray->tailKnownAssignedXids; + head = procArray->headKnownAssignedXids; + SpinLockRelease(&procArray->known_assigned_xids_lck); + + for (i = tail; i < head; i++) + { + /* Skip any gaps in the array */ + if (KnownAssignedXidsValid[i]) + return KnownAssignedXids[i]; + } + + return InvalidTransactionId; +} + +/* + * Display KnownAssignedXids to provide debug trail + * + * Currently this is only called within startup process, so we need no + * special locking. + * + * Note this is pretty expensive, and much of the expense will be incurred + * even if the elog message will get discarded. It's not currently called + * in any performance-critical places, however, so no need to be tenser. 
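+ *
+ * Purely as an illustration (the XID values are invented), a trace line
+ * for an array holding three valid entries spread across indexes 2..6
+ * would look like this; the skipped indexes are entries whose
+ * KnownAssignedXidsValid[] flag has been cleared but that have not been
+ * compressed away yet:
+ *
+ *     3 KnownAssignedXids (num=3 tail=2 head=7) [2]=1021 [4]=1022 [6]=1023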
+ */ +static void +KnownAssignedXidsDisplay(int trace_level) +{ + ProcArrayStruct *pArray = procArray; + StringInfoData buf; + int head, + tail, + i; + int nxids = 0; + + tail = pArray->tailKnownAssignedXids; + head = pArray->headKnownAssignedXids; + + initStringInfo(&buf); + + for (i = tail; i < head; i++) + { + if (KnownAssignedXidsValid[i]) + { + nxids++; + appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); + } + } + + elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s", + nxids, + pArray->numKnownAssignedXids, + pArray->tailKnownAssignedXids, + pArray->headKnownAssignedXids, + buf.data); + + pfree(buf.data); +} + +/* + * KnownAssignedXidsReset + * Resets KnownAssignedXids to be empty + */ +static void +KnownAssignedXidsReset(void) +{ + ProcArrayStruct *pArray = procArray; + + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + + pArray->numKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + pArray->headKnownAssignedXids = 0; + + LWLockRelease(ProcArrayLock); +} diff --git a/src/backend/storage/ipc/procsignal.c b/src/backend/storage/ipc/procsignal.c new file mode 100644 index 0000000..defb75a --- /dev/null +++ b/src/backend/storage/ipc/procsignal.c @@ -0,0 +1,685 @@ +/*------------------------------------------------------------------------- + * + * procsignal.c + * Routines for interprocess signaling + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/procsignal.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/parallel.h" +#include "port/pg_bitutils.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "replication/walsender.h" +#include "storage/condition_variable.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "storage/sinval.h" +#include "tcop/tcopprot.h" +#include "utils/memutils.h" + +/* + * The SIGUSR1 signal is multiplexed to support signaling multiple event + * types. The specific reason is communicated via flags in shared memory. + * We keep a boolean flag for each possible "reason", so that different + * reasons can be signaled to a process concurrently. (However, if the same + * reason is signaled more than once nearly simultaneously, the process may + * observe it only once.) + * + * Each process that wants to receive signals registers its process ID + * in the ProcSignalSlots array. The array is indexed by backend ID to make + * slot allocation simple, and to avoid having to search the array when you + * know the backend ID of the process you're signaling. (We do support + * signaling without backend ID, but it's a bit less efficient.) + * + * The flags are actually declared as "volatile sig_atomic_t" for maximum + * portability. This should ensure that loads and stores of the flag + * values are atomic, allowing us to dispense with any explicit locking. + * + * pss_signalFlags are intended to be set in cases where we don't need to + * keep track of whether or not the target process has handled the signal, + * but sometimes we need confirmation, as when making a global state change + * that cannot be considered complete until all backends have taken notice + * of it. 
For such use cases, we set a bit in pss_barrierCheckMask and then + * increment the current "barrier generation"; when the new barrier generation + * (or greater) appears in the pss_barrierGeneration flag of every process, + * we know that the message has been received everywhere. + */ +typedef struct +{ + volatile pid_t pss_pid; + volatile sig_atomic_t pss_signalFlags[NUM_PROCSIGNALS]; + pg_atomic_uint64 pss_barrierGeneration; + pg_atomic_uint32 pss_barrierCheckMask; + ConditionVariable pss_barrierCV; +} ProcSignalSlot; + +/* + * Information that is global to the entire ProcSignal system can be stored + * here. + * + * psh_barrierGeneration is the highest barrier generation in existence. + */ +typedef struct +{ + pg_atomic_uint64 psh_barrierGeneration; + ProcSignalSlot psh_slot[FLEXIBLE_ARRAY_MEMBER]; +} ProcSignalHeader; + +/* + * We reserve a slot for each possible BackendId, plus one for each + * possible auxiliary process type. (This scheme assumes there is not + * more than one of any auxiliary process type at a time.) + */ +#define NumProcSignalSlots (MaxBackends + NUM_AUXPROCTYPES) + +/* Check whether the relevant type bit is set in the flags. */ +#define BARRIER_SHOULD_CHECK(flags, type) \ + (((flags) & (((uint32) 1) << (uint32) (type))) != 0) + +/* Clear the relevant type bit from the flags. */ +#define BARRIER_CLEAR_BIT(flags, type) \ + ((flags) &= ~(((uint32) 1) << (uint32) (type))) + +static ProcSignalHeader *ProcSignal = NULL; +static ProcSignalSlot *MyProcSignalSlot = NULL; + +static bool CheckProcSignal(ProcSignalReason reason); +static void CleanupProcSignalState(int status, Datum arg); +static void ResetProcSignalBarrierBits(uint32 flags); +static bool ProcessBarrierPlaceholder(void); + +/* + * ProcSignalShmemSize + * Compute space needed for procsignal's shared memory + */ +Size +ProcSignalShmemSize(void) +{ + Size size; + + size = mul_size(NumProcSignalSlots, sizeof(ProcSignalSlot)); + size = add_size(size, offsetof(ProcSignalHeader, psh_slot)); + return size; +} + +/* + * ProcSignalShmemInit + * Allocate and initialize procsignal's shared memory + */ +void +ProcSignalShmemInit(void) +{ + Size size = ProcSignalShmemSize(); + bool found; + + ProcSignal = (ProcSignalHeader *) + ShmemInitStruct("ProcSignal", size, &found); + + /* If we're first, initialize. */ + if (!found) + { + int i; + + pg_atomic_init_u64(&ProcSignal->psh_barrierGeneration, 0); + + for (i = 0; i < NumProcSignalSlots; ++i) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + slot->pss_pid = 0; + MemSet(slot->pss_signalFlags, 0, sizeof(slot->pss_signalFlags)); + pg_atomic_init_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + pg_atomic_init_u32(&slot->pss_barrierCheckMask, 0); + ConditionVariableInit(&slot->pss_barrierCV); + } + } +} + +/* + * ProcSignalInit + * Register the current process in the procsignal array + * + * The passed index should be my BackendId if the process has one, + * or MaxBackends + aux process type if not. + */ +void +ProcSignalInit(int pss_idx) +{ + ProcSignalSlot *slot; + uint64 barrier_generation; + + Assert(pss_idx >= 1 && pss_idx <= NumProcSignalSlots); + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + + /* sanity check */ + if (slot->pss_pid != 0) + elog(LOG, "process %d taking over ProcSignal slot %d, but it's not empty", + MyProcPid, pss_idx); + + /* Clear out any leftover signal reasons */ + MemSet(slot->pss_signalFlags, 0, NUM_PROCSIGNALS * sizeof(sig_atomic_t)); + + /* + * Initialize barrier state. 
Since we're a brand-new process, there + * shouldn't be any leftover backend-private state that needs to be + * updated. Therefore, we can broadcast the latest barrier generation and + * disregard any previously-set check bits. + * + * NB: This only works if this initialization happens early enough in the + * startup sequence that we haven't yet cached any state that might need + * to be invalidated. That's also why we have a memory barrier here, to be + * sure that any later reads of memory happen strictly after this. + */ + pg_atomic_write_u32(&slot->pss_barrierCheckMask, 0); + barrier_generation = + pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + pg_atomic_write_u64(&slot->pss_barrierGeneration, barrier_generation); + pg_memory_barrier(); + + /* Mark slot with my PID */ + slot->pss_pid = MyProcPid; + + /* Remember slot location for CheckProcSignal */ + MyProcSignalSlot = slot; + + /* Set up to release the slot on process exit */ + on_shmem_exit(CleanupProcSignalState, Int32GetDatum(pss_idx)); +} + +/* + * CleanupProcSignalState + * Remove current process from ProcSignal mechanism + * + * This function is called via on_shmem_exit() during backend shutdown. + */ +static void +CleanupProcSignalState(int status, Datum arg) +{ + int pss_idx = DatumGetInt32(arg); + ProcSignalSlot *slot; + + slot = &ProcSignal->psh_slot[pss_idx - 1]; + Assert(slot == MyProcSignalSlot); + + /* + * Clear MyProcSignalSlot, so that a SIGUSR1 received after this point + * won't try to access it after it's no longer ours (and perhaps even + * after we've unmapped the shared memory segment). + */ + MyProcSignalSlot = NULL; + + /* sanity check */ + if (slot->pss_pid != MyProcPid) + { + /* + * don't ERROR here. We're exiting anyway, and don't want to get into + * infinite loop trying to exit + */ + elog(LOG, "process %d releasing ProcSignal slot %d, but it contains %d", + MyProcPid, pss_idx, (int) slot->pss_pid); + return; /* XXX better to zero the slot anyway? */ + } + + /* + * Make this slot look like it's absorbed all possible barriers, so that + * no barrier waits block on it. + */ + pg_atomic_write_u64(&slot->pss_barrierGeneration, PG_UINT64_MAX); + ConditionVariableBroadcast(&slot->pss_barrierCV); + + slot->pss_pid = 0; +} + +/* + * SendProcSignal + * Send a signal to a Postgres process + * + * Providing backendId is optional, but it will speed up the operation. + * + * On success (a signal was sent), zero is returned. + * On error, -1 is returned, and errno is set (typically to ESRCH or EPERM). + * + * Not to be confused with ProcSendSignal + */ +int +SendProcSignal(pid_t pid, ProcSignalReason reason, BackendId backendId) +{ + volatile ProcSignalSlot *slot; + + if (backendId != InvalidBackendId) + { + slot = &ProcSignal->psh_slot[backendId - 1]; + + /* + * Note: Since there's no locking, it's possible that the target + * process detaches from shared memory and exits right after this + * test, before we set the flag and send signal. And the signal slot + * might even be recycled by a new process, so it's remotely possible + * that we set a flag for a wrong process. That's OK, all the signals + * are such that no harm is done if they're mistakenly fired. + */ + if (slot->pss_pid == pid) + { + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + else + { + /* + * BackendId not provided, so search the array using pid. We search + * the array back to front so as to reduce search overhead. 
Passing + * InvalidBackendId means that the target is most likely an auxiliary + * process, which will have a slot near the end of the array. + */ + int i; + + for (i = NumProcSignalSlots - 1; i >= 0; i--) + { + slot = &ProcSignal->psh_slot[i]; + + if (slot->pss_pid == pid) + { + /* the above note about race conditions applies here too */ + + /* Atomically set the proper flag */ + slot->pss_signalFlags[reason] = true; + /* Send signal */ + return kill(pid, SIGUSR1); + } + } + } + + errno = ESRCH; + return -1; +} + +/* + * EmitProcSignalBarrier + * Send a signal to every Postgres process + * + * The return value of this function is the barrier "generation" created + * by this operation. This value can be passed to WaitForProcSignalBarrier + * to wait until it is known that every participant in the ProcSignal + * mechanism has absorbed the signal (or started afterwards). + * + * Note that it would be a bad idea to use this for anything that happens + * frequently, as interrupting every backend could cause a noticeable + * performance hit. + * + * Callers are entitled to assume that this function will not throw ERROR + * or FATAL. + */ +uint64 +EmitProcSignalBarrier(ProcSignalBarrierType type) +{ + uint32 flagbit = 1 << (uint32) type; + uint64 generation; + + /* + * Set all the flags. + * + * Note that pg_atomic_fetch_or_u32 has full barrier semantics, so this is + * totally ordered with respect to anything the caller did before, and + * anything that we do afterwards. (This is also true of the later call to + * pg_atomic_add_fetch_u64.) + */ + for (int i = 0; i < NumProcSignalSlots; i++) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + + pg_atomic_fetch_or_u32(&slot->pss_barrierCheckMask, flagbit); + } + + /* + * Increment the generation counter. + */ + generation = + pg_atomic_add_fetch_u64(&ProcSignal->psh_barrierGeneration, 1); + + /* + * Signal all the processes, so that they update their advertised barrier + * generation. + * + * Concurrency is not a problem here. Backends that have exited don't + * matter, and new backends that have joined since we entered this + * function must already have current state, since the caller is + * responsible for making sure that the relevant state is entirely visible + * before calling this function in the first place. We still have to wake + * them up - because we can't distinguish between such backends and older + * backends that need to update state - but they won't actually need to + * change any state. + */ + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + volatile ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + pid_t pid = slot->pss_pid; + + if (pid != 0) + { + /* see SendProcSignal for details */ + slot->pss_signalFlags[PROCSIG_BARRIER] = true; + kill(pid, SIGUSR1); + } + } + + return generation; +} + +/* + * WaitForProcSignalBarrier - wait until it is guaranteed that all changes + * requested by a specific call to EmitProcSignalBarrier() have taken effect. + */ +void +WaitForProcSignalBarrier(uint64 generation) +{ + Assert(generation <= pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration)); + + for (int i = NumProcSignalSlots - 1; i >= 0; i--) + { + ProcSignalSlot *slot = &ProcSignal->psh_slot[i]; + uint64 oldval; + + /* + * It's important that we check only pss_barrierGeneration here and + * not pss_barrierCheckMask. Bits in pss_barrierCheckMask get cleared + * before the barrier is actually absorbed, but pss_barrierGeneration + * is updated only afterward. 
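+ *
+ * (Put differently: a cleared check bit does not yet prove that the
+ * barrier has been absorbed; the generation bump is what proves it, so
+ * the generation is the thing to wait on.)
+ *
+ * For context, an illustrative (not literal) calling pattern for the two
+ * public functions looks like this:
+ *
+ *		gen = EmitProcSignalBarrier(PROCSIGNAL_BARRIER_PLACEHOLDER);
+ *		WaitForProcSignalBarrier(gen);
+ *		... every live backend has now absorbed the barrier ...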
+ */ + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + while (oldval < generation) + { + ConditionVariableSleep(&slot->pss_barrierCV, + WAIT_EVENT_PROC_SIGNAL_BARRIER); + oldval = pg_atomic_read_u64(&slot->pss_barrierGeneration); + } + ConditionVariableCancelSleep(); + } + + /* + * The caller is probably calling this function because it wants to read + * the shared state or perform further writes to shared state once all + * backends are known to have absorbed the barrier. However, the read of + * pss_barrierGeneration was performed unlocked; insert a memory barrier + * to separate it from whatever follows. + */ + pg_memory_barrier(); +} + +/* + * Handle receipt of an interrupt indicating a global barrier event. + * + * All the actual work is deferred to ProcessProcSignalBarrier(), because we + * cannot safely access the barrier generation inside the signal handler as + * 64bit atomics might use spinlock based emulation, even for reads. As this + * routine only gets called when PROCSIG_BARRIER is sent that won't cause a + * lot of unnecessary work. + */ +static void +HandleProcSignalBarrierInterrupt(void) +{ + InterruptPending = true; + ProcSignalBarrierPending = true; + /* latch will be set by procsignal_sigusr1_handler */ +} + +/* + * Perform global barrier related interrupt checking. + * + * Any backend that participates in ProcSignal signaling must arrange to + * call this function periodically. It is called from CHECK_FOR_INTERRUPTS(), + * which is enough for normal backends, but not necessarily for all types of + * background processes. + */ +void +ProcessProcSignalBarrier(void) +{ + uint64 local_gen; + uint64 shared_gen; + volatile uint32 flags; + + Assert(MyProcSignalSlot); + + /* Exit quickly if there's no work to do. */ + if (!ProcSignalBarrierPending) + return; + ProcSignalBarrierPending = false; + + /* + * It's not unlikely to process multiple barriers at once, before the + * signals for all the barriers have arrived. To avoid unnecessary work in + * response to subsequent signals, exit early if we already have processed + * all of them. + */ + local_gen = pg_atomic_read_u64(&MyProcSignalSlot->pss_barrierGeneration); + shared_gen = pg_atomic_read_u64(&ProcSignal->psh_barrierGeneration); + + Assert(local_gen <= shared_gen); + + if (local_gen == shared_gen) + return; + + /* + * Get and clear the flags that are set for this backend. Note that + * pg_atomic_exchange_u32 is a full barrier, so we're guaranteed that the + * read of the barrier generation above happens before we atomically + * extract the flags, and that any subsequent state changes happen + * afterward. + * + * NB: In order to avoid race conditions, we must zero + * pss_barrierCheckMask first and only afterwards try to do barrier + * processing. If we did it in the other order, someone could send us + * another barrier of some type right after we called the + * barrier-processing function but before we cleared the bit. We would + * have no way of knowing that the bit needs to stay set in that case, so + * the need to call the barrier-processing function again would just get + * forgotten. So instead, we tentatively clear all the bits and then put + * back any for which we don't manage to successfully absorb the barrier. + */ + flags = pg_atomic_exchange_u32(&MyProcSignalSlot->pss_barrierCheckMask, 0); + + /* + * If there are no flags set, then we can skip doing any real work. 
+ * Otherwise, establish a PG_TRY block, so that we don't lose track of + * which types of barrier processing are needed if an ERROR occurs. + */ + if (flags != 0) + { + bool success = true; + + PG_TRY(); + { + /* + * Process each type of barrier. The barrier-processing functions + * should normally return true, but may return false if the + * barrier can't be absorbed at the current time. This should be + * rare, because it's pretty expensive. Every single + * CHECK_FOR_INTERRUPTS() will return here until we manage to + * absorb the barrier, and that cost will add up in a hurry. + * + * NB: It ought to be OK to call the barrier-processing functions + * unconditionally, but it's more efficient to call only the ones + * that might need us to do something based on the flags. + */ + while (flags != 0) + { + ProcSignalBarrierType type; + bool processed = true; + + type = (ProcSignalBarrierType) pg_rightmost_one_pos32(flags); + switch (type) + { + case PROCSIGNAL_BARRIER_PLACEHOLDER: + processed = ProcessBarrierPlaceholder(); + break; + } + + /* + * To avoid an infinite loop, we must always unset the bit in + * flags. + */ + BARRIER_CLEAR_BIT(flags, type); + + /* + * If we failed to process the barrier, reset the shared bit + * so we try again later, and set a flag so that we don't bump + * our generation. + */ + if (!processed) + { + ResetProcSignalBarrierBits(((uint32) 1) << type); + success = false; + } + } + } + PG_CATCH(); + { + /* + * If an ERROR occurred, we'll need to try again later to handle + * that barrier type and any others that haven't been handled yet + * or weren't successfully absorbed. + */ + ResetProcSignalBarrierBits(flags); + PG_RE_THROW(); + } + PG_END_TRY(); + + /* + * If some barrier types were not successfully absorbed, we will have + * to try again later. + */ + if (!success) + return; + } + + /* + * State changes related to all types of barriers that might have been + * emitted have now been handled, so we can update our notion of the + * generation to the one we observed before beginning the updates. If + * things have changed further, it'll get fixed up when this function is + * next called. + */ + pg_atomic_write_u64(&MyProcSignalSlot->pss_barrierGeneration, shared_gen); + ConditionVariableBroadcast(&MyProcSignalSlot->pss_barrierCV); +} + +/* + * If it turns out that we couldn't absorb one or more barrier types, either + * because the barrier-processing functions returned false or due to an error, + * arrange for processing to be retried later. + */ +static void +ResetProcSignalBarrierBits(uint32 flags) +{ + pg_atomic_fetch_or_u32(&MyProcSignalSlot->pss_barrierCheckMask, flags); + ProcSignalBarrierPending = true; + InterruptPending = true; +} + +static bool +ProcessBarrierPlaceholder(void) +{ + /* + * XXX. This is just a placeholder until the first real user of this + * machinery gets committed. Rename PROCSIGNAL_BARRIER_PLACEHOLDER to + * PROCSIGNAL_BARRIER_SOMETHING_ELSE where SOMETHING_ELSE is something + * appropriately descriptive. Get rid of this function and instead have + * ProcessBarrierSomethingElse. Most likely, that function should live in + * the file pertaining to that subsystem, rather than here. + * + * The return value should be 'true' if the barrier was successfully + * absorbed and 'false' if not. Note that returning 'false' can lead to + * very frequent retries, so try hard to make that an uncommon case. + */ + return true; +} + +/* + * CheckProcSignal - check to see if a particular reason has been + * signaled, and clear the signal flag. 
Should be called after receiving + * SIGUSR1. + */ +static bool +CheckProcSignal(ProcSignalReason reason) +{ + volatile ProcSignalSlot *slot = MyProcSignalSlot; + + if (slot != NULL) + { + /* Careful here --- don't clear flag if we haven't seen it set */ + if (slot->pss_signalFlags[reason]) + { + slot->pss_signalFlags[reason] = false; + return true; + } + } + + return false; +} + +/* + * procsignal_sigusr1_handler - handle SIGUSR1 signal. + */ +void +procsignal_sigusr1_handler(SIGNAL_ARGS) +{ + int save_errno = errno; + + if (CheckProcSignal(PROCSIG_CATCHUP_INTERRUPT)) + HandleCatchupInterrupt(); + + if (CheckProcSignal(PROCSIG_NOTIFY_INTERRUPT)) + HandleNotifyInterrupt(); + + if (CheckProcSignal(PROCSIG_PARALLEL_MESSAGE)) + HandleParallelMessageInterrupt(); + + if (CheckProcSignal(PROCSIG_WALSND_INIT_STOPPING)) + HandleWalSndInitStopping(); + + if (CheckProcSignal(PROCSIG_BARRIER)) + HandleProcSignalBarrierInterrupt(); + + if (CheckProcSignal(PROCSIG_LOG_MEMORY_CONTEXT)) + HandleLogMemoryContextInterrupt(); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_DATABASE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_DATABASE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_TABLESPACE)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_TABLESPACE); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_LOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_LOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_SNAPSHOT); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + if (CheckProcSignal(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN)) + RecoveryConflictInterrupt(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN); + + SetLatch(MyLatch); + + errno = save_errno; +} diff --git a/src/backend/storage/ipc/shm_mq.c b/src/backend/storage/ipc/shm_mq.c new file mode 100644 index 0000000..3240af4 --- /dev/null +++ b/src/backend/storage/ipc/shm_mq.c @@ -0,0 +1,1288 @@ +/*------------------------------------------------------------------------- + * + * shm_mq.c + * single-reader, single-writer shared memory message queue + * + * Both the sender and the receiver must have a PGPROC; their respective + * process latches are used for synchronization. Only the sender may send, + * and only the receiver may receive. This is intended to allow a user + * backend to communicate with worker backends that it has registered. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_mq.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" +#include "storage/procsignal.h" +#include "storage/shm_mq.h" +#include "storage/spin.h" +#include "utils/memutils.h" + +/* + * This structure represents the actual queue, stored in shared memory. + * + * Some notes on synchronization: + * + * mq_receiver and mq_bytes_read can only be changed by the receiver; and + * mq_sender and mq_bytes_written can only be changed by the sender. + * mq_receiver and mq_sender are protected by mq_mutex, although, importantly, + * they cannot change once set, and thus may be read without a lock once this + * is known to be the case. + * + * mq_bytes_read and mq_bytes_written are not protected by the mutex. 
Instead, + * they are written atomically using 8 byte loads and stores. Memory barriers + * must be carefully used to synchronize reads and writes of these values with + * reads and writes of the actual data in mq_ring. + * + * mq_detached needs no locking. It can be set by either the sender or the + * receiver, but only ever from false to true, so redundant writes don't + * matter. It is important that if we set mq_detached and then set the + * counterparty's latch, the counterparty must be certain to see the change + * after waking up. Since SetLatch begins with a memory barrier and ResetLatch + * ends with one, this should be OK. + * + * mq_ring_size and mq_ring_offset never change after initialization, and + * can therefore be read without the lock. + * + * Importantly, mq_ring can be safely read and written without a lock. + * At any given time, the difference between mq_bytes_read and + * mq_bytes_written defines the number of bytes within mq_ring that contain + * unread data, and mq_bytes_read defines the position where those bytes + * begin. The sender can increase the number of unread bytes at any time, + * but only the receiver can give license to overwrite those bytes, by + * incrementing mq_bytes_read. Therefore, it's safe for the receiver to read + * the unread bytes it knows to be present without the lock. Conversely, + * the sender can write to the unused portion of the ring buffer without + * the lock, because nobody else can be reading or writing those bytes. The + * receiver could be making more bytes unused by incrementing mq_bytes_read, + * but that's OK. Note that it would be unsafe for the receiver to read any + * data it's already marked as read, or to write any data; and it would be + * unsafe for the sender to reread any data after incrementing + * mq_bytes_written, but fortunately there's no need for any of that. + */ +struct shm_mq +{ + slock_t mq_mutex; + PGPROC *mq_receiver; + PGPROC *mq_sender; + pg_atomic_uint64 mq_bytes_read; + pg_atomic_uint64 mq_bytes_written; + Size mq_ring_size; + bool mq_detached; + uint8 mq_ring_offset; + char mq_ring[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * This structure is a backend-private handle for access to a queue. + * + * mqh_queue is a pointer to the queue we've attached, and mqh_segment is + * an optional pointer to the dynamic shared memory segment that contains it. + * (If mqh_segment is provided, we register an on_dsm_detach callback to + * make sure we detach from the queue before detaching from DSM.) + * + * If this queue is intended to connect the current process with a background + * worker that started it, the user can pass a pointer to the worker handle + * to shm_mq_attach(), and we'll store it in mqh_handle. The point of this + * is to allow us to begin sending to or receiving from that queue before the + * process we'll be communicating with has even been started. If it fails + * to start, the handle will allow us to notice that and fail cleanly, rather + * than waiting forever; see shm_mq_wait_internal. This is mostly useful in + * simple cases - e.g. where there are just 2 processes communicating; in + * more complex scenarios, every process may not have a BackgroundWorkerHandle + * available, or may need to watch for the failure of more than one other + * process at a time. + * + * When a message exists as a contiguous chunk of bytes in the queue - that is, + * it is smaller than the size of the ring buffer and does not wrap around + * the end - we return the message to the caller as a pointer into the buffer. 
+ * For messages that are larger or happen to wrap, we reassemble the message + * locally by copying the chunks into a backend-local buffer. mqh_buffer is + * the buffer, and mqh_buflen is the number of bytes allocated for it. + * + * mqh_partial_bytes, mqh_expected_bytes, and mqh_length_word_complete + * are used to track the state of non-blocking operations. When the caller + * attempts a non-blocking operation that returns SHM_MQ_WOULD_BLOCK, they + * are expected to retry the call at a later time with the same argument; + * we need to retain enough state to pick up where we left off. + * mqh_length_word_complete tracks whether we are done sending or receiving + * (whichever we're doing) the entire length word. mqh_partial_bytes tracks + * the number of bytes read or written for either the length word or the + * message itself, and mqh_expected_bytes - which is used only for reads - + * tracks the expected total size of the payload. + * + * mqh_counterparty_attached tracks whether we know the counterparty to have + * attached to the queue at some previous point. This lets us avoid some + * mutex acquisitions. + * + * mqh_context is the memory context in effect at the time we attached to + * the shm_mq. The shm_mq_handle itself is allocated in this context, and + * we make sure any other allocations we do happen in this context as well, + * to avoid nasty surprises. + */ +struct shm_mq_handle +{ + shm_mq *mqh_queue; + dsm_segment *mqh_segment; + BackgroundWorkerHandle *mqh_handle; + char *mqh_buffer; + Size mqh_buflen; + Size mqh_consume_pending; + Size mqh_partial_bytes; + Size mqh_expected_bytes; + bool mqh_length_word_complete; + bool mqh_counterparty_attached; + MemoryContext mqh_context; +}; + +static void shm_mq_detach_internal(shm_mq *mq); +static shm_mq_result shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, + const void *data, bool nowait, Size *bytes_written); +static shm_mq_result shm_mq_receive_bytes(shm_mq_handle *mqh, + Size bytes_needed, bool nowait, Size *nbytesp, + void **datap); +static bool shm_mq_counterparty_gone(shm_mq *mq, + BackgroundWorkerHandle *handle); +static bool shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, + BackgroundWorkerHandle *handle); +static void shm_mq_inc_bytes_read(shm_mq *mq, Size n); +static void shm_mq_inc_bytes_written(shm_mq *mq, Size n); +static void shm_mq_detach_callback(dsm_segment *seg, Datum arg); + +/* Minimum queue size is enough for header and at least one chunk of data. */ +const Size shm_mq_minimum_size = +MAXALIGN(offsetof(shm_mq, mq_ring)) + MAXIMUM_ALIGNOF; + +#define MQH_INITIAL_BUFSIZE 8192 + +/* + * Initialize a new shared message queue. + */ +shm_mq * +shm_mq_create(void *address, Size size) +{ + shm_mq *mq = address; + Size data_offset = MAXALIGN(offsetof(shm_mq, mq_ring)); + + /* If the size isn't MAXALIGN'd, just discard the odd bytes. */ + size = MAXALIGN_DOWN(size); + + /* Queue size must be large enough to hold some data. */ + Assert(size > data_offset); + + /* Initialize queue header. */ + SpinLockInit(&mq->mq_mutex); + mq->mq_receiver = NULL; + mq->mq_sender = NULL; + pg_atomic_init_u64(&mq->mq_bytes_read, 0); + pg_atomic_init_u64(&mq->mq_bytes_written, 0); + mq->mq_ring_size = size - data_offset; + mq->mq_detached = false; + mq->mq_ring_offset = data_offset - offsetof(shm_mq, mq_ring); + + return mq; +} + +/* + * Set the identity of the process that will receive from a shared message + * queue. 
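+ *
+ * (As a purely illustrative sketch, a leader that creates the queue in a
+ * DSM segment and wants to be the reader might do roughly:
+ *
+ *		mq = shm_mq_create(addr, size);
+ *		shm_mq_set_receiver(mq, MyProc);
+ *		mqh = shm_mq_attach(mq, seg, handle);
+ *
+ * while the worker on the other end later calls shm_mq_set_sender() and
+ * shm_mq_attach() on the same queue; addr, size, seg and handle stand in
+ * for whatever the caller actually has.)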
+ */ +void +shm_mq_set_receiver(shm_mq *mq, PGPROC *proc) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_receiver == NULL); + mq->mq_receiver = proc; + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + if (sender != NULL) + SetLatch(&sender->procLatch); +} + +/* + * Set the identity of the process that will send to a shared message queue. + */ +void +shm_mq_set_sender(shm_mq *mq, PGPROC *proc) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + Assert(mq->mq_sender == NULL); + mq->mq_sender = proc; + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + if (receiver != NULL) + SetLatch(&receiver->procLatch); +} + +/* + * Get the configured receiver. + */ +PGPROC * +shm_mq_get_receiver(shm_mq *mq) +{ + PGPROC *receiver; + + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + + return receiver; +} + +/* + * Get the configured sender. + */ +PGPROC * +shm_mq_get_sender(shm_mq *mq) +{ + PGPROC *sender; + + SpinLockAcquire(&mq->mq_mutex); + sender = mq->mq_sender; + SpinLockRelease(&mq->mq_mutex); + + return sender; +} + +/* + * Attach to a shared message queue so we can send or receive messages. + * + * The memory context in effect at the time this function is called should + * be one which will last for at least as long as the message queue itself. + * We'll allocate the handle in that context, and future allocations that + * are needed to buffer incoming data will happen in that context as well. + * + * If seg != NULL, the queue will be automatically detached when that dynamic + * shared memory segment is detached. + * + * If handle != NULL, the queue can be read or written even before the + * other process has attached. We'll wait for it to do so if needed. The + * handle must be for a background worker initialized with bgw_notify_pid + * equal to our PID. + * + * shm_mq_detach() should be called when done. This will free the + * shm_mq_handle and mark the queue itself as detached, so that our + * counterpart won't get stuck waiting for us to fill or drain the queue + * after we've already lost interest. + */ +shm_mq_handle * +shm_mq_attach(shm_mq *mq, dsm_segment *seg, BackgroundWorkerHandle *handle) +{ + shm_mq_handle *mqh = palloc(sizeof(shm_mq_handle)); + + Assert(mq->mq_receiver == MyProc || mq->mq_sender == MyProc); + mqh->mqh_queue = mq; + mqh->mqh_segment = seg; + mqh->mqh_handle = handle; + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + mqh->mqh_consume_pending = 0; + mqh->mqh_partial_bytes = 0; + mqh->mqh_expected_bytes = 0; + mqh->mqh_length_word_complete = false; + mqh->mqh_counterparty_attached = false; + mqh->mqh_context = CurrentMemoryContext; + + if (seg != NULL) + on_dsm_detach(seg, shm_mq_detach_callback, PointerGetDatum(mq)); + + return mqh; +} + +/* + * Associate a BackgroundWorkerHandle with a shm_mq_handle just as if it had + * been passed to shm_mq_attach. + */ +void +shm_mq_set_handle(shm_mq_handle *mqh, BackgroundWorkerHandle *handle) +{ + Assert(mqh->mqh_handle == NULL); + mqh->mqh_handle = handle; +} + +/* + * Write a message into a shared message queue. + */ +shm_mq_result +shm_mq_send(shm_mq_handle *mqh, Size nbytes, const void *data, bool nowait) +{ + shm_mq_iovec iov; + + iov.data = data; + iov.len = nbytes; + + return shm_mq_sendv(mqh, &iov, 1, nowait); +} + +/* + * Write a message into a shared message queue, gathered from multiple + * addresses. 
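+ *
+ * (Illustrative sketch only: a caller whose message header and payload
+ * live at different addresses can describe both pieces in an iovec array
+ * and hand them over in a single call:
+ *
+ *		shm_mq_iovec	iov[2];
+ *
+ *		iov[0].data = (const char *) &hdr;
+ *		iov[0].len = sizeof(hdr);
+ *		iov[1].data = payload;
+ *		iov[1].len = payload_len;
+ *		res = shm_mq_sendv(mqh, iov, 2, false);
+ *
+ * hdr, payload and payload_len are placeholders for whatever the caller
+ * actually has.)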
+ * + * When nowait = false, we'll wait on our process latch when the ring buffer + * fills up, and then continue writing once the receiver has drained some data. + * The process latch is reset after each wait. + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, if the buffer becomes full, we return SHM_MQ_WOULD_BLOCK. In + * this case, the caller should call this function again, with the same + * arguments, each time the process latch is set. (Once begun, the sending + * of a message cannot be aborted except by detaching from the queue; changing + * the length or payload will corrupt the queue.) + */ +shm_mq_result +shm_mq_sendv(shm_mq_handle *mqh, shm_mq_iovec *iov, int iovcnt, bool nowait) +{ + shm_mq_result res; + shm_mq *mq = mqh->mqh_queue; + PGPROC *receiver; + Size nbytes = 0; + Size bytes_written; + int i; + int which_iov = 0; + Size offset; + + Assert(mq->mq_sender == MyProc); + + /* Compute total size of write. */ + for (i = 0; i < iovcnt; ++i) + nbytes += iov[i].len; + + /* Prevent writing messages overwhelming the receiver. */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("cannot send a message of size %zu via shared memory queue", + nbytes))); + + /* Try to write, or finish writing, the length word into the buffer. */ + while (!mqh->mqh_length_word_complete) + { + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_send_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + ((char *) &nbytes) + mqh->mqh_partial_bytes, + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + mqh->mqh_partial_bytes += bytes_written; + + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = true; + } + + if (res != SHM_MQ_SUCCESS) + return res; + + /* Length word can't be split unless bigger than required alignment. */ + Assert(mqh->mqh_length_word_complete || sizeof(Size) > MAXIMUM_ALIGNOF); + } + + /* Write the actual data bytes into the buffer. */ + Assert(mqh->mqh_partial_bytes <= nbytes); + offset = mqh->mqh_partial_bytes; + do + { + Size chunksize; + + /* Figure out which bytes need to be sent next. */ + if (offset >= iov[which_iov].len) + { + offset -= iov[which_iov].len; + ++which_iov; + if (which_iov >= iovcnt) + break; + continue; + } + + /* + * We want to avoid copying the data if at all possible, but every + * chunk of bytes we write into the queue has to be MAXALIGN'd, except + * the last. Thus, if a chunk other than the last one ends on a + * non-MAXALIGN'd boundary, we have to combine the tail end of its + * data with data from one or more following chunks until we either + * reach the last chunk or accumulate a number of bytes which is + * MAXALIGN'd. + */ + if (which_iov + 1 < iovcnt && + offset + MAXIMUM_ALIGNOF > iov[which_iov].len) + { + char tmpbuf[MAXIMUM_ALIGNOF]; + int j = 0; + + for (;;) + { + if (offset < iov[which_iov].len) + { + tmpbuf[j] = iov[which_iov].data[offset]; + j++; + offset++; + if (j == MAXIMUM_ALIGNOF) + break; + } + else + { + offset -= iov[which_iov].len; + which_iov++; + if (which_iov >= iovcnt) + break; + } + } + + res = shm_mq_send_bytes(mqh, j, tmpbuf, nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. 
*/ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + continue; + } + + /* + * If this is the last chunk, we can write all the data, even if it + * isn't a multiple of MAXIMUM_ALIGNOF. Otherwise, we need to + * MAXALIGN_DOWN the write size. + */ + chunksize = iov[which_iov].len - offset; + if (which_iov + 1 < iovcnt) + chunksize = MAXALIGN_DOWN(chunksize); + res = shm_mq_send_bytes(mqh, chunksize, &iov[which_iov].data[offset], + nowait, &bytes_written); + + if (res == SHM_MQ_DETACHED) + { + /* Reset state in case caller tries to send another message. */ + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return res; + } + + mqh->mqh_partial_bytes += bytes_written; + offset += bytes_written; + if (res != SHM_MQ_SUCCESS) + return res; + } while (mqh->mqh_partial_bytes < nbytes); + + /* Reset for next message. */ + mqh->mqh_partial_bytes = 0; + mqh->mqh_length_word_complete = false; + + /* If queue has been detached, let caller know. */ + if (mq->mq_detached) + return SHM_MQ_DETACHED; + + /* + * If the counterparty is known to have attached, we can read mq_receiver + * without acquiring the spinlock and assume it isn't NULL. Otherwise, + * more caution is needed. + */ + if (mqh->mqh_counterparty_attached) + receiver = mq->mq_receiver; + else + { + SpinLockAcquire(&mq->mq_mutex); + receiver = mq->mq_receiver; + SpinLockRelease(&mq->mq_mutex); + if (receiver == NULL) + return SHM_MQ_SUCCESS; + mqh->mqh_counterparty_attached = true; + } + + /* Notify receiver of the newly-written data, and return. */ + SetLatch(&receiver->procLatch); + return SHM_MQ_SUCCESS; +} + +/* + * Receive a message from a shared message queue. + * + * We set *nbytes to the message length and *data to point to the message + * payload. If the entire message exists in the queue as a single, + * contiguous chunk, *data will point directly into shared memory; otherwise, + * it will point to a temporary buffer. This mostly avoids data copying in + * the hoped-for case where messages are short compared to the buffer size, + * while still allowing longer messages. In either case, the return value + * remains valid until the next receive operation is performed on the queue. + * + * When nowait = false, we'll wait on our process latch when the ring buffer + * is empty and we have not yet received a full message. The sender will + * set our process latch after more data has been written, and we'll resume + * processing. Each call will therefore return a complete message + * (unless the sender detaches the queue). + * + * When nowait = true, we do not manipulate the state of the process latch; + * instead, whenever the buffer is empty and we need to read from it, we + * return SHM_MQ_WOULD_BLOCK. In this case, the caller should call this + * function again after the process latch has been set. + */ +shm_mq_result +shm_mq_receive(shm_mq_handle *mqh, Size *nbytesp, void **datap, bool nowait) +{ + shm_mq *mq = mqh->mqh_queue; + shm_mq_result res; + Size rb = 0; + Size nbytes; + void *rawdata; + + Assert(mq->mq_receiver == MyProc); + + /* We can't receive data until the sender has attached. */ + if (!mqh->mqh_counterparty_attached) + { + if (nowait) + { + int counterparty_gone; + + /* + * We shouldn't return at this point at all unless the sender + * hasn't attached yet. However, the correct return value depends + * on whether the sender is still attached. 
If we first test + * whether the sender has ever attached and then test whether the + * sender has detached, there's a race condition: a sender that + * attaches and detaches very quickly might fool us into thinking + * the sender never attached at all. So, test whether our + * counterparty is definitively gone first, and only afterwards + * check whether the sender ever attached in the first place. + */ + counterparty_gone = shm_mq_counterparty_gone(mq, mqh->mqh_handle); + if (shm_mq_get_sender(mq) == NULL) + { + if (counterparty_gone) + return SHM_MQ_DETACHED; + else + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_sender, mqh->mqh_handle) + && shm_mq_get_sender(mq) == NULL) + { + mq->mq_detached = true; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + } + + /* + * If we've consumed an amount of data greater than 1/4th of the ring + * size, mark it consumed in shared memory. We try to avoid doing this + * unnecessarily when only a small amount of data has been consumed, + * because SetLatch() is fairly expensive and we don't want to do it too + * often. + */ + if (mqh->mqh_consume_pending > mq->mq_ring_size / 4) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Try to read, or finish reading, the length word from the buffer. */ + while (!mqh->mqh_length_word_complete) + { + /* Try to receive the message length word. */ + Assert(mqh->mqh_partial_bytes < sizeof(Size)); + res = shm_mq_receive_bytes(mqh, sizeof(Size) - mqh->mqh_partial_bytes, + nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + + /* + * Hopefully, we'll receive the entire message length word at once. + * But if sizeof(Size) > MAXIMUM_ALIGNOF, then it might be split over + * multiple reads. + */ + if (mqh->mqh_partial_bytes == 0 && rb >= sizeof(Size)) + { + Size needed; + + nbytes = *(Size *) rawdata; + + /* If we've already got the whole message, we're done. */ + needed = MAXALIGN(sizeof(Size)) + MAXALIGN(nbytes); + if (rb >= needed) + { + mqh->mqh_consume_pending += needed; + *nbytesp = nbytes; + *datap = ((char *) rawdata) + MAXALIGN(sizeof(Size)); + return SHM_MQ_SUCCESS; + } + + /* + * We don't have the whole message, but we at least have the whole + * length word. + */ + mqh->mqh_expected_bytes = nbytes; + mqh->mqh_length_word_complete = true; + mqh->mqh_consume_pending += MAXALIGN(sizeof(Size)); + rb -= MAXALIGN(sizeof(Size)); + } + else + { + Size lengthbytes; + + /* Can't be split unless bigger than required alignment. */ + Assert(sizeof(Size) > MAXIMUM_ALIGNOF); + + /* Message word is split; need buffer to reassemble. */ + if (mqh->mqh_buffer == NULL) + { + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, + MQH_INITIAL_BUFSIZE); + mqh->mqh_buflen = MQH_INITIAL_BUFSIZE; + } + Assert(mqh->mqh_buflen >= sizeof(Size)); + + /* Copy partial length word; remember to consume it. */ + if (mqh->mqh_partial_bytes + rb > sizeof(Size)) + lengthbytes = sizeof(Size) - mqh->mqh_partial_bytes; + else + lengthbytes = rb; + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, + lengthbytes); + mqh->mqh_partial_bytes += lengthbytes; + mqh->mqh_consume_pending += MAXALIGN(lengthbytes); + rb -= lengthbytes; + + /* If we now have the whole word, we're ready to read payload. 
*/ + if (mqh->mqh_partial_bytes >= sizeof(Size)) + { + Assert(mqh->mqh_partial_bytes == sizeof(Size)); + mqh->mqh_expected_bytes = *(Size *) mqh->mqh_buffer; + mqh->mqh_length_word_complete = true; + mqh->mqh_partial_bytes = 0; + } + } + } + nbytes = mqh->mqh_expected_bytes; + + /* + * Should be disallowed on the sending side already, but better check and + * error out on the receiver side as well rather than trying to read a + * prohibitively large message. + */ + if (nbytes > MaxAllocSize) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("invalid message size %zu in shared memory queue", + nbytes))); + + if (mqh->mqh_partial_bytes == 0) + { + /* + * Try to obtain the whole message in a single chunk. If this works, + * we need not copy the data and can return a pointer directly into + * shared memory. + */ + res = shm_mq_receive_bytes(mqh, nbytes, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb >= nbytes) + { + mqh->mqh_length_word_complete = false; + mqh->mqh_consume_pending += MAXALIGN(nbytes); + *nbytesp = nbytes; + *datap = rawdata; + return SHM_MQ_SUCCESS; + } + + /* + * The message has wrapped the buffer. We'll need to copy it in order + * to return it to the client in one chunk. First, make sure we have + * a large enough buffer available. + */ + if (mqh->mqh_buflen < nbytes) + { + Size newbuflen = Max(mqh->mqh_buflen, MQH_INITIAL_BUFSIZE); + + /* + * Double the buffer size until the payload fits, but limit to + * MaxAllocSize. + */ + while (newbuflen < nbytes) + newbuflen *= 2; + newbuflen = Min(newbuflen, MaxAllocSize); + + if (mqh->mqh_buffer != NULL) + { + pfree(mqh->mqh_buffer); + mqh->mqh_buffer = NULL; + mqh->mqh_buflen = 0; + } + mqh->mqh_buffer = MemoryContextAlloc(mqh->mqh_context, newbuflen); + mqh->mqh_buflen = newbuflen; + } + } + + /* Loop until we've copied the entire message. */ + for (;;) + { + Size still_needed; + + /* Copy as much as we can. */ + Assert(mqh->mqh_partial_bytes + rb <= nbytes); + if (rb > 0) + { + memcpy(&mqh->mqh_buffer[mqh->mqh_partial_bytes], rawdata, rb); + mqh->mqh_partial_bytes += rb; + } + + /* + * Update count of bytes that can be consumed, accounting for + * alignment padding. Note that this will never actually insert any + * padding except at the end of a message, because the buffer size is + * a multiple of MAXIMUM_ALIGNOF, and each read and write is as well. + */ + Assert(mqh->mqh_partial_bytes == nbytes || rb == MAXALIGN(rb)); + mqh->mqh_consume_pending += MAXALIGN(rb); + + /* If we got all the data, exit the loop. */ + if (mqh->mqh_partial_bytes >= nbytes) + break; + + /* Wait for some more data. */ + still_needed = nbytes - mqh->mqh_partial_bytes; + res = shm_mq_receive_bytes(mqh, still_needed, nowait, &rb, &rawdata); + if (res != SHM_MQ_SUCCESS) + return res; + if (rb > still_needed) + rb = still_needed; + } + + /* Return the complete message, and reset for next message. */ + *nbytesp = nbytes; + *datap = mqh->mqh_buffer; + mqh->mqh_length_word_complete = false; + mqh->mqh_partial_bytes = 0; + return SHM_MQ_SUCCESS; +} + +/* + * Wait for the other process that's supposed to use this queue to attach + * to it. + * + * The return value is SHM_MQ_DETACHED if the worker has already detached or + * if it dies; it is SHM_MQ_SUCCESS if we detect that the worker has attached. + * Note that we will only be able to detect that the worker has died before + * attaching if a background worker handle was passed to shm_mq_attach(). 
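+ *
+ * (So, purely as an illustration, a leader that registered the worker
+ * itself and passed its BackgroundWorkerHandle can simply do
+ *
+ *		if (shm_mq_wait_for_attach(mqh) != SHM_MQ_SUCCESS)
+ *			... give up; the worker died or was never started ...
+ *
+ * instead of guessing how long to wait.)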
+ */ +shm_mq_result +shm_mq_wait_for_attach(shm_mq_handle *mqh) +{ + shm_mq *mq = mqh->mqh_queue; + PGPROC **victim; + + if (shm_mq_get_receiver(mq) == MyProc) + victim = &mq->mq_sender; + else + { + Assert(shm_mq_get_sender(mq) == MyProc); + victim = &mq->mq_receiver; + } + + if (shm_mq_wait_internal(mq, victim, mqh->mqh_handle)) + return SHM_MQ_SUCCESS; + else + return SHM_MQ_DETACHED; +} + +/* + * Detach from a shared message queue, and destroy the shm_mq_handle. + */ +void +shm_mq_detach(shm_mq_handle *mqh) +{ + /* Notify counterparty that we're outta here. */ + shm_mq_detach_internal(mqh->mqh_queue); + + /* Cancel on_dsm_detach callback, if any. */ + if (mqh->mqh_segment) + cancel_on_dsm_detach(mqh->mqh_segment, + shm_mq_detach_callback, + PointerGetDatum(mqh->mqh_queue)); + + /* Release local memory associated with handle. */ + if (mqh->mqh_buffer != NULL) + pfree(mqh->mqh_buffer); + pfree(mqh); +} + +/* + * Notify counterparty that we're detaching from shared message queue. + * + * The purpose of this function is to make sure that the process + * with which we're communicating doesn't block forever waiting for us to + * fill or drain the queue once we've lost interest. When the sender + * detaches, the receiver can read any messages remaining in the queue; + * further reads will return SHM_MQ_DETACHED. If the receiver detaches, + * further attempts to send messages will likewise return SHM_MQ_DETACHED. + * + * This is separated out from shm_mq_detach() because if the on_dsm_detach + * callback fires, we only want to do this much. We do not try to touch + * the local shm_mq_handle, as it may have been pfree'd already. + */ +static void +shm_mq_detach_internal(shm_mq *mq) +{ + PGPROC *victim; + + SpinLockAcquire(&mq->mq_mutex); + if (mq->mq_sender == MyProc) + victim = mq->mq_receiver; + else + { + Assert(mq->mq_receiver == MyProc); + victim = mq->mq_sender; + } + mq->mq_detached = true; + SpinLockRelease(&mq->mq_mutex); + + if (victim != NULL) + SetLatch(&victim->procLatch); +} + +/* + * Get the shm_mq from handle. + */ +shm_mq * +shm_mq_get_queue(shm_mq_handle *mqh) +{ + return mqh->mqh_queue; +} + +/* + * Write bytes into a shared message queue. + */ +static shm_mq_result +shm_mq_send_bytes(shm_mq_handle *mqh, Size nbytes, const void *data, + bool nowait, Size *bytes_written) +{ + shm_mq *mq = mqh->mqh_queue; + Size sent = 0; + uint64 used; + Size ringsize = mq->mq_ring_size; + Size available; + + while (sent < nbytes) + { + uint64 rb; + uint64 wb; + + /* Compute number of ring buffer bytes used and available. */ + rb = pg_atomic_read_u64(&mq->mq_bytes_read); + wb = pg_atomic_read_u64(&mq->mq_bytes_written); + Assert(wb >= rb); + used = wb - rb; + Assert(used <= ringsize); + available = Min(ringsize - used, nbytes - sent); + + /* + * Bail out if the queue has been detached. Note that we would be in + * trouble if the compiler decided to cache the value of + * mq->mq_detached in a register or on the stack across loop + * iterations. It probably shouldn't do that anyway since we'll + * always return, call an external function that performs a system + * call, or reach a memory barrier at some point later in the loop, + * but just to be sure, insert a compiler barrier here. + */ + pg_compiler_barrier(); + if (mq->mq_detached) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + + if (available == 0 && !mqh->mqh_counterparty_attached) + { + /* + * The queue is full, so if the receiver isn't yet known to be + * attached, we must wait for that to happen. 
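+ * In nowait mode that just means checking: if the counterparty is known
+ * to be gone we return SHM_MQ_DETACHED, and if it simply hasn't attached
+ * yet we return SHM_MQ_WOULD_BLOCK. In blocking mode we wait in
+ * shm_mq_wait_internal() until the receiver shows up, failing only if the
+ * worker dies or the queue is detached in the meantime.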
+ */ + if (nowait) + { + if (shm_mq_counterparty_gone(mq, mqh->mqh_handle)) + { + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + if (shm_mq_get_receiver(mq) == NULL) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + } + else if (!shm_mq_wait_internal(mq, &mq->mq_receiver, + mqh->mqh_handle)) + { + mq->mq_detached = true; + *bytes_written = sent; + return SHM_MQ_DETACHED; + } + mqh->mqh_counterparty_attached = true; + + /* + * The receiver may have read some data after attaching, so we + * must not wait without rechecking the queue state. + */ + } + else if (available == 0) + { + /* + * Since mq->mqh_counterparty_attached is known to be true at this + * point, mq_receiver has been set, and it can't change once set. + * Therefore, we can read it without acquiring the spinlock. + */ + Assert(mqh->mqh_counterparty_attached); + SetLatch(&mq->mq_receiver->procLatch); + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + { + *bytes_written = sent; + return SHM_MQ_WOULD_BLOCK; + } + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip + * through the loop. It's worth it to avoid resetting the latch + * at top of loop, because setting an already-set latch is much + * cheaper than setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_SEND); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + else + { + Size offset; + Size sendnow; + + offset = wb % (uint64) ringsize; + sendnow = Min(available, ringsize - offset); + + /* + * Write as much data as we can via a single memcpy(). Make sure + * these writes happen after the read of mq_bytes_read, above. + * This barrier pairs with the one in shm_mq_inc_bytes_read. + * (Since we're separating the read of mq_bytes_read from a + * subsequent write to mq_ring, we need a full barrier here.) + */ + pg_memory_barrier(); + memcpy(&mq->mq_ring[mq->mq_ring_offset + offset], + (char *) data + sent, sendnow); + sent += sendnow; + + /* + * Update count of bytes written, with alignment padding. Note + * that this will never actually insert any padding except at the + * end of a run of bytes, because the buffer size is a multiple of + * MAXIMUM_ALIGNOF, and each read is as well. + */ + Assert(sent == nbytes || sendnow == MAXALIGN(sendnow)); + shm_mq_inc_bytes_written(mq, MAXALIGN(sendnow)); + + /* + * For efficiency, we don't set the reader's latch here. We'll do + * that only when the buffer fills up or after writing an entire + * message. + */ + } + } + + *bytes_written = sent; + return SHM_MQ_SUCCESS; +} + +/* + * Wait until at least *nbytesp bytes are available to be read from the + * shared message queue, or until the buffer wraps around. If the queue is + * detached, returns SHM_MQ_DETACHED. If nowait is specified and a wait + * would be required, returns SHM_MQ_WOULD_BLOCK. Otherwise, *datap is set + * to the location at which data bytes can be read, *nbytesp is set to the + * number of bytes which can be read at that address, and the return value + * is SHM_MQ_SUCCESS. 
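+ *
+ * (A concrete example with made-up numbers: with an 8192-byte ring,
+ * mq_bytes_read = 6144 and mq_bytes_written = 9216, there are 3072 unread
+ * bytes starting at ring offset 6144, but only 2048 of them are contiguous
+ * before the wraparound; *nbytesp would therefore be 2048 and the caller
+ * comes back for the remaining 1024 afterwards.)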
+ */ +static shm_mq_result +shm_mq_receive_bytes(shm_mq_handle *mqh, Size bytes_needed, bool nowait, + Size *nbytesp, void **datap) +{ + shm_mq *mq = mqh->mqh_queue; + Size ringsize = mq->mq_ring_size; + uint64 used; + uint64 written; + + for (;;) + { + Size offset; + uint64 read; + + /* Get bytes written, so we can compute what's available to read. */ + written = pg_atomic_read_u64(&mq->mq_bytes_written); + + /* + * Get bytes read. Include bytes we could consume but have not yet + * consumed. + */ + read = pg_atomic_read_u64(&mq->mq_bytes_read) + + mqh->mqh_consume_pending; + used = written - read; + Assert(used <= ringsize); + offset = read % (uint64) ringsize; + + /* If we have enough data or buffer has wrapped, we're done. */ + if (used >= bytes_needed || offset + used >= ringsize) + { + *nbytesp = Min(used, ringsize - offset); + *datap = &mq->mq_ring[mq->mq_ring_offset + offset]; + + /* + * Separate the read of mq_bytes_written, above, from caller's + * attempt to read the data itself. Pairs with the barrier in + * shm_mq_inc_bytes_written. + */ + pg_read_barrier(); + return SHM_MQ_SUCCESS; + } + + /* + * Fall out before waiting if the queue has been detached. + * + * Note that we don't check for this until *after* considering whether + * the data already available is enough, since the receiver can finish + * receiving a message stored in the buffer even after the sender has + * detached. + */ + if (mq->mq_detached) + { + /* + * If the writer advanced mq_bytes_written and then set + * mq_detached, we might not have read the final value of + * mq_bytes_written above. Insert a read barrier and then check + * again if mq_bytes_written has advanced. + */ + pg_read_barrier(); + if (written != pg_atomic_read_u64(&mq->mq_bytes_written)) + continue; + + return SHM_MQ_DETACHED; + } + + /* + * We didn't get enough data to satisfy the request, so mark any data + * previously-consumed as read to make more buffer space. + */ + if (mqh->mqh_consume_pending > 0) + { + shm_mq_inc_bytes_read(mq, mqh->mqh_consume_pending); + mqh->mqh_consume_pending = 0; + } + + /* Skip manipulation of our latch if nowait = true. */ + if (nowait) + return SHM_MQ_WOULD_BLOCK; + + /* + * Wait for our latch to be set. It might already be set for some + * unrelated reason, but that'll just result in one extra trip through + * the loop. It's worth it to avoid resetting the latch at top of + * loop, because setting an already-set latch is much cheaper than + * setting one that has been reset. + */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_RECEIVE); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } +} + +/* + * Test whether a counterparty who may not even be alive yet is definitely gone. + */ +static bool +shm_mq_counterparty_gone(shm_mq *mq, BackgroundWorkerHandle *handle) +{ + pid_t pid; + + /* If the queue has been detached, counterparty is definitely gone. */ + if (mq->mq_detached) + return true; + + /* If there's a handle, check worker status. */ + if (handle != NULL) + { + BgwHandleStatus status; + + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + /* Mark it detached, just to make it official. */ + mq->mq_detached = true; + return true; + } + } + + /* Counterparty is not definitively gone. 
*/ + return false; +} + +/* + * This is used when a process is waiting for its counterpart to attach to the + * queue. We exit when the other process attaches as expected, or, if + * handle != NULL, when the referenced background process or the postmaster + * dies. Note that if handle == NULL, and the process fails to attach, we'll + * potentially get stuck here forever waiting for a process that may never + * start. We do check for interrupts, though. + * + * ptr is a pointer to the memory address that we're expecting to become + * non-NULL when our counterpart attaches to the queue. + */ +static bool +shm_mq_wait_internal(shm_mq *mq, PGPROC **ptr, BackgroundWorkerHandle *handle) +{ + bool result = false; + + for (;;) + { + BgwHandleStatus status; + pid_t pid; + + /* Acquire the lock just long enough to check the pointer. */ + SpinLockAcquire(&mq->mq_mutex); + result = (*ptr != NULL); + SpinLockRelease(&mq->mq_mutex); + + /* Fail if detached; else succeed if initialized. */ + if (mq->mq_detached) + { + result = false; + break; + } + if (result) + break; + + if (handle != NULL) + { + /* Check for unexpected worker death. */ + status = GetBackgroundWorkerPid(handle, &pid); + if (status != BGWH_STARTED && status != BGWH_NOT_YET_STARTED) + { + result = false; + break; + } + } + + /* Wait to be signaled. */ + (void) WaitLatch(MyLatch, WL_LATCH_SET | WL_EXIT_ON_PM_DEATH, 0, + WAIT_EVENT_MQ_INTERNAL); + + /* Reset the latch so we don't spin. */ + ResetLatch(MyLatch); + + /* An interrupt may have occurred while we were waiting. */ + CHECK_FOR_INTERRUPTS(); + } + + return result; +} + +/* + * Increment the number of bytes read. + */ +static void +shm_mq_inc_bytes_read(shm_mq *mq, Size n) +{ + PGPROC *sender; + + /* + * Separate prior reads of mq_ring from the increment of mq_bytes_read + * which follows. This pairs with the full barrier in + * shm_mq_send_bytes(). We only need a read barrier here because the + * increment of mq_bytes_read is actually a read followed by a dependent + * write. + */ + pg_read_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method should be cheaper. + */ + pg_atomic_write_u64(&mq->mq_bytes_read, + pg_atomic_read_u64(&mq->mq_bytes_read) + n); + + /* + * We shouldn't have any bytes to read without a sender, so we can read + * mq_sender here without a lock. Once it's initialized, it can't change. + */ + sender = mq->mq_sender; + Assert(sender != NULL); + SetLatch(&sender->procLatch); +} + +/* + * Increment the number of bytes written. + */ +static void +shm_mq_inc_bytes_written(shm_mq *mq, Size n) +{ + /* + * Separate prior reads of mq_ring from the write of mq_bytes_written + * which we're about to do. Pairs with the read barrier found in + * shm_mq_receive_bytes. + */ + pg_write_barrier(); + + /* + * There's no need to use pg_atomic_fetch_add_u64 here, because nobody + * else can be changing this value. This method avoids taking the bus + * lock unnecessarily. + */ + pg_atomic_write_u64(&mq->mq_bytes_written, + pg_atomic_read_u64(&mq->mq_bytes_written) + n); +} + +/* Shim for on_dsm_detach callback. 
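+ * If the DSM segment containing the queue is detached before we detached
+ * explicitly, this still notifies the counterparty that we are gone
+ * instead of leaving it waiting forever.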
*/ +static void +shm_mq_detach_callback(dsm_segment *seg, Datum arg) +{ + shm_mq *mq = (shm_mq *) DatumGetPointer(arg); + + shm_mq_detach_internal(mq); +} diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c new file mode 100644 index 0000000..863b98b --- /dev/null +++ b/src/backend/storage/ipc/shm_toc.c @@ -0,0 +1,272 @@ +/*------------------------------------------------------------------------- + * + * shm_toc.c + * shared memory segment table of contents + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/backend/storage/ipc/shm_toc.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "port/atomics.h" +#include "storage/shm_toc.h" +#include "storage/spin.h" + +typedef struct shm_toc_entry +{ + uint64 key; /* Arbitrary identifier */ + Size offset; /* Offset, in bytes, from TOC start */ +} shm_toc_entry; + +struct shm_toc +{ + uint64 toc_magic; /* Magic number identifying this TOC */ + slock_t toc_mutex; /* Spinlock for mutual exclusion */ + Size toc_total_bytes; /* Bytes managed by this TOC */ + Size toc_allocated_bytes; /* Bytes allocated of those managed */ + uint32 toc_nentry; /* Number of entries in TOC */ + shm_toc_entry toc_entry[FLEXIBLE_ARRAY_MEMBER]; +}; + +/* + * Initialize a region of shared memory with a table of contents. + */ +shm_toc * +shm_toc_create(uint64 magic, void *address, Size nbytes) +{ + shm_toc *toc = (shm_toc *) address; + + Assert(nbytes > offsetof(shm_toc, toc_entry)); + toc->toc_magic = magic; + SpinLockInit(&toc->toc_mutex); + + /* + * The alignment code in shm_toc_allocate() assumes that the starting + * value is buffer-aligned. + */ + toc->toc_total_bytes = BUFFERALIGN_DOWN(nbytes); + toc->toc_allocated_bytes = 0; + toc->toc_nentry = 0; + + return toc; +} + +/* + * Attach to an existing table of contents. If the magic number found at + * the target address doesn't match our expectations, return NULL. + */ +shm_toc * +shm_toc_attach(uint64 magic, void *address) +{ + shm_toc *toc = (shm_toc *) address; + + if (toc->toc_magic != magic) + return NULL; + + Assert(toc->toc_total_bytes >= toc->toc_allocated_bytes); + Assert(toc->toc_total_bytes > offsetof(shm_toc, toc_entry)); + + return toc; +} + +/* + * Allocate shared memory from a segment managed by a table of contents. + * + * This is not a full-blown allocator; there's no way to free memory. It's + * just a way of dividing a single physical shared memory segment into logical + * chunks that may be used for different purposes. + * + * We allocate backwards from the end of the segment, so that the TOC entries + * can grow forward from the start of the segment. + */ +void * +shm_toc_allocate(shm_toc *toc, Size nbytes) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + /* + * Make sure request is well-aligned. XXX: MAXALIGN is not enough, + * because atomic ops might need a wider alignment. We don't have a + * proper definition for the minimum to make atomic ops safe, but + * BUFFERALIGN ought to be enough. 
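+ *
+ * (The chunk itself is carved from the unused space at the end of the
+ * segment: with total_bytes = T and allocated_bytes = A on entry, and
+ * nbytes already rounded up as below, the new allocation starts at
+ * ((char *) toc) + T - A - nbytes, exactly as the return statement at the
+ * bottom computes.)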
+ */ + nbytes = BUFFERALIGN(nbytes); + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + nbytes > total_bytes || toc_bytes + nbytes < toc_bytes) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + vtoc->toc_allocated_bytes += nbytes; + + SpinLockRelease(&toc->toc_mutex); + + return ((char *) toc) + (total_bytes - allocated_bytes - nbytes); +} + +/* + * Return the number of bytes that can still be allocated. + */ +Size +shm_toc_freespace(shm_toc *toc) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + + SpinLockAcquire(&toc->toc_mutex); + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + SpinLockRelease(&toc->toc_mutex); + + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry); + Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes); + return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes)); +} + +/* + * Insert a TOC entry. + * + * The idea here is that the process setting up the shared memory segment will + * register the addresses of data structures within the segment using this + * function. Each data structure will be identified using a 64-bit key, which + * is assumed to be a well-known or discoverable integer. Other processes + * accessing the shared memory segment can pass the same key to + * shm_toc_lookup() to discover the addresses of those data structures. + * + * Since the shared memory segment may be mapped at different addresses within + * different backends, we store relative rather than absolute pointers. + * + * This won't scale well to a large number of keys. Hopefully, that isn't + * necessary; if it proves to be, we might need to provide a more sophisticated + * data structure here. But the real idea here is just to give someone mapping + * a dynamic shared memory the ability to find the bare minimum number of + * pointers that they need to bootstrap. If you're storing a lot of stuff in + * the TOC, you're doing it wrong. + */ +void +shm_toc_insert(shm_toc *toc, uint64 key, void *address) +{ + volatile shm_toc *vtoc = toc; + Size total_bytes; + Size allocated_bytes; + Size nentry; + Size toc_bytes; + Size offset; + + /* Relativize pointer. */ + Assert(address > (void *) toc); + offset = ((char *) address) - (char *) toc; + + SpinLockAcquire(&toc->toc_mutex); + + total_bytes = vtoc->toc_total_bytes; + allocated_bytes = vtoc->toc_allocated_bytes; + nentry = vtoc->toc_nentry; + toc_bytes = offsetof(shm_toc, toc_entry) + nentry * sizeof(shm_toc_entry) + + allocated_bytes; + + /* Check for memory exhaustion and overflow. */ + if (toc_bytes + sizeof(shm_toc_entry) > total_bytes || + toc_bytes + sizeof(shm_toc_entry) < toc_bytes || + nentry >= PG_UINT32_MAX) + { + SpinLockRelease(&toc->toc_mutex); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory"))); + } + + Assert(offset < total_bytes); + vtoc->toc_entry[nentry].key = key; + vtoc->toc_entry[nentry].offset = offset; + + /* + * By placing a write barrier after filling in the entry and before + * updating the number of entries, we make it safe to read the TOC + * unlocked. 
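+ *
+ * This is what supports the usual pattern of one process publishing
+ * entries that other processes then read without taking any lock.  A rough
+ * sketch, with MY_MAGIC, KEY_MY_STATE and MyState standing in for
+ * caller-defined names:
+ *
+ * In the process that sets up the segment:
+ *		toc = shm_toc_create(MY_MAGIC, seg_base, seg_size);
+ *		state = shm_toc_allocate(toc, sizeof(MyState));
+ *		shm_toc_insert(toc, KEY_MY_STATE, state);
+ *
+ * In a process attaching to the segment:
+ *		toc = shm_toc_attach(MY_MAGIC, seg_base);
+ *		state = shm_toc_lookup(toc, KEY_MY_STATE, false);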
+ */ + pg_write_barrier(); + + vtoc->toc_nentry++; + + SpinLockRelease(&toc->toc_mutex); +} + +/* + * Look up a TOC entry. + * + * If the key is not found, returns NULL if noError is true, otherwise + * throws elog(ERROR). + * + * Unlike the other functions in this file, this operation acquires no lock; + * it uses only barriers. It probably wouldn't hurt concurrency very much even + * if it did get a lock, but since it's reasonably likely that a group of + * worker processes could each read a series of entries from the same TOC + * right around the same time, there seems to be some value in avoiding it. + */ +void * +shm_toc_lookup(shm_toc *toc, uint64 key, bool noError) +{ + uint32 nentry; + uint32 i; + + /* + * Read the number of entries before we examine any entry. We assume that + * reading a uint32 is atomic. + */ + nentry = toc->toc_nentry; + pg_read_barrier(); + + /* Now search for a matching entry. */ + for (i = 0; i < nentry; ++i) + { + if (toc->toc_entry[i].key == key) + return ((char *) toc) + toc->toc_entry[i].offset; + } + + /* No matching entry was found. */ + if (!noError) + elog(ERROR, "could not find key " UINT64_FORMAT " in shm TOC at %p", + key, toc); + return NULL; +} + +/* + * Estimate how much shared memory will be required to store a TOC and its + * dependent data structures. + */ +Size +shm_toc_estimate(shm_toc_estimator *e) +{ + Size sz; + + sz = offsetof(shm_toc, toc_entry); + sz = add_size(sz, mul_size(e->number_of_keys, sizeof(shm_toc_entry))); + sz = add_size(sz, e->space_for_chunks); + + return BUFFERALIGN(sz); +} diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c new file mode 100644 index 0000000..4425e99 --- /dev/null +++ b/src/backend/storage/ipc/shmem.c @@ -0,0 +1,611 @@ +/*------------------------------------------------------------------------- + * + * shmem.c + * create shared memory and initialize shared memory data structures. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/shmem.c + * + *------------------------------------------------------------------------- + */ +/* + * POSTGRES processes share one or more regions of shared memory. + * The shared memory is created by a postmaster and is inherited + * by each backend via fork() (or, in some ports, via other OS-specific + * methods). The routines in this file are used for allocating and + * binding to shared memory data structures. + * + * NOTES: + * (a) There are three kinds of shared memory data structures + * available to POSTGRES: fixed-size structures, queues and hash + * tables. Fixed-size structures contain things like global variables + * for a module and should never be allocated after the shared memory + * initialization phase. Hash tables have a fixed maximum size, but + * their actual size can vary dynamically. When entries are added + * to the table, more space is allocated. Queues link data structures + * that have been allocated either within fixed-size structures or as hash + * buckets. Each shared data structure has a string name to identify + * it (assigned in the module that declares it). + * + * (b) During initialization, each module looks for its + * shared data structures in a hash table called the "Shmem Index". + * If the data structure is not present, the caller can allocate + * a new one and initialize it. 
If the data structure is present, + * the caller "attaches" to the structure by initializing a pointer + * in the local address space. + * The shmem index has two purposes: first, it gives us + * a simple model of how the world looks when a backend process + * initializes. If something is present in the shmem index, + * it is initialized. If it is not, it is uninitialized. Second, + * the shmem index allows us to allocate shared memory on demand + * instead of trying to preallocate structures and hard-wire the + * sizes and locations in header files. If you are using a lot + * of shared memory in a lot of different places (and changing + * things during development), this is important. + * + * (c) In standard Unix-ish environments, individual backends do not + * need to re-establish their local pointers into shared memory, because + * they inherit correct values of those variables via fork() from the + * postmaster. However, this does not work in the EXEC_BACKEND case. + * In ports using EXEC_BACKEND, new backends have to set up their local + * pointers using the method described in (b) above. + * + * (d) memory allocation model: shared memory can never be + * freed, once allocated. Each hash table has its own free list, + * so hash buckets can be reused when an item is deleted. However, + * if one hash table grows very large and then shrinks, its space + * cannot be redistributed to other tables. We could build a simple + * hash bucket garbage collector if need be. Right now, it seems + * unnecessary. + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "storage/lwlock.h" +#include "storage/pg_shmem.h" +#include "storage/shmem.h" +#include "storage/spin.h" +#include "utils/builtins.h" + +static void *ShmemAllocRaw(Size size, Size *allocated_size); + +/* shared memory global variables */ + +static PGShmemHeader *ShmemSegHdr; /* shared mem segment header */ + +static void *ShmemBase; /* start address of shared memory */ + +static void *ShmemEnd; /* end+1 address of shared memory */ + +slock_t *ShmemLock; /* spinlock for shared memory and LWLock + * allocation */ + +static HTAB *ShmemIndex = NULL; /* primary index hashtable for shmem */ + + +/* + * InitShmemAccess() --- set up basic pointers to shared memory. + * + * Note: the argument should be declared "PGShmemHeader *seghdr", + * but we use void to avoid having to include ipc.h in shmem.h. + */ +void +InitShmemAccess(void *seghdr) +{ + PGShmemHeader *shmhdr = (PGShmemHeader *) seghdr; + + ShmemSegHdr = shmhdr; + ShmemBase = (void *) shmhdr; + ShmemEnd = (char *) ShmemBase + shmhdr->totalsize; +} + +/* + * InitShmemAllocation() --- set up shared-memory space allocation. + * + * This should be called only in the postmaster or a standalone backend. + */ +void +InitShmemAllocation(void) +{ + PGShmemHeader *shmhdr = ShmemSegHdr; + char *aligned; + + Assert(shmhdr != NULL); + + /* + * Initialize the spinlock used by ShmemAlloc. We must use + * ShmemAllocUnlocked, since obviously ShmemAlloc can't be called yet. + */ + ShmemLock = (slock_t *) ShmemAllocUnlocked(sizeof(slock_t)); + + SpinLockInit(ShmemLock); + + /* + * Allocations after this point should go through ShmemAlloc, which + * expects to allocate everything on cache line boundaries. Make sure the + * first allocation begins on a cache line boundary. 
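+ * (CACHELINEALIGN rounds up to PG_CACHE_LINE_SIZE, normally 128 bytes, so a
+ * freeoffset of, say, 200 would become 256 at this point.)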
+ */ + aligned = (char *) + (CACHELINEALIGN((((char *) shmhdr) + shmhdr->freeoffset))); + shmhdr->freeoffset = aligned - (char *) shmhdr; + + /* ShmemIndex can't be set up yet (need LWLocks first) */ + shmhdr->index = NULL; + ShmemIndex = (HTAB *) NULL; + + /* + * Initialize ShmemVariableCache for transaction manager. (This doesn't + * really belong here, but not worth moving.) + */ + ShmemVariableCache = (VariableCache) + ShmemAlloc(sizeof(*ShmemVariableCache)); + memset(ShmemVariableCache, 0, sizeof(*ShmemVariableCache)); +} + +/* + * ShmemAlloc -- allocate max-aligned chunk from shared memory + * + * Throws error if request cannot be satisfied. + * + * Assumes ShmemLock and ShmemSegHdr are initialized. + */ +void * +ShmemAlloc(Size size) +{ + void *newSpace; + Size allocated_size; + + newSpace = ShmemAllocRaw(size, &allocated_size); + if (!newSpace) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + return newSpace; +} + +/* + * ShmemAllocNoError -- allocate max-aligned chunk from shared memory + * + * As ShmemAlloc, but returns NULL if out of space, rather than erroring. + */ +void * +ShmemAllocNoError(Size size) +{ + Size allocated_size; + + return ShmemAllocRaw(size, &allocated_size); +} + +/* + * ShmemAllocRaw -- allocate align chunk and return allocated size + * + * Also sets *allocated_size to the number of bytes allocated, which will + * be equal to the number requested plus any padding we choose to add. + */ +static void * +ShmemAllocRaw(Size size, Size *allocated_size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure all space is adequately aligned. We used to only MAXALIGN this + * space but experience has proved that on modern systems that is not good + * enough. Many parts of the system are very sensitive to critical data + * structures getting split across cache line boundaries. To avoid that, + * attempt to align the beginning of the allocation to a cache line + * boundary. The calling code will still need to be careful about how it + * uses the allocated space - e.g. by padding each element in an array of + * structures out to a power-of-two size - but without this, even that + * won't be sufficient. + */ + size = CACHELINEALIGN(size); + *allocated_size = size; + + Assert(ShmemSegHdr != NULL); + + SpinLockAcquire(ShmemLock); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree <= ShmemSegHdr->totalsize) + { + newSpace = (void *) ((char *) ShmemBase + newStart); + ShmemSegHdr->freeoffset = newFree; + } + else + newSpace = NULL; + + SpinLockRelease(ShmemLock); + + /* note this assert is okay with newSpace == NULL */ + Assert(newSpace == (void *) CACHELINEALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAllocUnlocked -- allocate max-aligned chunk from shared memory + * + * Allocate space without locking ShmemLock. This should be used for, + * and only for, allocations that must happen before ShmemLock is ready. + * + * We consider maxalign, rather than cachealign, sufficient here. + */ +void * +ShmemAllocUnlocked(Size size) +{ + Size newStart; + Size newFree; + void *newSpace; + + /* + * Ensure allocated space is adequately aligned. 
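+ * (MAXALIGN rounds the request up to the platform's maximum basic data
+ * alignment, typically 8 bytes; for instance, the sizeof(slock_t) request
+ * made by InitShmemAllocation above is padded out to that boundary.)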
+ */ + size = MAXALIGN(size); + + Assert(ShmemSegHdr != NULL); + + newStart = ShmemSegHdr->freeoffset; + + newFree = newStart + size; + if (newFree > ShmemSegHdr->totalsize) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of shared memory (%zu bytes requested)", + size))); + ShmemSegHdr->freeoffset = newFree; + + newSpace = (void *) ((char *) ShmemBase + newStart); + + Assert(newSpace == (void *) MAXALIGN(newSpace)); + + return newSpace; +} + +/* + * ShmemAddrIsValid -- test if an address refers to shared memory + * + * Returns true if the pointer points within the shared memory segment. + */ +bool +ShmemAddrIsValid(const void *addr) +{ + return (addr >= ShmemBase) && (addr < ShmemEnd); +} + +/* + * InitShmemIndex() --- set up or attach to shmem index table. + */ +void +InitShmemIndex(void) +{ + HASHCTL info; + + /* + * Create the shared memory shmem index. + * + * Since ShmemInitHash calls ShmemInitStruct, which expects the ShmemIndex + * hashtable to exist already, we have a bit of a circularity problem in + * initializing the ShmemIndex itself. The special "ShmemIndex" hash + * table name will tell ShmemInitStruct to fake it. + */ + info.keysize = SHMEM_INDEX_KEYSIZE; + info.entrysize = sizeof(ShmemIndexEnt); + + ShmemIndex = ShmemInitHash("ShmemIndex", + SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE, + &info, + HASH_ELEM | HASH_STRINGS); +} + +/* + * ShmemInitHash -- Create and initialize, or attach to, a + * shared memory hash table. + * + * We assume caller is doing some kind of synchronization + * so that two processes don't try to create/initialize the same + * table at once. (In practice, all creations are done in the postmaster + * process; child processes should always be attaching to existing tables.) + * + * max_size is the estimated maximum number of hashtable entries. This is + * not a hard limit, but the access efficiency will degrade if it is + * exceeded substantially (since it's used to compute directory size and + * the hash table buckets will get overfull). + * + * init_size is the number of hashtable entries to preallocate. For a table + * whose maximum size is certain, this should be equal to max_size; that + * ensures that no run-time out-of-shared-memory failures can occur. + * + * *infoP and hash_flags must specify at least the entry sizes and key + * comparison semantics (see hash_create()). Flag bits and values specific + * to shared-memory hash tables are added here, except that callers may + * choose to specify HASH_PARTITION and/or HASH_FIXED_SIZE. + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +HTAB * +ShmemInitHash(const char *name, /* table string name for shmem index */ + long init_size, /* initial table size */ + long max_size, /* max size of the table */ + HASHCTL *infoP, /* info about key and bucket size */ + int hash_flags) /* info about infoP */ +{ + bool found; + void *location; + + /* + * Hash tables allocated in shared memory have a fixed directory; it can't + * grow or other backends wouldn't be able to find it. So, make sure we + * make it big enough to start with. + * + * The shared memory allocator must be specified too. 
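+ *
+ * A typical caller therefore only needs to fill in the key and entry sizes
+ * (plus any hash or compare functions) before calling here; roughly, with
+ * MyHashEntry standing in for whatever struct the module stores:
+ *
+ *		HASHCTL		info;
+ *
+ *		info.keysize = sizeof(Oid);
+ *		info.entrysize = sizeof(MyHashEntry);
+ *		htab = ShmemInitHash("My Shared Table", 128, 128,
+ *							 &info, HASH_ELEM | HASH_BLOBS);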
+ */ + infoP->dsize = infoP->max_dsize = hash_select_dirsize(max_size); + infoP->alloc = ShmemAllocNoError; + hash_flags |= HASH_SHARED_MEM | HASH_ALLOC | HASH_DIRSIZE; + + /* look it up in the shmem index */ + location = ShmemInitStruct(name, + hash_get_shared_size(infoP, hash_flags), + &found); + + /* + * if it already exists, attach to it rather than allocate and initialize + * new space + */ + if (found) + hash_flags |= HASH_ATTACH; + + /* Pass location of hashtable header to hash_create */ + infoP->hctl = (HASHHDR *) location; + + return hash_create(name, init_size, infoP, hash_flags); +} + +/* + * ShmemInitStruct -- Create/attach to a structure in shared memory. + * + * This is called during initialization to find or allocate + * a data structure in shared memory. If no other process + * has created the structure, this routine allocates space + * for it. If it exists already, a pointer to the existing + * structure is returned. + * + * Returns: pointer to the object. *foundPtr is set true if the object was + * already in the shmem index (hence, already initialized). + * + * Note: before Postgres 9.0, this function returned NULL for some failure + * cases. Now, it always throws error instead, so callers need not check + * for NULL. + */ +void * +ShmemInitStruct(const char *name, Size size, bool *foundPtr) +{ + ShmemIndexEnt *result; + void *structPtr; + + LWLockAcquire(ShmemIndexLock, LW_EXCLUSIVE); + + if (!ShmemIndex) + { + PGShmemHeader *shmemseghdr = ShmemSegHdr; + + /* Must be trying to create/attach to ShmemIndex itself */ + Assert(strcmp(name, "ShmemIndex") == 0); + + if (IsUnderPostmaster) + { + /* Must be initializing a (non-standalone) backend */ + Assert(shmemseghdr->index != NULL); + structPtr = shmemseghdr->index; + *foundPtr = true; + } + else + { + /* + * If the shmem index doesn't exist, we are bootstrapping: we must + * be trying to init the shmem index itself. + * + * Notice that the ShmemIndexLock is released before the shmem + * index has been initialized. This should be OK because no other + * process can be accessing shared memory yet. + */ + Assert(shmemseghdr->index == NULL); + structPtr = ShmemAlloc(size); + shmemseghdr->index = structPtr; + *foundPtr = false; + } + LWLockRelease(ShmemIndexLock); + return structPtr; + } + + /* look it up in the shmem index */ + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_ENTER_NULL, foundPtr); + + if (!result) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("could not create ShmemIndex entry for data structure \"%s\"", + name))); + } + + if (*foundPtr) + { + /* + * Structure is in the shmem index so someone else has allocated it + * already. The size better be the same as the size we are trying to + * initialize to, or there is a name conflict (or worse). + */ + if (result->size != size) + { + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errmsg("ShmemIndex entry size is wrong for data structure" + " \"%s\": expected %zu, actual %zu", + name, size, result->size))); + } + structPtr = result->location; + } + else + { + Size allocated_size; + + /* It isn't in the table yet. 
allocate and initialize it */ + structPtr = ShmemAllocRaw(size, &allocated_size); + if (structPtr == NULL) + { + /* out of memory; remove the failed ShmemIndex entry */ + hash_search(ShmemIndex, name, HASH_REMOVE, NULL); + LWLockRelease(ShmemIndexLock); + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("not enough shared memory for data structure" + " \"%s\" (%zu bytes requested)", + name, size))); + } + result->size = size; + result->allocated_size = allocated_size; + result->location = structPtr; + } + + LWLockRelease(ShmemIndexLock); + + Assert(ShmemAddrIsValid(structPtr)); + + Assert(structPtr == (void *) CACHELINEALIGN(structPtr)); + + return structPtr; +} + + +/* + * Add two Size values, checking for overflow + */ +Size +add_size(Size s1, Size s2) +{ + Size result; + + result = s1 + s2; + /* We are assuming Size is an unsigned type here... */ + if (result < s1 || result < s2) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* + * Multiply two Size values, checking for overflow + */ +Size +mul_size(Size s1, Size s2) +{ + Size result; + + if (s1 == 0 || s2 == 0) + return 0; + result = s1 * s2; + /* We are assuming Size is an unsigned type here... */ + if (result / s2 != s1) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("requested shared memory size overflows size_t"))); + return result; +} + +/* SQL SRF showing allocated shared memory */ +Datum +pg_get_shmem_allocations(PG_FUNCTION_ARGS) +{ +#define PG_GET_SHMEM_SIZES_COLS 4 + ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; + TupleDesc tupdesc; + Tuplestorestate *tupstore; + MemoryContext per_query_ctx; + MemoryContext oldcontext; + HASH_SEQ_STATUS hstat; + ShmemIndexEnt *ent; + Size named_allocated = 0; + Datum values[PG_GET_SHMEM_SIZES_COLS]; + bool nulls[PG_GET_SHMEM_SIZES_COLS]; + + /* check to see if caller supports us returning a tuplestore */ + if (rsinfo == NULL || !IsA(rsinfo, ReturnSetInfo)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"))); + if (!(rsinfo->allowedModes & SFRM_Materialize)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("materialize mode required, but it is not allowed in this context"))); + + /* Build a tuple descriptor for our result type */ + if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) + elog(ERROR, "return type must be a row type"); + + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + + tupstore = tuplestore_begin_heap(true, false, work_mem); + rsinfo->returnMode = SFRM_Materialize; + rsinfo->setResult = tupstore; + rsinfo->setDesc = tupdesc; + + MemoryContextSwitchTo(oldcontext); + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + hash_seq_init(&hstat, ShmemIndex); + + /* output all allocated entries */ + memset(nulls, 0, sizeof(nulls)); + while ((ent = (ShmemIndexEnt *) hash_seq_search(&hstat)) != NULL) + { + values[0] = CStringGetTextDatum(ent->key); + values[1] = Int64GetDatum((char *) ent->location - (char *) ShmemSegHdr); + values[2] = Int64GetDatum(ent->size); + values[3] = Int64GetDatum(ent->allocated_size); + named_allocated += ent->allocated_size; + + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + } + + /* output shared memory allocated but not counted via the shmem index */ + values[0] = CStringGetTextDatum("<anonymous>"); + nulls[1] = true; + 
values[2] = Int64GetDatum(ShmemSegHdr->freeoffset - named_allocated); + values[3] = values[2]; + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + /* output as-of-yet unused shared memory */ + nulls[0] = true; + values[1] = Int64GetDatum(ShmemSegHdr->freeoffset); + nulls[1] = false; + values[2] = Int64GetDatum(ShmemSegHdr->totalsize - ShmemSegHdr->freeoffset); + values[3] = values[2]; + tuplestore_putvalues(tupstore, tupdesc, values, nulls); + + LWLockRelease(ShmemIndexLock); + + tuplestore_donestoring(tupstore); + + return (Datum) 0; +} diff --git a/src/backend/storage/ipc/shmqueue.c b/src/backend/storage/ipc/shmqueue.c new file mode 100644 index 0000000..dc3238c --- /dev/null +++ b/src/backend/storage/ipc/shmqueue.c @@ -0,0 +1,190 @@ +/*------------------------------------------------------------------------- + * + * shmqueue.c + * shared memory linked lists + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/shmqueue.c + * + * NOTES + * + * Package for managing doubly-linked lists in shared memory. + * The only tricky thing is that SHM_QUEUE will usually be a field + * in a larger record. SHMQueueNext has to return a pointer + * to the record itself instead of a pointer to the SHMQueue field + * of the record. It takes an extra parameter and does some extra + * pointer arithmetic to do this correctly. + * + * NOTE: These are set up so they can be turned into macros some day. + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "storage/shmem.h" + + +/* + * ShmemQueueInit -- make the head of a new queue point + * to itself + */ +void +SHMQueueInit(SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + queue->prev = queue->next = queue; +} + +/* + * SHMQueueIsDetached -- true if element is not currently + * in a queue. + */ +bool +SHMQueueIsDetached(const SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + return (queue->prev == NULL); +} + +/* + * SHMQueueElemInit -- clear an element's links + */ +void +SHMQueueElemInit(SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + queue->prev = queue->next = NULL; +} + +/* + * SHMQueueDelete -- remove an element from the queue and + * close the links + */ +void +SHMQueueDelete(SHM_QUEUE *queue) +{ + SHM_QUEUE *nextElem = queue->next; + SHM_QUEUE *prevElem = queue->prev; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(nextElem)); + Assert(ShmemAddrIsValid(prevElem)); + + prevElem->next = queue->next; + nextElem->prev = queue->prev; + + queue->prev = queue->next = NULL; +} + +/* + * SHMQueueInsertBefore -- put elem in queue before the given queue + * element. Inserting "before" the queue head puts the elem + * at the tail of the queue. + */ +void +SHMQueueInsertBefore(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *prevPtr = queue->prev; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(elem)); + + elem->next = prevPtr->next; + elem->prev = queue->prev; + queue->prev = elem; + prevPtr->next = elem; +} + +/* + * SHMQueueInsertAfter -- put elem in queue after the given queue + * element. Inserting "after" the queue head puts the elem + * at the head of the queue. 
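+ * Hence, repeatedly inserting "after" the head and scanning with
+ * SHMQueueNext visits elements in LIFO order, while inserting "before" the
+ * head (SHMQueueInsertBefore, above) and scanning the same way gives FIFO
+ * order.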
+ */ +void +SHMQueueInsertAfter(SHM_QUEUE *queue, SHM_QUEUE *elem) +{ + SHM_QUEUE *nextPtr = queue->next; + + Assert(ShmemAddrIsValid(queue)); + Assert(ShmemAddrIsValid(elem)); + + elem->prev = nextPtr->prev; + elem->next = queue->next; + queue->next = elem; + nextPtr->prev = elem; +} + +/*-------------------- + * SHMQueueNext -- Get the next element from a queue + * + * To start the iteration, pass the queue head as both queue and curElem. + * Returns NULL if no more elements. + * + * Next element is at curElem->next. If SHMQueue is part of + * a larger structure, we want to return a pointer to the + * whole structure rather than a pointer to its SHMQueue field. + * For example, + * struct { + * int stuff; + * SHMQueue elem; + * } ELEMType; + * When this element is in a queue, prevElem->next points at struct.elem. + * We subtract linkOffset to get the correct start address of the structure. + * + * calls to SHMQueueNext should take these parameters: + * &(queueHead), &(queueHead), offsetof(ELEMType, elem) + * or + * &(queueHead), &(curElem->elem), offsetof(ELEMType, elem) + *-------------------- + */ +Pointer +SHMQueueNext(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset) +{ + SHM_QUEUE *elemPtr = curElem->next; + + Assert(ShmemAddrIsValid(curElem)); + + if (elemPtr == queue) /* back to the queue head? */ + return NULL; + + return (Pointer) (((char *) elemPtr) - linkOffset); +} + +/*-------------------- + * SHMQueuePrev -- Get the previous element from a queue + * + * Same as SHMQueueNext, just starting at tail and moving towards head. + * All other comments and usage applies. + */ +Pointer +SHMQueuePrev(const SHM_QUEUE *queue, const SHM_QUEUE *curElem, Size linkOffset) +{ + SHM_QUEUE *elemPtr = curElem->prev; + + Assert(ShmemAddrIsValid(curElem)); + + if (elemPtr == queue) /* back to the queue head? */ + return NULL; + + return (Pointer) (((char *) elemPtr) - linkOffset); +} + +/* + * SHMQueueEmpty -- true if queue head is only element, false otherwise + */ +bool +SHMQueueEmpty(const SHM_QUEUE *queue) +{ + Assert(ShmemAddrIsValid(queue)); + + if (queue->prev == queue) + { + Assert(queue->next == queue); + return true; + } + return false; +} diff --git a/src/backend/storage/ipc/signalfuncs.c b/src/backend/storage/ipc/signalfuncs.c new file mode 100644 index 0000000..de69d60 --- /dev/null +++ b/src/backend/storage/ipc/signalfuncs.c @@ -0,0 +1,300 @@ +/*------------------------------------------------------------------------- + * + * signalfuncs.c + * Functions for signaling backends + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/signalfuncs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> + +#include "catalog/pg_authid.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "postmaster/syslogger.h" +#include "storage/pmsignal.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "utils/acl.h" +#include "utils/builtins.h" + + +/* + * Send a signal to another backend. + * + * The signal is delivered if the user is either a superuser or the same + * role as the backend being signaled. For "dangerous" signals, an explicit + * check for superuser needs to be done prior to calling this function. 
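+ * So, for example, a role granted pg_signal_backend can cancel or terminate
+ * backends belonging to other non-superuser roles, but still gets the
+ * "needs superuser" result when the target backend is superuser-owned.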
+ * + * Returns 0 on success, 1 on general failure, 2 on normal permission error + * and 3 if the caller needs to be a superuser. + * + * In the event of a general failure (return code 1), a warning message will + * be emitted. For permission errors, doing that is the responsibility of + * the caller. + */ +#define SIGNAL_BACKEND_SUCCESS 0 +#define SIGNAL_BACKEND_ERROR 1 +#define SIGNAL_BACKEND_NOPERMISSION 2 +#define SIGNAL_BACKEND_NOSUPERUSER 3 +static int +pg_signal_backend(int pid, int sig) +{ + PGPROC *proc = BackendPidGetProc(pid); + + /* + * BackendPidGetProc returns NULL if the pid isn't valid; but by the time + * we reach kill(), a process for which we get a valid proc here might + * have terminated on its own. There's no way to acquire a lock on an + * arbitrary process to prevent that. But since so far all the callers of + * this mechanism involve some request for ending the process anyway, that + * it might end on its own first is not a problem. + */ + if (proc == NULL) + { + /* + * This is just a warning so a loop-through-resultset will not abort + * if one backend terminated on its own during the run. + */ + ereport(WARNING, + (errmsg("PID %d is not a PostgreSQL server process", pid))); + return SIGNAL_BACKEND_ERROR; + } + + /* Only allow superusers to signal superuser-owned backends. */ + if (superuser_arg(proc->roleId) && !superuser()) + return SIGNAL_BACKEND_NOSUPERUSER; + + /* Users can signal backends they have role membership in. */ + if (!has_privs_of_role(GetUserId(), proc->roleId) && + !has_privs_of_role(GetUserId(), ROLE_PG_SIGNAL_BACKEND)) + return SIGNAL_BACKEND_NOPERMISSION; + + /* + * Can the process we just validated above end, followed by the pid being + * recycled for a new process, before reaching here? Then we'd be trying + * to kill the wrong thing. Seems near impossible when sequential pid + * assignment and wraparound is used. Perhaps it could happen on a system + * where pid re-use is randomized. That race condition possibility seems + * too unlikely to worry about. + */ + + /* If we have setsid(), signal the backend's whole process group */ +#ifdef HAVE_SETSID + if (kill(-pid, sig)) +#else + if (kill(pid, sig)) +#endif + { + /* Again, just a warning to allow loops */ + ereport(WARNING, + (errmsg("could not send signal to process %d: %m", pid))); + return SIGNAL_BACKEND_ERROR; + } + return SIGNAL_BACKEND_SUCCESS; +} + +/* + * Signal to cancel a backend process. This is allowed if you are a member of + * the role whose process is being canceled. + * + * Note that only superusers can signal superuser-owned processes. + */ +Datum +pg_cancel_backend(PG_FUNCTION_ARGS) +{ + int r = pg_signal_backend(PG_GETARG_INT32(0), SIGINT); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to cancel superuser query"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose query is being canceled or member of pg_signal_backend"))); + + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Wait until there is no backend process with the given PID and return true. + * On timeout, a warning is emitted and false is returned. + */ +static bool +pg_wait_until_termination(int pid, int64 timeout) +{ + /* + * Wait in steps of waittime milliseconds until this function exits or + * timeout. + */ + int64 waittime = 100; + + /* + * Initially remaining time is the entire timeout specified by the user. 
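+ * For instance, with a 250 ms timeout the loop below waits 100, 100 and
+ * then 50 ms before giving up (assuming the target backend never exits).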
+ */ + int64 remainingtime = timeout; + + /* + * Check existence of the backend. If the backend still exists, then wait + * for waittime milliseconds, again check for the existence. Repeat this + * until timeout or an error occurs or a pending interrupt such as query + * cancel gets processed. + */ + do + { + if (remainingtime < waittime) + waittime = remainingtime; + + if (kill(pid, 0) == -1) + { + if (errno == ESRCH) + return true; + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("could not check the existence of the backend with PID %d: %m", + pid))); + } + + /* Process interrupts, if any, before waiting */ + CHECK_FOR_INTERRUPTS(); + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH, + waittime, + WAIT_EVENT_BACKEND_TERMINATION); + + ResetLatch(MyLatch); + + remainingtime -= waittime; + } while (remainingtime > 0); + + ereport(WARNING, + (errmsg_plural("backend with PID %d did not terminate within %lld millisecond", + "backend with PID %d did not terminate within %lld milliseconds", + timeout, + pid, (long long int) timeout))); + + return false; +} + +/* + * Send a signal to terminate a backend process. This is allowed if you are a + * member of the role whose process is being terminated. If the timeout input + * argument is 0, then this function just signals the backend and returns + * true. If timeout is nonzero, then it waits until no process has the given + * PID; if the process ends within the timeout, true is returned, and if the + * timeout is exceeded, a warning is emitted and false is returned. + * + * Note that only superusers can signal superuser-owned processes. + */ +Datum +pg_terminate_backend(PG_FUNCTION_ARGS) +{ + int pid; + int r; + int timeout; /* milliseconds */ + + pid = PG_GETARG_INT32(0); + timeout = PG_GETARG_INT64(1); + + if (timeout < 0) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"timeout\" must not be negative"))); + + r = pg_signal_backend(pid, SIGTERM); + + if (r == SIGNAL_BACKEND_NOSUPERUSER) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a superuser to terminate superuser process"))); + + if (r == SIGNAL_BACKEND_NOPERMISSION) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be a member of the role whose process is being terminated or member of pg_signal_backend"))); + + /* Wait only on success and if actually requested */ + if (r == SIGNAL_BACKEND_SUCCESS && timeout > 0) + PG_RETURN_BOOL(pg_wait_until_termination(pid, timeout)); + else + PG_RETURN_BOOL(r == SIGNAL_BACKEND_SUCCESS); +} + +/* + * Signal to reload the database configuration + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_reload_conf(PG_FUNCTION_ARGS) +{ + if (kill(PostmasterPid, SIGHUP)) + { + ereport(WARNING, + (errmsg("failed to send signal to postmaster: %m"))); + PG_RETURN_BOOL(false); + } + + PG_RETURN_BOOL(true); +} + + +/* + * Rotate log file + * + * This function is kept to support adminpack 1.0. 
+ */ +Datum +pg_rotate_logfile(PG_FUNCTION_ARGS) +{ + if (!superuser()) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("must be superuser to rotate log files with adminpack 1.0"), + /* translator: %s is a SQL function name */ + errhint("Consider using %s, which is part of core, instead.", + "pg_logfile_rotate()"))); + + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} + +/* + * Rotate log file + * + * Permission checking for this function is managed through the normal + * GRANT system. + */ +Datum +pg_rotate_logfile_v2(PG_FUNCTION_ARGS) +{ + if (!Logging_collector) + { + ereport(WARNING, + (errmsg("rotation not possible because log collection not active"))); + PG_RETURN_BOOL(false); + } + + SendPostmasterSignal(PMSIGNAL_ROTATE_LOGFILE); + PG_RETURN_BOOL(true); +} diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c new file mode 100644 index 0000000..f585d63 --- /dev/null +++ b/src/backend/storage/ipc/sinval.c @@ -0,0 +1,205 @@ +/*------------------------------------------------------------------------- + * + * sinval.c + * POSTGRES shared cache invalidation communication code. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinval.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "commands/async.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/sinvaladt.h" +#include "utils/inval.h" + + +uint64 SharedInvalidMessageCounter; + + +/* + * Because backends sitting idle will not be reading sinval events, we + * need a way to give an idle backend a swift kick in the rear and make + * it catch up before the sinval queue overflows and forces it to go + * through a cache reset exercise. This is done by sending + * PROCSIG_CATCHUP_INTERRUPT to any backend that gets too far behind. + * + * The signal handler will set an interrupt pending flag and will set the + * processes latch. Whenever starting to read from the client, or when + * interrupted while doing so, ProcessClientReadInterrupt() will call + * ProcessCatchupEvent(). + */ +volatile sig_atomic_t catchupInterruptPending = false; + + +/* + * SendSharedInvalidMessages + * Add shared-cache-invalidation message(s) to the global SI message queue. + */ +void +SendSharedInvalidMessages(const SharedInvalidationMessage *msgs, int n) +{ + SIInsertDataEntries(msgs, n); +} + +/* + * ReceiveSharedInvalidMessages + * Process shared-cache-invalidation messages waiting for this backend + * + * We guarantee to process all messages that had been queued before the + * routine was entered. It is of course possible for more messages to get + * queued right after our last SIGetDataEntries call. + * + * NOTE: it is entirely possible for this routine to be invoked recursively + * as a consequence of processing inside the invalFunction or resetFunction. + * Furthermore, such a recursive call must guarantee that all outstanding + * inval messages have been processed before it exits. 
This is the reason + * for the strange-looking choice to use a statically allocated buffer array + * and counters; it's so that a recursive call can process messages already + * sucked out of sinvaladt.c. + */ +void +ReceiveSharedInvalidMessages(void (*invalFunction) (SharedInvalidationMessage *msg), + void (*resetFunction) (void)) +{ +#define MAXINVALMSGS 32 + static SharedInvalidationMessage messages[MAXINVALMSGS]; + + /* + * We use volatile here to prevent bugs if a compiler doesn't realize that + * recursion is a possibility ... + */ + static volatile int nextmsg = 0; + static volatile int nummsgs = 0; + + /* Deal with any messages still pending from an outer recursion */ + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + do + { + int getResult; + + nextmsg = nummsgs = 0; + + /* Try to get some more messages */ + getResult = SIGetDataEntries(messages, MAXINVALMSGS); + + if (getResult < 0) + { + /* got a reset message */ + elog(DEBUG4, "cache state reset"); + SharedInvalidMessageCounter++; + resetFunction(); + break; /* nothing more to do */ + } + + /* Process them, being wary that a recursive call might eat some */ + nextmsg = 0; + nummsgs = getResult; + + while (nextmsg < nummsgs) + { + SharedInvalidationMessage msg = messages[nextmsg++]; + + SharedInvalidMessageCounter++; + invalFunction(&msg); + } + + /* + * We only need to loop if the last SIGetDataEntries call (which might + * have been within a recursive call) returned a full buffer. + */ + } while (nummsgs == MAXINVALMSGS); + + /* + * We are now caught up. If we received a catchup signal, reset that + * flag, and call SICleanupQueue(). This is not so much because we need + * to flush dead messages right now, as that we want to pass on the + * catchup signal to the next slowest backend. "Daisy chaining" the + * catchup signal this way avoids creating spikes in system load for what + * should be just a background maintenance activity. + */ + if (catchupInterruptPending) + { + catchupInterruptPending = false; + elog(DEBUG4, "sinval catchup complete, cleaning queue"); + SICleanupQueue(false, 0); + } +} + + +/* + * HandleCatchupInterrupt + * + * This is called when PROCSIG_CATCHUP_INTERRUPT is received. + * + * We used to directly call ProcessCatchupEvent directly when idle. These days + * we just set a flag to do it later and notify the process of that fact by + * setting the process's latch. + */ +void +HandleCatchupInterrupt(void) +{ + /* + * Note: this is called by a SIGNAL HANDLER. You must be very wary what + * you do here. + */ + + catchupInterruptPending = true; + + /* make sure the event is processed in due course */ + SetLatch(MyLatch); +} + +/* + * ProcessCatchupInterrupt + * + * The portion of catchup interrupt handling that runs outside of the signal + * handler, which allows it to actually process pending invalidations. + */ +void +ProcessCatchupInterrupt(void) +{ + while (catchupInterruptPending) + { + /* + * What we need to do here is cause ReceiveSharedInvalidMessages() to + * run, which will do the necessary work and also reset the + * catchupInterruptPending flag. If we are inside a transaction we + * can just call AcceptInvalidationMessages() to do this. If we + * aren't, we start and immediately end a transaction; the call to + * AcceptInvalidationMessages() happens down inside transaction start. 
+ * + * It is awfully tempting to just call AcceptInvalidationMessages() + * without the rest of the xact start/stop overhead, and I think that + * would actually work in the normal case; but I am not sure that + * things would clean up nicely if we got an error partway through. + */ + if (IsTransactionOrTransactionBlock()) + { + elog(DEBUG4, "ProcessCatchupEvent inside transaction"); + AcceptInvalidationMessages(); + } + else + { + elog(DEBUG4, "ProcessCatchupEvent outside transaction"); + StartTransactionCommand(); + CommitTransactionCommand(); + } + } +} diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c new file mode 100644 index 0000000..946bd8e --- /dev/null +++ b/src/backend/storage/ipc/sinvaladt.c @@ -0,0 +1,777 @@ +/*------------------------------------------------------------------------- + * + * sinvaladt.c + * POSTGRES shared cache invalidation data manager. + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * + * IDENTIFICATION + * src/backend/storage/ipc/sinvaladt.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include <signal.h> +#include <unistd.h> + +#include "access/transam.h" +#include "miscadmin.h" +#include "storage/backendid.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/procsignal.h" +#include "storage/shmem.h" +#include "storage/sinvaladt.h" +#include "storage/spin.h" + +/* + * Conceptually, the shared cache invalidation messages are stored in an + * infinite array, where maxMsgNum is the next array subscript to store a + * submitted message in, minMsgNum is the smallest array subscript containing + * a message not yet read by all backends, and we always have maxMsgNum >= + * minMsgNum. (They are equal when there are no messages pending.) For each + * active backend, there is a nextMsgNum pointer indicating the next message it + * needs to read; we have maxMsgNum >= nextMsgNum >= minMsgNum for every + * backend. + * + * (In the current implementation, minMsgNum is a lower bound for the + * per-process nextMsgNum values, but it isn't rigorously kept equal to the + * smallest nextMsgNum --- it may lag behind. We only update it when + * SICleanupQueue is called, and we try not to do that often.) + * + * In reality, the messages are stored in a circular buffer of MAXNUMMESSAGES + * entries. We translate MsgNum values into circular-buffer indexes by + * computing MsgNum % MAXNUMMESSAGES (this should be fast as long as + * MAXNUMMESSAGES is a constant and a power of 2). As long as maxMsgNum + * doesn't exceed minMsgNum by more than MAXNUMMESSAGES, we have enough space + * in the buffer. If the buffer does overflow, we recover by setting the + * "reset" flag for each backend that has fallen too far behind. A backend + * that is in "reset" state is ignored while determining minMsgNum. When + * it does finally attempt to receive inval messages, it must discard all + * its invalidatable state, since it won't know what it missed. + * + * To reduce the probability of needing resets, we send a "catchup" interrupt + * to any backend that seems to be falling unreasonably far behind. The + * normal behavior is that at most one such interrupt is in flight at a time; + * when a backend completes processing a catchup interrupt, it executes + * SICleanupQueue, which will signal the next-furthest-behind backend if + * needed. 
This avoids undue contention from multiple backends all trying + * to catch up at once. However, the furthest-back backend might be stuck + * in a state where it can't catch up. Eventually it will get reset, so it + * won't cause any more problems for anyone but itself. But we don't want + * to find that a bunch of other backends are now too close to the reset + * threshold to be saved. So SICleanupQueue is designed to occasionally + * send extra catchup interrupts as the queue gets fuller, to backends that + * are far behind and haven't gotten one yet. As long as there aren't a lot + * of "stuck" backends, we won't need a lot of extra interrupts, since ones + * that aren't stuck will propagate their interrupts to the next guy. + * + * We would have problems if the MsgNum values overflow an integer, so + * whenever minMsgNum exceeds MSGNUMWRAPAROUND, we subtract MSGNUMWRAPAROUND + * from all the MsgNum variables simultaneously. MSGNUMWRAPAROUND can be + * large so that we don't need to do this often. It must be a multiple of + * MAXNUMMESSAGES so that the existing circular-buffer entries don't need + * to be moved when we do it. + * + * Access to the shared sinval array is protected by two locks, SInvalReadLock + * and SInvalWriteLock. Readers take SInvalReadLock in shared mode; this + * authorizes them to modify their own ProcState but not to modify or even + * look at anyone else's. When we need to perform array-wide updates, + * such as in SICleanupQueue, we take SInvalReadLock in exclusive mode to + * lock out all readers. Writers take SInvalWriteLock (always in exclusive + * mode) to serialize adding messages to the queue. Note that a writer + * can operate in parallel with one or more readers, because the writer + * has no need to touch anyone's ProcState, except in the infrequent cases + * when SICleanupQueue is needed. The only point of overlap is that + * the writer wants to change maxMsgNum while readers need to read it. + * We deal with that by having a spinlock that readers must take for just + * long enough to read maxMsgNum, while writers take it for just long enough + * to write maxMsgNum. (The exact rule is that you need the spinlock to + * read maxMsgNum if you are not holding SInvalWriteLock, and you need the + * spinlock to write maxMsgNum unless you are holding both locks.) + * + * Note: since maxMsgNum is an int and hence presumably atomically readable/ + * writable, the spinlock might seem unnecessary. The reason it is needed + * is to provide a memory barrier: we need to be sure that messages written + * to the array are actually there before maxMsgNum is increased, and that + * readers will see that data after fetching maxMsgNum. Multiprocessors + * that have weak memory-ordering guarantees can fail without the memory + * barrier instructions that are included in the spinlock sequences. + */ + + +/* + * Configurable parameters. + * + * MAXNUMMESSAGES: max number of shared-inval messages we can buffer. + * Must be a power of 2 for speed. + * + * MSGNUMWRAPAROUND: how often to reduce MsgNum variables to avoid overflow. + * Must be a multiple of MAXNUMMESSAGES. Should be large. + * + * CLEANUP_MIN: the minimum number of messages that must be in the buffer + * before we bother to call SICleanupQueue. + * + * CLEANUP_QUANTUM: how often (in messages) to call SICleanupQueue once + * we exceed CLEANUP_MIN. Should be a power of 2 for speed. 
+ * + * SIG_THRESHOLD: the minimum number of messages a backend must have fallen + * behind before we'll send it PROCSIG_CATCHUP_INTERRUPT. + * + * WRITE_QUANTUM: the max number of messages to push into the buffer per + * iteration of SIInsertDataEntries. Noncritical but should be less than + * CLEANUP_QUANTUM, because we only consider calling SICleanupQueue once + * per iteration. + */ + +#define MAXNUMMESSAGES 4096 +#define MSGNUMWRAPAROUND (MAXNUMMESSAGES * 262144) +#define CLEANUP_MIN (MAXNUMMESSAGES / 2) +#define CLEANUP_QUANTUM (MAXNUMMESSAGES / 16) +#define SIG_THRESHOLD (MAXNUMMESSAGES / 2) +#define WRITE_QUANTUM 64 + +/* Per-backend state in shared invalidation structure */ +typedef struct ProcState +{ + /* procPid is zero in an inactive ProcState array entry. */ + pid_t procPid; /* PID of backend, for signaling */ + PGPROC *proc; /* PGPROC of backend */ + /* nextMsgNum is meaningless if procPid == 0 or resetState is true. */ + int nextMsgNum; /* next message number to read */ + bool resetState; /* backend needs to reset its state */ + bool signaled; /* backend has been sent catchup signal */ + bool hasMessages; /* backend has unread messages */ + + /* + * Backend only sends invalidations, never receives them. This only makes + * sense for Startup process during recovery because it doesn't maintain a + * relcache, yet it fires inval messages to allow query backends to see + * schema changes. + */ + bool sendOnly; /* backend only sends, never receives */ + + /* + * Next LocalTransactionId to use for each idle backend slot. We keep + * this here because it is indexed by BackendId and it is convenient to + * copy the value to and from local memory when MyBackendId is set. It's + * meaningless in an active ProcState entry. + */ + LocalTransactionId nextLXID; +} ProcState; + +/* Shared cache invalidation memory segment */ +typedef struct SISeg +{ + /* + * General state information + */ + int minMsgNum; /* oldest message still needed */ + int maxMsgNum; /* next message number to be assigned */ + int nextThreshold; /* # of messages to call SICleanupQueue */ + int lastBackend; /* index of last active procState entry, +1 */ + int maxBackends; /* size of procState array */ + + slock_t msgnumLock; /* spinlock protecting maxMsgNum */ + + /* + * Circular buffer holding shared-inval messages + */ + SharedInvalidationMessage buffer[MAXNUMMESSAGES]; + + /* + * Per-backend invalidation state info (has MaxBackends entries). 
+ */ + ProcState procState[FLEXIBLE_ARRAY_MEMBER]; +} SISeg; + +static SISeg *shmInvalBuffer; /* pointer to the shared inval buffer */ + + +static LocalTransactionId nextLocalTransactionId; + +static void CleanupInvalidationState(int status, Datum arg); + + +/* + * SInvalShmemSize --- return shared-memory space needed + */ +Size +SInvalShmemSize(void) +{ + Size size; + + size = offsetof(SISeg, procState); + size = add_size(size, mul_size(sizeof(ProcState), MaxBackends)); + + return size; +} + +/* + * CreateSharedInvalidationState + * Create and initialize the SI message buffer + */ +void +CreateSharedInvalidationState(void) +{ + int i; + bool found; + + /* Allocate space in shared memory */ + shmInvalBuffer = (SISeg *) + ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found); + if (found) + return; + + /* Clear message counters, save size of procState array, init spinlock */ + shmInvalBuffer->minMsgNum = 0; + shmInvalBuffer->maxMsgNum = 0; + shmInvalBuffer->nextThreshold = CLEANUP_MIN; + shmInvalBuffer->lastBackend = 0; + shmInvalBuffer->maxBackends = MaxBackends; + SpinLockInit(&shmInvalBuffer->msgnumLock); + + /* The buffer[] array is initially all unused, so we need not fill it */ + + /* Mark all backends inactive, and initialize nextLXID */ + for (i = 0; i < shmInvalBuffer->maxBackends; i++) + { + shmInvalBuffer->procState[i].procPid = 0; /* inactive */ + shmInvalBuffer->procState[i].proc = NULL; + shmInvalBuffer->procState[i].nextMsgNum = 0; /* meaningless */ + shmInvalBuffer->procState[i].resetState = false; + shmInvalBuffer->procState[i].signaled = false; + shmInvalBuffer->procState[i].hasMessages = false; + shmInvalBuffer->procState[i].nextLXID = InvalidLocalTransactionId; + } +} + +/* + * SharedInvalBackendInit + * Initialize a new backend to operate on the sinval buffer + */ +void +SharedInvalBackendInit(bool sendOnly) +{ + int index; + ProcState *stateP = NULL; + SISeg *segP = shmInvalBuffer; + + /* + * This can run in parallel with read operations, but not with write + * operations, since SIInsertDataEntries relies on lastBackend to set + * hasMessages appropriately. + */ + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* Look for a free entry in the procState array */ + for (index = 0; index < segP->lastBackend; index++) + { + if (segP->procState[index].procPid == 0) /* inactive slot? 
*/ + { + stateP = &segP->procState[index]; + break; + } + } + + if (stateP == NULL) + { + if (segP->lastBackend < segP->maxBackends) + { + stateP = &segP->procState[segP->lastBackend]; + Assert(stateP->procPid == 0); + segP->lastBackend++; + } + else + { + /* + * out of procState slots: MaxBackends exceeded -- report normally + */ + MyBackendId = InvalidBackendId; + LWLockRelease(SInvalWriteLock); + ereport(FATAL, + (errcode(ERRCODE_TOO_MANY_CONNECTIONS), + errmsg("sorry, too many clients already"))); + } + } + + MyBackendId = (stateP - &segP->procState[0]) + 1; + + /* Advertise assigned backend ID in MyProc */ + MyProc->backendId = MyBackendId; + + /* Fetch next local transaction ID into local memory */ + nextLocalTransactionId = stateP->nextLXID; + + /* mark myself active, with all extant messages already read */ + stateP->procPid = MyProcPid; + stateP->proc = MyProc; + stateP->nextMsgNum = segP->maxMsgNum; + stateP->resetState = false; + stateP->signaled = false; + stateP->hasMessages = false; + stateP->sendOnly = sendOnly; + + LWLockRelease(SInvalWriteLock); + + /* register exit routine to mark my entry inactive at exit */ + on_shmem_exit(CleanupInvalidationState, PointerGetDatum(segP)); + + elog(DEBUG4, "my backend ID is %d", MyBackendId); +} + +/* + * CleanupInvalidationState + * Mark the current backend as no longer active. + * + * This function is called via on_shmem_exit() during backend shutdown. + * + * arg is really of type "SISeg*". + */ +static void +CleanupInvalidationState(int status, Datum arg) +{ + SISeg *segP = (SISeg *) DatumGetPointer(arg); + ProcState *stateP; + int i; + + Assert(PointerIsValid(segP)); + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + stateP = &segP->procState[MyBackendId - 1]; + + /* Update next local transaction ID for next holder of this backendID */ + stateP->nextLXID = nextLocalTransactionId; + + /* Mark myself inactive */ + stateP->procPid = 0; + stateP->proc = NULL; + stateP->nextMsgNum = 0; + stateP->resetState = false; + stateP->signaled = false; + + /* Recompute index of last active backend */ + for (i = segP->lastBackend; i > 0; i--) + { + if (segP->procState[i - 1].procPid != 0) + break; + } + segP->lastBackend = i; + + LWLockRelease(SInvalWriteLock); +} + +/* + * BackendIdGetProc + * Get the PGPROC structure for a backend, given the backend ID. + * The result may be out of date arbitrarily quickly, so the caller + * must be careful about how this information is used. NULL is + * returned if the backend is not active. + */ +PGPROC * +BackendIdGetProc(int backendID) +{ + PGPROC *result = NULL; + SISeg *segP = shmInvalBuffer; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + + result = stateP->proc; + } + + LWLockRelease(SInvalWriteLock); + + return result; +} + +/* + * BackendIdGetTransactionIds + * Get the xid and xmin of the backend. The result may be out of date + * arbitrarily quickly, so the caller must be careful about how this + * information is used. 
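+ * (The pg_stat_activity machinery is the main consumer, using these values
+ * for its backend_xid and backend_xmin columns.)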
+ */ +void +BackendIdGetTransactionIds(int backendID, TransactionId *xid, TransactionId *xmin) +{ + SISeg *segP = shmInvalBuffer; + + *xid = InvalidTransactionId; + *xmin = InvalidTransactionId; + + /* Need to lock out additions/removals of backends */ + LWLockAcquire(SInvalWriteLock, LW_SHARED); + + if (backendID > 0 && backendID <= segP->lastBackend) + { + ProcState *stateP = &segP->procState[backendID - 1]; + PGPROC *proc = stateP->proc; + + if (proc != NULL) + { + *xid = proc->xid; + *xmin = proc->xmin; + } + } + + LWLockRelease(SInvalWriteLock); +} + +/* + * SIInsertDataEntries + * Add new invalidation message(s) to the buffer. + */ +void +SIInsertDataEntries(const SharedInvalidationMessage *data, int n) +{ + SISeg *segP = shmInvalBuffer; + + /* + * N can be arbitrarily large. We divide the work into groups of no more + * than WRITE_QUANTUM messages, to be sure that we don't hold the lock for + * an unreasonably long time. (This is not so much because we care about + * letting in other writers, as that some just-caught-up backend might be + * trying to do SICleanupQueue to pass on its signal, and we don't want it + * to have to wait a long time.) Also, we need to consider calling + * SICleanupQueue every so often. + */ + while (n > 0) + { + int nthistime = Min(n, WRITE_QUANTUM); + int numMsgs; + int max; + int i; + + n -= nthistime; + + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + + /* + * If the buffer is full, we *must* acquire some space. Clean the + * queue and reset anyone who is preventing space from being freed. + * Otherwise, clean the queue only when it's exceeded the next + * fullness threshold. We have to loop and recheck the buffer state + * after any call of SICleanupQueue. + */ + for (;;) + { + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs + nthistime > MAXNUMMESSAGES || + numMsgs >= segP->nextThreshold) + SICleanupQueue(true, nthistime); + else + break; + } + + /* + * Insert new message(s) into proper slot of circular buffer + */ + max = segP->maxMsgNum; + while (nthistime-- > 0) + { + segP->buffer[max % MAXNUMMESSAGES] = *data++; + max++; + } + + /* Update current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + segP->maxMsgNum = max; + SpinLockRelease(&segP->msgnumLock); + + /* + * Now that the maxMsgNum change is globally visible, we give everyone + * a swift kick to make sure they read the newly added messages. + * Releasing SInvalWriteLock will enforce a full memory barrier, so + * these (unlocked) changes will be committed to memory before we exit + * the function. + */ + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + + stateP->hasMessages = true; + } + + LWLockRelease(SInvalWriteLock); + } +} + +/* + * SIGetDataEntries + * get next SI message(s) for current backend, if there are any + * + * Possible return values: + * 0: no SI message available + * n>0: next n SI messages have been extracted into data[] + * -1: SI reset message extracted + * + * If the return value is less than the array size "datasize", the caller + * can assume that there are no more SI messages after the one(s) returned. + * Otherwise, another call is needed to collect more messages. + * + * NB: this can run in parallel with other instances of SIGetDataEntries + * executing on behalf of other backends, since each instance will modify only + * fields of its own backend's ProcState, and no instance will look at fields + * of other backends' ProcStates. 
We express this by grabbing SInvalReadLock + * in shared mode. Note that this is not exactly the normal (read-only) + * interpretation of a shared lock! Look closely at the interactions before + * allowing SInvalReadLock to be grabbed in shared mode for any other reason! + * + * NB: this can also run in parallel with SIInsertDataEntries. It is not + * guaranteed that we will return any messages added after the routine is + * entered. + * + * Note: we assume that "datasize" is not so large that it might be important + * to break our hold on SInvalReadLock into segments. + */ +int +SIGetDataEntries(SharedInvalidationMessage *data, int datasize) +{ + SISeg *segP; + ProcState *stateP; + int max; + int n; + + segP = shmInvalBuffer; + stateP = &segP->procState[MyBackendId - 1]; + + /* + * Before starting to take locks, do a quick, unlocked test to see whether + * there can possibly be anything to read. On a multiprocessor system, + * it's possible that this load could migrate backwards and occur before + * we actually enter this function, so we might miss a sinval message that + * was just added by some other processor. But they can't migrate + * backwards over a preceding lock acquisition, so it should be OK. If we + * haven't acquired a lock preventing against further relevant + * invalidations, any such occurrence is not much different than if the + * invalidation had arrived slightly later in the first place. + */ + if (!stateP->hasMessages) + return 0; + + LWLockAcquire(SInvalReadLock, LW_SHARED); + + /* + * We must reset hasMessages before determining how many messages we're + * going to read. That way, if new messages arrive after we have + * determined how many we're reading, the flag will get reset and we'll + * notice those messages part-way through. + * + * Note that, if we don't end up reading all of the messages, we had + * better be certain to reset this flag before exiting! + */ + stateP->hasMessages = false; + + /* Fetch current value of maxMsgNum using spinlock */ + SpinLockAcquire(&segP->msgnumLock); + max = segP->maxMsgNum; + SpinLockRelease(&segP->msgnumLock); + + if (stateP->resetState) + { + /* + * Force reset. We can say we have dealt with any messages added + * since the reset, as well; and that means we should clear the + * signaled flag, too. + */ + stateP->nextMsgNum = max; + stateP->resetState = false; + stateP->signaled = false; + LWLockRelease(SInvalReadLock); + return -1; + } + + /* + * Retrieve messages and advance backend's counter, until data array is + * full or there are no more messages. + * + * There may be other backends that haven't read the message(s), so we + * cannot delete them here. SICleanupQueue() will eventually remove them + * from the queue. + */ + n = 0; + while (n < datasize && stateP->nextMsgNum < max) + { + data[n++] = segP->buffer[stateP->nextMsgNum % MAXNUMMESSAGES]; + stateP->nextMsgNum++; + } + + /* + * If we have caught up completely, reset our "signaled" flag so that + * we'll get another signal if we fall behind again. + * + * If we haven't caught up completely, reset the hasMessages flag so that + * we see the remaining messages next time. + */ + if (stateP->nextMsgNum >= max) + stateP->signaled = false; + else + stateP->hasMessages = true; + + LWLockRelease(SInvalReadLock); + return n; +} + +/* + * SICleanupQueue + * Remove messages that have been consumed by all active backends + * + * callerHasWriteLock is true if caller is holding SInvalWriteLock. + * minFree is the minimum number of message slots to make free. 
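+ *
+ * (Editor's illustration, not part of the original comment: on exit the
+ * routine leaves nextThreshold at the next multiple of CLEANUP_QUANTUM
+ * above the number of messages still queued, but never below CLEANUP_MIN;
+ * e.g. if the queue still holds slightly more than 10 * CLEANUP_QUANTUM
+ * messages, and that is at least CLEANUP_MIN, the threshold becomes
+ * 11 * CLEANUP_QUANTUM.)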
+ * + * Possible side effects of this routine include marking one or more + * backends as "reset" in the array, and sending PROCSIG_CATCHUP_INTERRUPT + * to some backend that seems to be getting too far behind. We signal at + * most one backend at a time, for reasons explained at the top of the file. + * + * Caution: because we transiently release write lock when we have to signal + * some other backend, it is NOT guaranteed that there are still minFree + * free message slots at exit. Caller must recheck and perhaps retry. + */ +void +SICleanupQueue(bool callerHasWriteLock, int minFree) +{ + SISeg *segP = shmInvalBuffer; + int min, + minsig, + lowbound, + numMsgs, + i; + ProcState *needSig = NULL; + + /* Lock out all writers and readers */ + if (!callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + LWLockAcquire(SInvalReadLock, LW_EXCLUSIVE); + + /* + * Recompute minMsgNum = minimum of all backends' nextMsgNum, identify the + * furthest-back backend that needs signaling (if any), and reset any + * backends that are too far back. Note that because we ignore sendOnly + * backends here it is possible for them to keep sending messages without + * a problem even when they are the only active backend. + */ + min = segP->maxMsgNum; + minsig = min - SIG_THRESHOLD; + lowbound = min - MAXNUMMESSAGES + minFree; + + for (i = 0; i < segP->lastBackend; i++) + { + ProcState *stateP = &segP->procState[i]; + int n = stateP->nextMsgNum; + + /* Ignore if inactive or already in reset state */ + if (stateP->procPid == 0 || stateP->resetState || stateP->sendOnly) + continue; + + /* + * If we must free some space and this backend is preventing it, force + * him into reset state and then ignore until he catches up. + */ + if (n < lowbound) + { + stateP->resetState = true; + /* no point in signaling him ... */ + continue; + } + + /* Track the global minimum nextMsgNum */ + if (n < min) + min = n; + + /* Also see who's furthest back of the unsignaled backends */ + if (n < minsig && !stateP->signaled) + { + minsig = n; + needSig = stateP; + } + } + segP->minMsgNum = min; + + /* + * When minMsgNum gets really large, decrement all message counters so as + * to forestall overflow of the counters. This happens seldom enough that + * folding it into the previous loop would be a loser. + */ + if (min >= MSGNUMWRAPAROUND) + { + segP->minMsgNum -= MSGNUMWRAPAROUND; + segP->maxMsgNum -= MSGNUMWRAPAROUND; + for (i = 0; i < segP->lastBackend; i++) + { + /* we don't bother skipping inactive entries here */ + segP->procState[i].nextMsgNum -= MSGNUMWRAPAROUND; + } + } + + /* + * Determine how many messages are still in the queue, and set the + * threshold at which we should repeat SICleanupQueue(). + */ + numMsgs = segP->maxMsgNum - segP->minMsgNum; + if (numMsgs < CLEANUP_MIN) + segP->nextThreshold = CLEANUP_MIN; + else + segP->nextThreshold = (numMsgs / CLEANUP_QUANTUM + 1) * CLEANUP_QUANTUM; + + /* + * Lastly, signal anyone who needs a catchup interrupt. Since + * SendProcSignal() might not be fast, we don't want to hold locks while + * executing it. 
+ */ + if (needSig) + { + pid_t his_pid = needSig->procPid; + BackendId his_backendId = (needSig - &segP->procState[0]) + 1; + + needSig->signaled = true; + LWLockRelease(SInvalReadLock); + LWLockRelease(SInvalWriteLock); + elog(DEBUG4, "sending sinval catchup signal to PID %d", (int) his_pid); + SendProcSignal(his_pid, PROCSIG_CATCHUP_INTERRUPT, his_backendId); + if (callerHasWriteLock) + LWLockAcquire(SInvalWriteLock, LW_EXCLUSIVE); + } + else + { + LWLockRelease(SInvalReadLock); + if (!callerHasWriteLock) + LWLockRelease(SInvalWriteLock); + } +} + + +/* + * GetNextLocalTransactionId --- allocate a new LocalTransactionId + * + * We split VirtualTransactionIds into two parts so that it is possible + * to allocate a new one without any contention for shared memory, except + * for a bit of additional overhead during backend startup/shutdown. + * The high-order part of a VirtualTransactionId is a BackendId, and the + * low-order part is a LocalTransactionId, which we assign from a local + * counter. To avoid the risk of a VirtualTransactionId being reused + * within a short interval, successive procs occupying the same backend ID + * slot should use a consecutive sequence of local IDs, which is implemented + * by copying nextLocalTransactionId as seen above. + */ +LocalTransactionId +GetNextLocalTransactionId(void) +{ + LocalTransactionId result; + + /* loop to avoid returning InvalidLocalTransactionId at wraparound */ + do + { + result = nextLocalTransactionId++; + } while (!LocalTransactionIdIsValid(result)); + + return result; +} diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c new file mode 100644 index 0000000..687ce03 --- /dev/null +++ b/src/backend/storage/ipc/standby.c @@ -0,0 +1,1450 @@ +/*------------------------------------------------------------------------- + * + * standby.c + * Misc functions used in Hot Standby mode. + * + * All functions for handling RM_STANDBY_ID, which relate to + * AccessExclusiveLocks and starting snapshots for Hot Standby mode. + * Plus conflict recovery processing. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/ipc/standby.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "access/transam.h" +#include "access/twophase.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "miscadmin.h" +#include "pgstat.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/proc.h" +#include "storage/procarray.h" +#include "storage/sinvaladt.h" +#include "storage/standby.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "utils/ps_status.h" +#include "utils/timeout.h" +#include "utils/timestamp.h" + +/* User-settable GUC parameters */ +int vacuum_defer_cleanup_age; +int max_standby_archive_delay = 30 * 1000; +int max_standby_streaming_delay = 30 * 1000; +bool log_recovery_conflict_waits = false; + +static HTAB *RecoveryLockLists; + +/* Flags set by timeout handlers */ +static volatile sig_atomic_t got_standby_deadlock_timeout = false; +static volatile sig_atomic_t got_standby_delay_timeout = false; +static volatile sig_atomic_t got_standby_lock_timeout = false; + +static void ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, + uint32 wait_event_info, + bool report_waiting); +static void SendRecoveryConflictWithBufferPin(ProcSignalReason reason); +static XLogRecPtr LogCurrentRunningXacts(RunningTransactions CurrRunningXacts); +static void LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks); +static const char *get_recovery_conflict_desc(ProcSignalReason reason); + +/* + * Keep track of all the locks owned by a given transaction. + */ +typedef struct RecoveryLockListsEntry +{ + TransactionId xid; + List *locks; +} RecoveryLockListsEntry; + +/* + * InitRecoveryTransactionEnvironment + * Initialize tracking of our primary's in-progress transactions. + * + * We need to issue shared invalidations and hold locks. Holding locks + * means others may want to wait on us, so we need to make a lock table + * vxact entry like a real transaction. We could create and delete + * lock table entries for each transaction but its simpler just to create + * one permanent entry and leave it there all the time. Locks are then + * acquired and released as needed. Yes, this means you can see the + * Startup process in pg_locks once we have run this. + */ +void +InitRecoveryTransactionEnvironment(void) +{ + VirtualTransactionId vxid; + HASHCTL hash_ctl; + + /* + * Initialize the hash table for tracking the list of locks held by each + * transaction. + */ + hash_ctl.keysize = sizeof(TransactionId); + hash_ctl.entrysize = sizeof(RecoveryLockListsEntry); + RecoveryLockLists = hash_create("RecoveryLockLists", + 64, + &hash_ctl, + HASH_ELEM | HASH_BLOBS); + + /* + * Initialize shared invalidation management for Startup process, being + * careful to register ourselves as a sendOnly process so we don't need to + * read messages, nor will we get signaled when the queue starts filling + * up. + */ + SharedInvalBackendInit(true); + + /* + * Lock a virtual transaction id for Startup process. + * + * We need to do GetNextLocalTransactionId() because + * SharedInvalBackendInit() leaves localTransactionId invalid and the lock + * manager doesn't like that at all. 
+ *
+ * Note that we don't need to run XactLockTableInsert() because nobody
+ * needs to wait on xids. That sounds a little strange, but table locks
+ * are held by vxids and row level locks are held by xids. All queries
+ * hold AccessShareLocks so never block while we write or lock new rows.
+ */
+ vxid.backendId = MyBackendId;
+ vxid.localTransactionId = GetNextLocalTransactionId();
+ VirtualXactLockTableInsert(vxid);
+
+ standbyState = STANDBY_INITIALIZED;
+}
+
+/*
+ * ShutdownRecoveryTransactionEnvironment
+ * Shut down transaction tracking
+ *
+ * Prepare to switch from hot standby mode to normal operation. Shut down
+ * recovery-time transaction tracking.
+ *
+ * This must be called even during shutdown of the startup process if
+ * transaction tracking has been initialized. Otherwise some locks the
+ * tracked transactions were holding will not be released and may interfere
+ * with processes that are still running (though they will exit soon) after
+ * the startup process exits.
+ */
+void
+ShutdownRecoveryTransactionEnvironment(void)
+{
+ /*
+ * Do nothing if RecoveryLockLists is NULL, which means that transaction
+ * tracking has not yet been initialized or has already been shut down.
+ * This prevents transaction tracking from being shut down unexpectedly
+ * more than once.
+ */
+ if (RecoveryLockLists == NULL)
+ return;
+
+ /* Mark all tracked in-progress transactions as finished. */
+ ExpireAllKnownAssignedTransactionIds();
+
+ /* Release all locks the tracked transactions were holding */
+ StandbyReleaseAllLocks();
+
+ /* Destroy the hash table of locks. */
+ hash_destroy(RecoveryLockLists);
+ RecoveryLockLists = NULL;
+
+ /* Cleanup our VirtualTransaction */
+ VirtualXactLockTableCleanup();
+}
+
+
+/*
+ * -----------------------------------------------------
+ * Standby wait timers and backend cancel logic
+ * -----------------------------------------------------
+ */
+
+/*
+ * Determine the cutoff time at which we want to start canceling conflicting
+ * transactions. Returns zero (a time safely in the past) if we are willing
+ * to wait forever.
+ */
+static TimestampTz
+GetStandbyLimitTime(void)
+{
+ TimestampTz rtime;
+ bool fromStream;
+
+ /*
+ * The cutoff time is the last WAL data receipt time plus the appropriate
+ * delay variable. Delay of -1 means wait forever.
+ */
+ GetXLogReceiptTime(&rtime, &fromStream);
+ if (fromStream)
+ {
+ if (max_standby_streaming_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_streaming_delay);
+ }
+ else
+ {
+ if (max_standby_archive_delay < 0)
+ return 0; /* wait forever */
+ return TimestampTzPlusMilliseconds(rtime, max_standby_archive_delay);
+ }
+}
+
+#define STANDBY_INITIAL_WAIT_US 1000
+static int standbyWait_us = STANDBY_INITIAL_WAIT_US;
+
+/*
+ * Standby wait logic for ResolveRecoveryConflictWithVirtualXIDs.
+ * We wait here for a while, then return. If we decide we can't wait any
+ * longer, we return true; if we can wait some more, we return false.
+ */
+static bool
+WaitExceedsMaxStandbyDelay(uint32 wait_event_info)
+{
+ TimestampTz ltime;
+
+ CHECK_FOR_INTERRUPTS();
+
+ /* Are we past the limit time? */
+ ltime = GetStandbyLimitTime();
+ if (ltime && GetCurrentTimestamp() >= ltime)
+ return true;
+
+ /*
+ * Sleep a bit (this is essential to avoid busy-waiting).
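+ *
+ * (Editor's illustration, not part of the original comment: with
+ * STANDBY_INITIAL_WAIT_US of 1000, successive calls sleep roughly
+ * 1 ms, 2 ms, 4 ms, ... doubling each time until the 1 s cap applied
+ * below is reached.)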
+ */ + pgstat_report_wait_start(wait_event_info); + pg_usleep(standbyWait_us); + pgstat_report_wait_end(); + + /* + * Progressively increase the sleep times, but not to more than 1s, since + * pg_usleep isn't interruptible on some platforms. + */ + standbyWait_us *= 2; + if (standbyWait_us > 1000000) + standbyWait_us = 1000000; + + return false; +} + +/* + * Log the recovery conflict. + * + * wait_start is the timestamp when the caller started to wait. + * now is the timestamp when this function has been called. + * wait_list is the list of virtual transaction ids assigned to + * conflicting processes. still_waiting indicates whether + * the startup process is still waiting for the recovery conflict + * to be resolved or not. + */ +void +LogRecoveryConflict(ProcSignalReason reason, TimestampTz wait_start, + TimestampTz now, VirtualTransactionId *wait_list, + bool still_waiting) +{ + long secs; + int usecs; + long msecs; + StringInfoData buf; + int nprocs = 0; + + /* + * There must be no conflicting processes when the recovery conflict has + * already been resolved. + */ + Assert(still_waiting || wait_list == NULL); + + TimestampDifference(wait_start, now, &secs, &usecs); + msecs = secs * 1000 + usecs / 1000; + usecs = usecs % 1000; + + if (wait_list) + { + VirtualTransactionId *vxids; + + /* Construct a string of list of the conflicting processes */ + vxids = wait_list; + while (VirtualTransactionIdIsValid(*vxids)) + { + PGPROC *proc = BackendIdGetProc(vxids->backendId); + + /* proc can be NULL if the target backend is not active */ + if (proc) + { + if (nprocs == 0) + { + initStringInfo(&buf); + appendStringInfo(&buf, "%d", proc->pid); + } + else + appendStringInfo(&buf, ", %d", proc->pid); + + nprocs++; + } + + vxids++; + } + } + + /* + * If wait_list is specified, report the list of PIDs of active + * conflicting backends in a detail message. Note that if all the backends + * in the list are not active, no detail message is logged. + */ + if (still_waiting) + { + ereport(LOG, + errmsg("recovery still waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason)), + nprocs > 0 ? errdetail_log_plural("Conflicting process: %s.", + "Conflicting processes: %s.", + nprocs, buf.data) : 0); + } + else + { + ereport(LOG, + errmsg("recovery finished waiting after %ld.%03d ms: %s", + msecs, usecs, get_recovery_conflict_desc(reason))); + } + + if (nprocs > 0) + pfree(buf.data); +} + +/* + * This is the main executioner for any query backend that conflicts with + * recovery processing. Judgement has already been passed on it within + * a specific rmgr. Here we just issue the orders to the procs. The procs + * then throw the required error as instructed. + * + * If report_waiting is true, "waiting" is reported in PS display and the + * wait for recovery conflict is reported in the log, if necessary. If + * the caller is responsible for reporting them, report_waiting should be + * false. Otherwise, both the caller and this function report the same + * thing unexpectedly. + */ +static void +ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist, + ProcSignalReason reason, uint32 wait_event_info, + bool report_waiting) +{ + TimestampTz waitStart = 0; + char *new_status = NULL; + bool logged_recovery_conflict = false; + + /* Fast exit, to avoid a kernel call if there's no work to be done. 
*/ + if (!VirtualTransactionIdIsValid(*waitlist)) + return; + + /* Set the wait start timestamp for reporting */ + if (report_waiting && (log_recovery_conflict_waits || update_process_title)) + waitStart = GetCurrentTimestamp(); + + while (VirtualTransactionIdIsValid(*waitlist)) + { + /* reset standbyWait_us for each xact we wait for */ + standbyWait_us = STANDBY_INITIAL_WAIT_US; + + /* wait until the virtual xid is gone */ + while (!VirtualXactLock(*waitlist, false)) + { + /* Is it time to kill it? */ + if (WaitExceedsMaxStandbyDelay(wait_event_info)) + { + pid_t pid; + + /* + * Now find out who to throw out of the balloon. + */ + Assert(VirtualTransactionIdIsValid(*waitlist)); + pid = CancelVirtualTransaction(*waitlist, reason); + + /* + * Wait a little bit for it to die so that we avoid flooding + * an unresponsive backend when system is heavily loaded. + */ + if (pid != 0) + pg_usleep(5000L); + } + + if (waitStart != 0 && (!logged_recovery_conflict || new_status == NULL)) + { + TimestampTz now = 0; + bool maybe_log_conflict; + bool maybe_update_title; + + maybe_log_conflict = (log_recovery_conflict_waits && !logged_recovery_conflict); + maybe_update_title = (update_process_title && new_status == NULL); + + /* Get the current timestamp if not report yet */ + if (maybe_log_conflict || maybe_update_title) + now = GetCurrentTimestamp(); + + /* + * Report via ps if we have been waiting for more than 500 + * msec (should that be configurable?) + */ + if (maybe_update_title && + TimestampDifferenceExceeds(waitStart, now, 500)) + { + const char *old_status; + int len; + + old_status = get_ps_display(&len); + new_status = (char *) palloc(len + 8 + 1); + memcpy(new_status, old_status, len); + strcpy(new_status + len, " waiting"); + set_ps_display(new_status); + new_status[len] = '\0'; /* truncate off " waiting" */ + } + + /* + * Emit the log message if the startup process is waiting + * longer than deadlock_timeout for recovery conflict. + */ + if (maybe_log_conflict && + TimestampDifferenceExceeds(waitStart, now, DeadlockTimeout)) + { + LogRecoveryConflict(reason, waitStart, now, waitlist, true); + logged_recovery_conflict = true; + } + } + } + + /* The virtual transaction is gone now, wait for the next one */ + waitlist++; + } + + /* + * Emit the log message if recovery conflict was resolved but the startup + * process waited longer than deadlock_timeout for it. + */ + if (logged_recovery_conflict) + LogRecoveryConflict(reason, waitStart, GetCurrentTimestamp(), + NULL, false); + + /* Reset ps display if we changed it */ + if (new_status) + { + set_ps_display(new_status); + pfree(new_status); + } +} + +void +ResolveRecoveryConflictWithSnapshot(TransactionId latestRemovedXid, RelFileNode node) +{ + VirtualTransactionId *backends; + + /* + * If we get passed InvalidTransactionId then we do nothing (no conflict). + * + * This can happen when replaying already-applied WAL records after a + * standby crash or restart, or when replaying an XLOG_HEAP2_VISIBLE + * record that marks as frozen a page which was already all-visible. It's + * also quite common with records generated during index deletion + * (original execution of the deletion can reason that a recovery conflict + * which is sufficient for the deletion operation must take place before + * replay of the deletion record itself). 
+ */ + if (!TransactionIdIsValid(latestRemovedXid)) + return; + + backends = GetConflictingVirtualXIDs(latestRemovedXid, + node.dbNode); + + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_SNAPSHOT, + WAIT_EVENT_RECOVERY_CONFLICT_SNAPSHOT, + true); +} + +/* + * Variant of ResolveRecoveryConflictWithSnapshot that works with + * FullTransactionId values + */ +void +ResolveRecoveryConflictWithSnapshotFullXid(FullTransactionId latestRemovedFullXid, + RelFileNode node) +{ + /* + * ResolveRecoveryConflictWithSnapshot operates on 32-bit TransactionIds, + * so truncate the logged FullTransactionId. If the logged value is very + * old, so that XID wrap-around already happened on it, there can't be any + * snapshots that still see it. + */ + FullTransactionId nextXid = ReadNextFullTransactionId(); + uint64 diff; + + diff = U64FromFullTransactionId(nextXid) - + U64FromFullTransactionId(latestRemovedFullXid); + if (diff < MaxTransactionId / 2) + { + TransactionId latestRemovedXid; + + latestRemovedXid = XidFromFullTransactionId(latestRemovedFullXid); + ResolveRecoveryConflictWithSnapshot(latestRemovedXid, node); + } +} + +void +ResolveRecoveryConflictWithTablespace(Oid tsid) +{ + VirtualTransactionId *temp_file_users; + + /* + * Standby users may be currently using this tablespace for their + * temporary files. We only care about current users because + * temp_tablespace parameter will just ignore tablespaces that no longer + * exist. + * + * Ask everybody to cancel their queries immediately so we can ensure no + * temp files remain and we can remove the tablespace. Nuke the entire + * site from orbit, it's the only way to be sure. + * + * XXX: We could work out the pids of active backends using this + * tablespace by examining the temp filenames in the directory. We would + * then convert the pids into VirtualXIDs before attempting to cancel + * them. + * + * We don't wait for commit because drop tablespace is non-transactional. + */ + temp_file_users = GetConflictingVirtualXIDs(InvalidTransactionId, + InvalidOid); + ResolveRecoveryConflictWithVirtualXIDs(temp_file_users, + PROCSIG_RECOVERY_CONFLICT_TABLESPACE, + WAIT_EVENT_RECOVERY_CONFLICT_TABLESPACE, + true); +} + +void +ResolveRecoveryConflictWithDatabase(Oid dbid) +{ + /* + * We don't do ResolveRecoveryConflictWithVirtualXIDs() here since that + * only waits for transactions and completely idle sessions would block + * us. This is rare enough that we do this as simply as possible: no wait, + * just force them off immediately. + * + * No locking is required here because we already acquired + * AccessExclusiveLock. Anybody trying to connect while we do this will + * block during InitPostgres() and then disconnect when they see the + * database has been removed. + */ + while (CountDBBackends(dbid) > 0) + { + CancelDBBackends(dbid, PROCSIG_RECOVERY_CONFLICT_DATABASE, true); + + /* + * Wait awhile for them to die so that we avoid flooding an + * unresponsive backend when system is heavily loaded. + */ + pg_usleep(10000); + } +} + +/* + * ResolveRecoveryConflictWithLock is called from ProcSleep() + * to resolve conflicts with other backends holding relation locks. + * + * The WaitLatch sleep normally done in ProcSleep() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by canceling to all backends holding a conflicting + * lock. 
As we are already queued to be granted the lock, no new lock + * requests conflicting with ours will be granted in the meantime. + * + * We also must check for deadlocks involving the Startup process and + * hot-standby backend processes. If deadlock_timeout is reached in + * this function, all the backends holding the conflicting locks are + * requested to check themselves for deadlocks. + * + * logging_conflict should be true if the recovery conflict has not been + * logged yet even though logging is enabled. After deadlock_timeout is + * reached and the request for deadlock check is sent, we wait again to + * be signaled by the release of the lock if logging_conflict is false. + * Otherwise we return without waiting again so that the caller can report + * the recovery conflict. In this case, then, this function is called again + * with logging_conflict=false (because the recovery conflict has already + * been logged) and we will wait again for the lock to be released. + */ +void +ResolveRecoveryConflictWithLock(LOCKTAG locktag, bool logging_conflict) +{ + TimestampTz ltime; + TimestampTz now; + + Assert(InHotStandby); + + ltime = GetStandbyLimitTime(); + now = GetCurrentTimestamp(); + + /* + * Update waitStart if first time through after the startup process + * started waiting for the lock. It should not be updated every time + * ResolveRecoveryConflictWithLock() is called during the wait. + * + * Use the current time obtained for comparison with ltime as waitStart + * (i.e., the time when this process started waiting for the lock). Since + * getting the current time newly can cause overhead, we reuse the + * already-obtained time to avoid that overhead. + * + * Note that waitStart is updated without holding the lock table's + * partition lock, to avoid the overhead by additional lock acquisition. + * This can cause "waitstart" in pg_locks to become NULL for a very short + * period of time after the wait started even though "granted" is false. + * This is OK in practice because we can assume that users are likely to + * look at "waitstart" when waiting for the lock for a long time. + */ + if (pg_atomic_read_u64(&MyProc->waitStart) == 0) + pg_atomic_write_u64(&MyProc->waitStart, now); + + if (now >= ltime && ltime != 0) + { + /* + * We're already behind, so clear a path as quickly as possible. + */ + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* + * Prevent ResolveRecoveryConflictWithVirtualXIDs() from reporting + * "waiting" in PS display by disabling its argument report_waiting + * because the caller, WaitOnLock(), has already reported that. 
+ */ + ResolveRecoveryConflictWithVirtualXIDs(backends, + PROCSIG_RECOVERY_CONFLICT_LOCK, + PG_WAIT_LOCK | locktag.locktag_type, + false); + } + else + { + /* + * Wait (or wait again) until ltime, and check for deadlocks as well + * if we will be waiting longer than deadlock_timeout + */ + EnableTimeoutParams timeouts[2]; + int cnt = 0; + + if (ltime != 0) + { + got_standby_lock_timeout = false; + timeouts[cnt].id = STANDBY_LOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AT; + timeouts[cnt].fin_time = ltime; + cnt++; + } + + got_standby_deadlock_timeout = false; + timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT; + timeouts[cnt].type = TMPARAM_AFTER; + timeouts[cnt].delay_ms = DeadlockTimeout; + cnt++; + + enable_timeouts(timeouts, cnt); + } + + /* Wait to be signaled by the release of the Relation Lock */ + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + + /* + * Exit if ltime is reached. Then all the backends holding conflicting + * locks will be canceled in the next ResolveRecoveryConflictWithLock() + * call. + */ + if (got_standby_lock_timeout) + goto cleanup; + + if (got_standby_deadlock_timeout) + { + VirtualTransactionId *backends; + + backends = GetLockConflicts(&locktag, AccessExclusiveLock, NULL); + + /* Quick exit if there's no work to be done */ + if (!VirtualTransactionIdIsValid(*backends)) + goto cleanup; + + /* + * Send signals to all the backends holding the conflicting locks, to + * ask them to check themselves for deadlocks. + */ + while (VirtualTransactionIdIsValid(*backends)) + { + SignalVirtualTransaction(*backends, + PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK, + false); + backends++; + } + + /* + * Exit if the recovery conflict has not been logged yet even though + * logging is enabled, so that the caller can log that. Then + * RecoveryConflictWithLock() is called again and we will wait again + * for the lock to be released. + */ + if (logging_conflict) + goto cleanup; + + /* + * Wait again here to be signaled by the release of the Relation Lock, + * to prevent the subsequent RecoveryConflictWithLock() from causing + * deadlock_timeout and sending a request for deadlocks check again. + * Otherwise the request continues to be sent every deadlock_timeout + * until the relation locks are released or ltime is reached. + */ + got_standby_deadlock_timeout = false; + ProcWaitForSignal(PG_WAIT_LOCK | locktag.locktag_type); + } + +cleanup: + + /* + * Clear any timeout requests established above. We assume here that the + * Startup process doesn't have any other outstanding timeouts than those + * used by this function. If that stops being true, we could cancel the + * timeouts individually, but that'd be slower. + */ + disable_all_timeouts(false); + got_standby_lock_timeout = false; + got_standby_deadlock_timeout = false; +} + +/* + * ResolveRecoveryConflictWithBufferPin is called from LockBufferForCleanup() + * to resolve conflicts with other backends holding buffer pins. + * + * The ProcWaitForSignal() sleep normally done in LockBufferForCleanup() + * (when not InHotStandby) is performed here, for code clarity. + * + * We either resolve conflicts immediately or set a timeout to wake us at + * the limit of our patience. + * + * Resolve conflicts by sending a PROCSIG signal to all backends to check if + * they hold one of the buffer pins that is blocking Startup process. If so, + * those backends will take an appropriate error action, ERROR or FATAL. + * + * We also must check for deadlocks. 
Deadlocks occur because if queries
+ * wait on a lock, that must be behind an AccessExclusiveLock, which can only
+ * be cleared if the Startup process replays a transaction completion record.
+ * If the Startup process is also waiting, then that is a deadlock. The
+ * deadlock can occur if the query is waiting and then the Startup process
+ * sleeps, or if the Startup process is sleeping and the query waits on a
+ * lock. We protect against only the former sequence here; the latter
+ * sequence is checked before the query sleeps, in
+ * CheckRecoveryConflictDeadlock().
+ *
+ * Deadlocks are extremely rare, and relatively expensive to check for,
+ * so we don't do a deadlock check right away ... only if we have had to wait
+ * at least deadlock_timeout.
+ */
+void
+ResolveRecoveryConflictWithBufferPin(void)
+{
+ TimestampTz ltime;
+
+ Assert(InHotStandby);
+
+ ltime = GetStandbyLimitTime();
+
+ if (GetCurrentTimestamp() >= ltime && ltime != 0)
+ {
+ /*
+ * We're already behind, so clear a path as quickly as possible.
+ */
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ }
+ else
+ {
+ /*
+ * Wake up at ltime, and check for deadlocks as well if we will be
+ * waiting longer than deadlock_timeout.
+ */
+ EnableTimeoutParams timeouts[2];
+ int cnt = 0;
+
+ if (ltime != 0)
+ {
+ timeouts[cnt].id = STANDBY_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AT;
+ timeouts[cnt].fin_time = ltime;
+ cnt++;
+ }
+
+ got_standby_deadlock_timeout = false;
+ timeouts[cnt].id = STANDBY_DEADLOCK_TIMEOUT;
+ timeouts[cnt].type = TMPARAM_AFTER;
+ timeouts[cnt].delay_ms = DeadlockTimeout;
+ cnt++;
+
+ enable_timeouts(timeouts, cnt);
+ }
+
+ /*
+ * Wait to be signaled by UnpinBuffer() or for the wait to be interrupted
+ * by one of the timeouts established above.
+ *
+ * We assume that only UnpinBuffer() and the timeout requests established
+ * above can wake us up here. WakeupRecovery(), called by the walreceiver,
+ * the SIGHUP signal handler, etc., cannot do that because it uses a
+ * different latch from the one ProcWaitForSignal() waits on.
+ */
+ ProcWaitForSignal(PG_WAIT_BUFFER_PIN);
+
+ if (got_standby_delay_timeout)
+ SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
+ else if (got_standby_deadlock_timeout)
+ {
+ /*
+ * Send out a request for hot-standby backends to check themselves for
+ * deadlocks.
+ *
+ * XXX The subsequent ResolveRecoveryConflictWithBufferPin() will wait
+ * to be signaled by UnpinBuffer() again and send a request for a
+ * deadlock check if deadlock_timeout elapses. This causes the
+ * request to continue to be sent every deadlock_timeout until the
+ * buffer is unpinned or ltime is reached. This would increase the
+ * workload in the startup process and backends. In practice it may
+ * not be so harmful because the period during which the buffer stays
+ * pinned is usually not very long. But perhaps we should fix this?
+ */
+ SendRecoveryConflictWithBufferPin(
+ PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK);
+ }
+
+ /*
+ * Clear any timeout requests established above. We assume here that the
+ * Startup process doesn't have any other timeouts than what this function
+ * uses. If that stops being true, we could cancel the timeouts
+ * individually, but that'd be slower.
+ */ + disable_all_timeouts(false); + got_standby_delay_timeout = false; + got_standby_deadlock_timeout = false; +} + +static void +SendRecoveryConflictWithBufferPin(ProcSignalReason reason) +{ + Assert(reason == PROCSIG_RECOVERY_CONFLICT_BUFFERPIN || + reason == PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK); + + /* + * We send signal to all backends to ask them if they are holding the + * buffer pin which is delaying the Startup process. We must not set the + * conflict flag yet, since most backends will be innocent. Let the + * SIGUSR1 handling in each backend decide their own fate. + */ + CancelDBBackends(InvalidOid, reason, false); +} + +/* + * In Hot Standby perform early deadlock detection. We abort the lock + * wait if we are about to sleep while holding the buffer pin that Startup + * process is waiting for. + * + * Note: this code is pessimistic, because there is no way for it to + * determine whether an actual deadlock condition is present: the lock we + * need to wait for might be unrelated to any held by the Startup process. + * Sooner or later, this mechanism should get ripped out in favor of somehow + * accounting for buffer locks in DeadLockCheck(). However, errors here + * seem to be very low-probability in practice, so for now it's not worth + * the trouble. + */ +void +CheckRecoveryConflictDeadlock(void) +{ + Assert(!InRecovery); /* do not call in Startup process */ + + if (!HoldingBufferPinThatDelaysRecovery()) + return; + + /* + * Error message should match ProcessInterrupts() but we avoid calling + * that because we aren't handling an interrupt at this point. Note that + * we only cancel the current transaction here, so if we are in a + * subtransaction and the pin is held by a parent, then the Startup + * process will continue to wait even though we have avoided deadlock. + */ + ereport(ERROR, + (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), + errmsg("canceling statement due to conflict with recovery"), + errdetail("User transaction caused buffer deadlock with recovery."))); +} + + +/* -------------------------------- + * timeout handler routines + * -------------------------------- + */ + +/* + * StandbyDeadLockHandler() will be called if STANDBY_DEADLOCK_TIMEOUT is + * exceeded. + */ +void +StandbyDeadLockHandler(void) +{ + got_standby_deadlock_timeout = true; +} + +/* + * StandbyTimeoutHandler() will be called if STANDBY_TIMEOUT is exceeded. + */ +void +StandbyTimeoutHandler(void) +{ + got_standby_delay_timeout = true; +} + +/* + * StandbyLockTimeoutHandler() will be called if STANDBY_LOCK_TIMEOUT is exceeded. + */ +void +StandbyLockTimeoutHandler(void) +{ + got_standby_lock_timeout = true; +} + +/* + * ----------------------------------------------------- + * Locking in Recovery Mode + * ----------------------------------------------------- + * + * All locks are held by the Startup process using a single virtual + * transaction. This implementation is both simpler and in some senses, + * more correct. The locks held mean "some original transaction held + * this lock, so query access is not allowed at this time". So the Startup + * process is the proxy by which the original locks are implemented. + * + * We only keep track of AccessExclusiveLocks, which are only ever held by + * one transaction on one relation. + * + * We keep a hash table of lists of locks in local memory keyed by xid, + * RecoveryLockLists, so we can keep track of the various entries made by + * the Startup process's virtual xid in the shared lock table. 
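+ *
+ * (Editor's illustration, not part of the original comment: after replaying
+ * two AccessExclusiveLock records for transaction 1234 -- the OIDs here are
+ * made up -- the hash table conceptually holds one entry
+ *
+ *     xid = 1234, locks = [{1234, db 16384, rel 16385},
+ *                          {1234, db 16384, rel 16402}]
+ *
+ * i.e. a RecoveryLockListsEntry keyed by xid whose list contains one
+ * xl_standby_lock per lock taken on the primary.)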
+ * + * List elements use type xl_standby_lock, since the WAL record type exactly + * matches the information that we need to keep track of. + * + * We use session locks rather than normal locks so we don't need + * ResourceOwners. + */ + + +void +StandbyAcquireAccessExclusiveLock(TransactionId xid, Oid dbOid, Oid relOid) +{ + RecoveryLockListsEntry *entry; + xl_standby_lock *newlock; + LOCKTAG locktag; + bool found; + + /* Already processed? */ + if (!TransactionIdIsValid(xid) || + TransactionIdDidCommit(xid) || + TransactionIdDidAbort(xid)) + return; + + elog(trace_recovery(DEBUG4), + "adding recovery lock: db %u rel %u", dbOid, relOid); + + /* dbOid is InvalidOid when we are locking a shared relation. */ + Assert(OidIsValid(relOid)); + + /* Create a new list for this xid, if we don't have one already. */ + entry = hash_search(RecoveryLockLists, &xid, HASH_ENTER, &found); + if (!found) + { + entry->xid = xid; + entry->locks = NIL; + } + + newlock = palloc(sizeof(xl_standby_lock)); + newlock->xid = xid; + newlock->dbOid = dbOid; + newlock->relOid = relOid; + entry->locks = lappend(entry->locks, newlock); + + SET_LOCKTAG_RELATION(locktag, newlock->dbOid, newlock->relOid); + + (void) LockAcquire(&locktag, AccessExclusiveLock, true, false); +} + +static void +StandbyReleaseLockList(List *locks) +{ + ListCell *lc; + + foreach(lc, locks) + { + xl_standby_lock *lock = (xl_standby_lock *) lfirst(lc); + LOCKTAG locktag; + + elog(trace_recovery(DEBUG4), + "releasing recovery lock: xid %u db %u rel %u", + lock->xid, lock->dbOid, lock->relOid); + SET_LOCKTAG_RELATION(locktag, lock->dbOid, lock->relOid); + if (!LockRelease(&locktag, AccessExclusiveLock, true)) + { + elog(LOG, + "RecoveryLockLists contains entry for lock no longer recorded by lock manager: xid %u database %u relation %u", + lock->xid, lock->dbOid, lock->relOid); + Assert(false); + } + } + + list_free_deep(locks); +} + +static void +StandbyReleaseLocks(TransactionId xid) +{ + RecoveryLockListsEntry *entry; + + if (TransactionIdIsValid(xid)) + { + if ((entry = hash_search(RecoveryLockLists, &xid, HASH_FIND, NULL))) + { + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } + } + else + StandbyReleaseAllLocks(); +} + +/* + * Release locks for a transaction tree, starting at xid down, from + * RecoveryLockLists. + * + * Called during WAL replay of COMMIT/ROLLBACK when in hot standby mode, + * to remove any AccessExclusiveLocks requested by a transaction. + */ +void +StandbyReleaseLockTree(TransactionId xid, int nsubxids, TransactionId *subxids) +{ + int i; + + StandbyReleaseLocks(xid); + + for (i = 0; i < nsubxids; i++) + StandbyReleaseLocks(subxids[i]); +} + +/* + * Called at end of recovery and when we see a shutdown checkpoint. + */ +void +StandbyReleaseAllLocks(void) +{ + HASH_SEQ_STATUS status; + RecoveryLockListsEntry *entry; + + elog(trace_recovery(DEBUG2), "release all standby locks"); + + hash_seq_init(&status, RecoveryLockLists); + while ((entry = hash_seq_search(&status))) + { + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } +} + +/* + * StandbyReleaseOldLocks + * Release standby locks held by top-level XIDs that aren't running, + * as long as they're not prepared transactions. 
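+ *
+ * (Editor's note, not part of the original comment: e.g. with oldxid = 1000,
+ * entries for xids 900 and 950 are released unless they belong to prepared
+ * transactions, while entries for xids 1000 and 1100 are kept.)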
+ */ +void +StandbyReleaseOldLocks(TransactionId oldxid) +{ + HASH_SEQ_STATUS status; + RecoveryLockListsEntry *entry; + + hash_seq_init(&status, RecoveryLockLists); + while ((entry = hash_seq_search(&status))) + { + Assert(TransactionIdIsValid(entry->xid)); + + /* Skip if prepared transaction. */ + if (StandbyTransactionIdIsPrepared(entry->xid)) + continue; + + /* Skip if >= oldxid. */ + if (!TransactionIdPrecedes(entry->xid, oldxid)) + continue; + + /* Remove all locks and hash table entry. */ + StandbyReleaseLockList(entry->locks); + hash_search(RecoveryLockLists, entry, HASH_REMOVE, NULL); + } +} + +/* + * -------------------------------------------------------------------- + * Recovery handling for Rmgr RM_STANDBY_ID + * + * These record types will only be created if XLogStandbyInfoActive() + * -------------------------------------------------------------------- + */ + +void +standby_redo(XLogReaderState *record) +{ + uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + + /* Backup blocks are not used in standby records */ + Assert(!XLogRecHasAnyBlockRefs(record)); + + /* Do nothing if we're not in hot standby mode */ + if (standbyState == STANDBY_DISABLED) + return; + + if (info == XLOG_STANDBY_LOCK) + { + xl_standby_locks *xlrec = (xl_standby_locks *) XLogRecGetData(record); + int i; + + for (i = 0; i < xlrec->nlocks; i++) + StandbyAcquireAccessExclusiveLock(xlrec->locks[i].xid, + xlrec->locks[i].dbOid, + xlrec->locks[i].relOid); + } + else if (info == XLOG_RUNNING_XACTS) + { + xl_running_xacts *xlrec = (xl_running_xacts *) XLogRecGetData(record); + RunningTransactionsData running; + + running.xcnt = xlrec->xcnt; + running.subxcnt = xlrec->subxcnt; + running.subxid_overflow = xlrec->subxid_overflow; + running.nextXid = xlrec->nextXid; + running.latestCompletedXid = xlrec->latestCompletedXid; + running.oldestRunningXid = xlrec->oldestRunningXid; + running.xids = xlrec->xids; + + ProcArrayApplyRecoveryInfo(&running); + } + else if (info == XLOG_INVALIDATIONS) + { + xl_invalidations *xlrec = (xl_invalidations *) XLogRecGetData(record); + + ProcessCommittedInvalidationMessages(xlrec->msgs, + xlrec->nmsgs, + xlrec->relcacheInitFileInval, + xlrec->dbId, + xlrec->tsId); + } + else + elog(PANIC, "standby_redo: unknown op code %u", info); +} + +/* + * Log details of the current snapshot to WAL. This allows the snapshot state + * to be reconstructed on the standby and for logical decoding. + * + * This is used for Hot Standby as follows: + * + * We can move directly to STANDBY_SNAPSHOT_READY at startup if we + * start from a shutdown checkpoint because we know nothing was running + * at that time and our recovery snapshot is known empty. In the more + * typical case of an online checkpoint we need to jump through a few + * hoops to get a correct recovery snapshot and this requires a two or + * sometimes a three stage process. + * + * The initial snapshot must contain all running xids and all current + * AccessExclusiveLocks at a point in time on the standby. Assembling + * that information while the server is running requires many and + * various LWLocks, so we choose to derive that information piece by + * piece and then re-assemble that info on the standby. When that + * information is fully assembled we move to STANDBY_SNAPSHOT_READY. + * + * Since locking on the primary when we derive the information is not + * strict, we note that there is a time window between the derivation and + * writing to WAL of the derived information. 
That allows race conditions + * that we must resolve, since xids and locks may enter or leave the + * snapshot during that window. This creates the issue that an xid or + * lock may start *after* the snapshot has been derived yet *before* the + * snapshot is logged in the running xacts WAL record. We resolve this by + * starting to accumulate changes at a point just prior to when we derive + * the snapshot on the primary, then ignore duplicates when we later apply + * the snapshot from the running xacts record. This is implemented during + * CreateCheckpoint() where we use the logical checkpoint location as + * our starting point and then write the running xacts record immediately + * before writing the main checkpoint WAL record. Since we always start + * up from a checkpoint and are immediately at our starting point, we + * unconditionally move to STANDBY_INITIALIZED. After this point we + * must do 4 things: + * * move shared nextXid forwards as we see new xids + * * extend the clog and subtrans with each new xid + * * keep track of uncommitted known assigned xids + * * keep track of uncommitted AccessExclusiveLocks + * + * When we see a commit/abort we must remove known assigned xids and locks + * from the completing transaction. Attempted removals that cannot locate + * an entry are expected and must not cause an error when we are in state + * STANDBY_INITIALIZED. This is implemented in StandbyReleaseLocks() and + * KnownAssignedXidsRemove(). + * + * Later, when we apply the running xact data we must be careful to ignore + * transactions already committed, since those commits raced ahead when + * making WAL entries. + * + * The loose timing also means that locks may be recorded that have a + * zero xid, since xids are removed from procs before locks are removed. + * So we must prune the lock list down to ensure we hold locks only for + * currently running xids, performed by StandbyReleaseOldLocks(). + * Zero xids should no longer be possible, but we may be replaying WAL + * from a time when they were possible. + * + * For logical decoding only the running xacts information is needed; + * there's no need to look at the locking information, but it's logged anyway, + * as there's no independent knob to just enable logical decoding. For + * details of how this is used, check snapbuild.c's introductory comment. + * + * + * Returns the RecPtr of the last inserted record. + */ +XLogRecPtr +LogStandbySnapshot(void) +{ + XLogRecPtr recptr; + RunningTransactions running; + xl_standby_lock *locks; + int nlocks; + + Assert(XLogStandbyInfoActive()); + + /* + * Get details of any AccessExclusiveLocks being held at the moment. + */ + locks = GetRunningTransactionLocks(&nlocks); + if (nlocks > 0) + LogAccessExclusiveLocks(nlocks, locks); + pfree(locks); + + /* + * Log details of all in-progress transactions. This should be the last + * record we write, because standby will open up when it sees this. + */ + running = GetRunningTransactionData(); + + /* + * GetRunningTransactionData() acquired ProcArrayLock, we must release it. + * For Hot Standby this can be done before inserting the WAL record + * because ProcArrayApplyRecoveryInfo() rechecks the commit status using + * the clog. For logical decoding, though, the lock can't be released + * early because the clog might be "in the future" from the POV of the + * historic snapshot. 
This would allow for situations where we're waiting + * for the end of a transaction listed in the xl_running_xacts record + * which, according to the WAL, has committed before the xl_running_xacts + * record. Fortunately this routine isn't executed frequently, and it's + * only a shared lock. + */ + if (wal_level < WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + recptr = LogCurrentRunningXacts(running); + + /* Release lock if we kept it longer ... */ + if (wal_level >= WAL_LEVEL_LOGICAL) + LWLockRelease(ProcArrayLock); + + /* GetRunningTransactionData() acquired XidGenLock, we must release it */ + LWLockRelease(XidGenLock); + + return recptr; +} + +/* + * Record an enhanced snapshot of running transactions into WAL. + * + * The definitions of RunningTransactionsData and xl_xact_running_xacts are + * similar. We keep them separate because xl_xact_running_xacts is a + * contiguous chunk of memory and never exists fully until it is assembled in + * WAL. The inserted records are marked as not being important for durability, + * to avoid triggering superfluous checkpoint / archiving activity. + */ +static XLogRecPtr +LogCurrentRunningXacts(RunningTransactions CurrRunningXacts) +{ + xl_running_xacts xlrec; + XLogRecPtr recptr; + + xlrec.xcnt = CurrRunningXacts->xcnt; + xlrec.subxcnt = CurrRunningXacts->subxcnt; + xlrec.subxid_overflow = CurrRunningXacts->subxid_overflow; + xlrec.nextXid = CurrRunningXacts->nextXid; + xlrec.oldestRunningXid = CurrRunningXacts->oldestRunningXid; + xlrec.latestCompletedXid = CurrRunningXacts->latestCompletedXid; + + /* Header */ + XLogBeginInsert(); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + XLogRegisterData((char *) (&xlrec), MinSizeOfXactRunningXacts); + + /* array of TransactionIds */ + if (xlrec.xcnt > 0) + XLogRegisterData((char *) CurrRunningXacts->xids, + (xlrec.xcnt + xlrec.subxcnt) * sizeof(TransactionId)); + + recptr = XLogInsert(RM_STANDBY_ID, XLOG_RUNNING_XACTS); + + if (CurrRunningXacts->subxid_overflow) + elog(trace_recovery(DEBUG2), + "snapshot of %u running transactions overflowed (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + else + elog(trace_recovery(DEBUG2), + "snapshot of %u+%u running transaction ids (lsn %X/%X oldest xid %u latest complete %u next xid %u)", + CurrRunningXacts->xcnt, CurrRunningXacts->subxcnt, + LSN_FORMAT_ARGS(recptr), + CurrRunningXacts->oldestRunningXid, + CurrRunningXacts->latestCompletedXid, + CurrRunningXacts->nextXid); + + /* + * Ensure running_xacts information is synced to disk not too far in the + * future. We don't want to stall anything though (i.e. use XLogFlush()), + * so we let the wal writer do it during normal operation. + * XLogSetAsyncXactLSN() conveniently will mark the LSN as to-be-synced + * and nudge the WALWriter into action if sleeping. Check + * XLogBackgroundFlush() for details why a record might not be flushed + * without it. + */ + XLogSetAsyncXactLSN(recptr); + + return recptr; +} + +/* + * Wholesale logging of AccessExclusiveLocks. Other lock types need not be + * logged, as described in backend/storage/lmgr/README. 
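+ *
+ * (Editor's illustration, not part of the original comment: a record for two
+ * locks is laid out as an xl_standby_locks header with nlocks = 2 followed
+ * by two xl_standby_lock structs, which is exactly what the two
+ * XLogRegisterData() calls below register.)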
+ */ +static void +LogAccessExclusiveLocks(int nlocks, xl_standby_lock *locks) +{ + xl_standby_locks xlrec; + + xlrec.nlocks = nlocks; + + XLogBeginInsert(); + XLogRegisterData((char *) &xlrec, offsetof(xl_standby_locks, locks)); + XLogRegisterData((char *) locks, nlocks * sizeof(xl_standby_lock)); + XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT); + + (void) XLogInsert(RM_STANDBY_ID, XLOG_STANDBY_LOCK); +} + +/* + * Individual logging of AccessExclusiveLocks for use during LockAcquire() + */ +void +LogAccessExclusiveLock(Oid dbOid, Oid relOid) +{ + xl_standby_lock xlrec; + + xlrec.xid = GetCurrentTransactionId(); + + xlrec.dbOid = dbOid; + xlrec.relOid = relOid; + + LogAccessExclusiveLocks(1, &xlrec); + MyXactFlags |= XACT_FLAGS_ACQUIREDACCESSEXCLUSIVELOCK; +} + +/* + * Prepare to log an AccessExclusiveLock, for use during LockAcquire() + */ +void +LogAccessExclusiveLockPrepare(void) +{ + /* + * Ensure that a TransactionId has been assigned to this transaction, for + * two reasons, both related to lock release on the standby. First, we + * must assign an xid so that RecordTransactionCommit() and + * RecordTransactionAbort() do not optimise away the transaction + * completion record which recovery relies upon to release locks. It's a + * hack, but for a corner case not worth adding code for into the main + * commit path. Second, we must assign an xid before the lock is recorded + * in shared memory, otherwise a concurrently executing + * GetRunningTransactionLocks() might see a lock associated with an + * InvalidTransactionId which we later assert cannot happen. + */ + (void) GetCurrentTransactionId(); +} + +/* + * Emit WAL for invalidations. This currently is only used for commits without + * an xid but which contain invalidations. + */ +void +LogStandbyInvalidations(int nmsgs, SharedInvalidationMessage *msgs, + bool relcacheInitFileInval) +{ + xl_invalidations xlrec; + + /* prepare record */ + memset(&xlrec, 0, sizeof(xlrec)); + xlrec.dbId = MyDatabaseId; + xlrec.tsId = MyDatabaseTableSpace; + xlrec.relcacheInitFileInval = relcacheInitFileInval; + xlrec.nmsgs = nmsgs; + + /* perform insertion */ + XLogBeginInsert(); + XLogRegisterData((char *) (&xlrec), MinSizeOfInvalidations); + XLogRegisterData((char *) msgs, + nmsgs * sizeof(SharedInvalidationMessage)); + XLogInsert(RM_STANDBY_ID, XLOG_INVALIDATIONS); +} + +/* Return the description of recovery conflict */ +static const char * +get_recovery_conflict_desc(ProcSignalReason reason) +{ + const char *reasonDesc = _("unknown reason"); + + switch (reason) + { + case PROCSIG_RECOVERY_CONFLICT_BUFFERPIN: + reasonDesc = _("recovery conflict on buffer pin"); + break; + case PROCSIG_RECOVERY_CONFLICT_LOCK: + reasonDesc = _("recovery conflict on lock"); + break; + case PROCSIG_RECOVERY_CONFLICT_TABLESPACE: + reasonDesc = _("recovery conflict on tablespace"); + break; + case PROCSIG_RECOVERY_CONFLICT_SNAPSHOT: + reasonDesc = _("recovery conflict on snapshot"); + break; + case PROCSIG_RECOVERY_CONFLICT_STARTUP_DEADLOCK: + reasonDesc = _("recovery conflict on buffer deadlock"); + break; + case PROCSIG_RECOVERY_CONFLICT_DATABASE: + reasonDesc = _("recovery conflict on database"); + break; + default: + break; + } + + return reasonDesc; +} |