Diffstat (limited to 'src/backend/storage/lmgr/lwlock.c'):
 src/backend/storage/lmgr/lwlock.c | 1977 ++++++++++++++++++++++++++++++++++++
 1 file changed, 1977 insertions(+), 0 deletions(-)
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..07eb6f6
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1977 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ * Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures. Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object). There are few other frills. User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value. Acquiring the lock with LWLockAcquire
+ * does not modify the variable; it keeps the value it was set to when the
+ * lock was last released, and it can be updated without releasing the lock
+ * by calling LWLockUpdateVar. LWLockWaitForVar
+ * waits for the variable to be updated, or until the lock is free. When
+ * releasing the lock with LWLockReleaseClearVar() the value can be set to an
+ * appropriate value for a free lock. The meaning of the variable is up to
+ * the caller; the lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ * src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock. Unfortunately the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently. Often we were spinning in the (obviously exclusive) spinlock,
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters and to use atomic
+ * operations to acquire the lock. That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable. For exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE), for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement. If the new value is zero
+ * (we get that atomically), we know we can/have to release waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: We try to lock using the atomic operations and
+ * notice that we have to wait. Unfortunately by the time we have finished
+ * queuing, the former locker very well might have already finished its
+ * work. That's problematic because we're now stuck waiting inside the OS.
+ *
+ * To mitigate those races we use a multi-phase locking protocol:
+ * Phase 1: Try to do it atomically, if we succeed, nice
+ * Phase 2: Add ourselves to the waitqueue of the lock
+ * Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ * the queue
+ * Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem from above, as nobody can release the
+ * lock too quickly before we're queued: after Phase 2 we are already on the
+ * wait queue.
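+ *
+ * In pseudo-code, and glossing over interrupt holdoff and semaphore
+ * bookkeeping (the real control flow is in LWLockAcquire() below), the
+ * acquire path looks roughly like this:
+ *
+ *		for (;;)
+ *		{
+ *			if (!LWLockAttemptLock(lock, mode))		-- Phase 1: got it
+ *				break;
+ *			LWLockQueueSelf(lock, mode);			-- Phase 2
+ *			if (!LWLockAttemptLock(lock, mode))		-- Phase 3: got it
+ *			{
+ *				LWLockDequeueSelf(lock);
+ *				break;
+ *			}
+ *			sleep on our semaphore until woken		-- Phase 4, then retry
+ *		}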
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS ((uint32) 1 << 30)
+#define LW_FLAG_RELEASE_OK ((uint32) 1 << 29)
+#define LW_FLAG_LOCKED ((uint32) 1 << 28)
+
+#define LW_VAL_EXCLUSIVE ((uint32) 1 << 24)
+#define LW_VAL_SHARED 1
+
+#define LW_LOCK_MASK ((uint32) ((1 << 25)-1))
+/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */
+#define LW_SHARED_MASK ((uint32) ((1 << 24)-1))
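+
+/*
+ * For orientation, a rough sketch of how the constants above carve up the
+ * 32-bit state word:
+ *
+ *	bit 30		LW_FLAG_HAS_WAITERS		somebody is on the wait queue
+ *	bit 29		LW_FLAG_RELEASE_OK		releasing may wake up waiters
+ *	bit 28		LW_FLAG_LOCKED			the wait list itself is locked
+ *	bit 24		LW_VAL_EXCLUSIVE		lock is held exclusively
+ *	bits 0..23	shared-holder count		incremented by LW_VAL_SHARED
+ *
+ * LW_LOCK_MASK (bits 0..24) thus covers both the exclusive marker and the
+ * shared count; LW_SHARED_MASK covers only the shared count.
+ */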
+
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche. The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche. The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
+ */
+extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */
+
+static const char *const BuiltinTrancheNames[] = {
+ /* LWTRANCHE_XACT_BUFFER: */
+ "XactBuffer",
+ /* LWTRANCHE_COMMITTS_BUFFER: */
+ "CommitTSBuffer",
+ /* LWTRANCHE_SUBTRANS_BUFFER: */
+ "SubtransBuffer",
+ /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */
+ "MultiXactOffsetBuffer",
+ /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */
+ "MultiXactMemberBuffer",
+ /* LWTRANCHE_NOTIFY_BUFFER: */
+ "NotifyBuffer",
+ /* LWTRANCHE_SERIAL_BUFFER: */
+ "SerialBuffer",
+ /* LWTRANCHE_WAL_INSERT: */
+ "WALInsert",
+ /* LWTRANCHE_BUFFER_CONTENT: */
+ "BufferContent",
+ /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */
+ "ReplicationOriginState",
+ /* LWTRANCHE_REPLICATION_SLOT_IO: */
+ "ReplicationSlotIO",
+ /* LWTRANCHE_LOCK_FASTPATH: */
+ "LockFastPath",
+ /* LWTRANCHE_BUFFER_MAPPING: */
+ "BufferMapping",
+ /* LWTRANCHE_LOCK_MANAGER: */
+ "LockManager",
+ /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */
+ "PredicateLockManager",
+ /* LWTRANCHE_PARALLEL_HASH_JOIN: */
+ "ParallelHashJoin",
+ /* LWTRANCHE_PARALLEL_QUERY_DSA: */
+ "ParallelQueryDSA",
+ /* LWTRANCHE_PER_SESSION_DSA: */
+ "PerSessionDSA",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */
+ "PerSessionRecordType",
+ /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */
+ "PerSessionRecordTypmod",
+ /* LWTRANCHE_SHARED_TUPLESTORE: */
+ "SharedTupleStore",
+ /* LWTRANCHE_SHARED_TIDBITMAP: */
+ "SharedTidBitmap",
+ /* LWTRANCHE_PARALLEL_APPEND: */
+ "ParallelAppend",
+ /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */
+ "PerXactPredicateList"
+};
+
+StaticAssertDecl(lengthof(BuiltinTrancheNames) ==
+ LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS,
+ "missing entries in BuiltinTrancheNames[]");
+
+/*
+ * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and
+ * stores the names of all dynamically-created tranches known to the current
+ * process. Any unused entries in the array will contain NULL.
+ */
+static const char **LWLockTrancheNames = NULL;
+static int LWLockTrancheNamesAllocated = 0;
+
+/*
+ * This points to the main array of LWLocks in shared memory. Backends inherit
+ * the pointer by fork from the postmaster (except in the EXEC_BACKEND case,
+ * where we have special measures to pass it down).
+ */
+LWLockPadded *MainLWLockArray = NULL;
+
+/*
+ * We use this structure to keep track of locked LWLocks for release
+ * during error recovery. Normally, only a few will be held at once, but
+ * occasionally the number can be much higher; for example, the pg_buffercache
+ * extension locks all buffer partitions simultaneously.
+ */
+#define MAX_SIMUL_LWLOCKS 200
+
+/* struct representing the LWLocks we're holding */
+typedef struct LWLockHandle
+{
+ LWLock *lock;
+ LWLockMode mode;
+} LWLockHandle;
+
+static int num_held_lwlocks = 0;
+static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS];
+
+/* struct representing the LWLock tranche request for named tranche */
+typedef struct NamedLWLockTrancheRequest
+{
+ char tranche_name[NAMEDATALEN];
+ int num_lwlocks;
+} NamedLWLockTrancheRequest;
+
+static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL;
+static int NamedLWLockTrancheRequestsAllocated = 0;
+
+/*
+ * NamedLWLockTrancheRequests is both the valid length of the request array,
+ * and the length of the shared-memory NamedLWLockTrancheArray later on.
+ * This variable and NamedLWLockTrancheArray are non-static so that
+ * postmaster.c can copy them to child processes in EXEC_BACKEND builds.
+ */
+int NamedLWLockTrancheRequests = 0;
+
+/* points to data in shared memory: */
+NamedLWLockTranche *NamedLWLockTrancheArray = NULL;
+
+static bool lock_named_request_allowed = true;
+
+static void InitializeLWLocks(void);
+static inline void LWLockReportWaitStart(LWLock *lock);
+static inline void LWLockReportWaitEnd(void);
+static const char *GetLWTrancheName(uint16 trancheId);
+
+#define T_NAME(lock) \
+ GetLWTrancheName((lock)->tranche)
+
+#ifdef LWLOCK_STATS
+typedef struct lwlock_stats_key
+{
+ int tranche;
+ void *instance;
+} lwlock_stats_key;
+
+typedef struct lwlock_stats
+{
+ lwlock_stats_key key;
+ int sh_acquire_count;
+ int ex_acquire_count;
+ int block_count;
+ int dequeue_self_count;
+ int spin_delay_count;
+} lwlock_stats;
+
+static HTAB *lwlock_stats_htab;
+static lwlock_stats lwlock_stats_dummy;
+#endif
+
+#ifdef LOCK_DEBUG
+bool Trace_lwlocks = false;
+
+inline static void
+PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ uint32 state = pg_atomic_read_u32(&lock->state);
+
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d",
+ MyProcPid,
+ where, T_NAME(lock), lock,
+ (state & LW_VAL_EXCLUSIVE) != 0,
+ state & LW_SHARED_MASK,
+ (state & LW_FLAG_HAS_WAITERS) != 0,
+ pg_atomic_read_u32(&lock->nwaiters),
+ (state & LW_FLAG_RELEASE_OK) != 0)));
+ }
+}
+
+inline static void
+LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg)
+{
+ /* hide statement & context here, otherwise the log is just too verbose */
+ if (Trace_lwlocks)
+ {
+ ereport(LOG,
+ (errhidestmt(true),
+ errhidecontext(true),
+ errmsg_internal("%s(%s %p): %s", where,
+ T_NAME(lock), lock, msg)));
+ }
+}
+
+#else /* not LOCK_DEBUG */
+#define PRINT_LWDEBUG(a,b,c) ((void)0)
+#define LOG_LWDEBUG(a,b,c) ((void)0)
+#endif /* LOCK_DEBUG */
+
+#ifdef LWLOCK_STATS
+
+static void init_lwlock_stats(void);
+static void print_lwlock_stats(int code, Datum arg);
+static lwlock_stats * get_lwlock_stats_entry(LWLock *lock);
+
+static void
+init_lwlock_stats(void)
+{
+ HASHCTL ctl;
+ static MemoryContext lwlock_stats_cxt = NULL;
+ static bool exit_registered = false;
+
+ if (lwlock_stats_cxt != NULL)
+ MemoryContextDelete(lwlock_stats_cxt);
+
+ /*
+ * The LWLock stats will be updated within a critical section, which
+ * requires allocating new hash entries. Allocations within a critical
+ * section are normally not allowed because running out of memory would
+ * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally
+ * turned on in production, so that's an acceptable risk. The hash entries
+ * are small, so the risk of running out of memory is minimal in practice.
+ */
+ lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext,
+ "LWLock stats",
+ ALLOCSET_DEFAULT_SIZES);
+ MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true);
+
+ ctl.keysize = sizeof(lwlock_stats_key);
+ ctl.entrysize = sizeof(lwlock_stats);
+ ctl.hcxt = lwlock_stats_cxt;
+ lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl,
+ HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
+ if (!exit_registered)
+ {
+ on_shmem_exit(print_lwlock_stats, 0);
+ exit_registered = true;
+ }
+}
+
+static void
+print_lwlock_stats(int code, Datum arg)
+{
+ HASH_SEQ_STATUS scan;
+ lwlock_stats *lwstats;
+
+ hash_seq_init(&scan, lwlock_stats_htab);
+
+ /* Grab an LWLock to keep different backends from mixing reports */
+ LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE);
+
+ while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL)
+ {
+ fprintf(stderr,
+ "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n",
+ MyProcPid, GetLWTrancheName(lwstats->key.tranche),
+ lwstats->key.instance, lwstats->sh_acquire_count,
+ lwstats->ex_acquire_count, lwstats->block_count,
+ lwstats->spin_delay_count, lwstats->dequeue_self_count);
+ }
+
+ LWLockRelease(&MainLWLockArray[0].lock);
+}
+
+static lwlock_stats *
+get_lwlock_stats_entry(LWLock *lock)
+{
+ lwlock_stats_key key;
+ lwlock_stats *lwstats;
+ bool found;
+
+ /*
+ * During shared memory initialization, the hash table doesn't exist yet.
+ * Stats of that phase aren't very interesting, so just collect operations
+ * on all locks in a single dummy entry.
+ */
+ if (lwlock_stats_htab == NULL)
+ return &lwlock_stats_dummy;
+
+ /* Fetch or create the entry. */
+ MemSet(&key, 0, sizeof(key));
+ key.tranche = lock->tranche;
+ key.instance = lock;
+ lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found);
+ if (!found)
+ {
+ lwstats->sh_acquire_count = 0;
+ lwstats->ex_acquire_count = 0;
+ lwstats->block_count = 0;
+ lwstats->dequeue_self_count = 0;
+ lwstats->spin_delay_count = 0;
+ }
+ return lwstats;
+}
+#endif /* LWLOCK_STATS */
+
+
+/*
+ * Compute number of LWLocks required by named tranches. These will be
+ * allocated in the main array.
+ */
+static int
+NumLWLocksForNamedTranches(void)
+{
+ int numLocks = 0;
+ int i;
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+
+ return numLocks;
+}
+
+/*
+ * Compute shmem space needed for LWLocks and named tranches.
+ */
+Size
+LWLockShmemSize(void)
+{
+ Size size;
+ int i;
+ int numLocks = NUM_FIXED_LWLOCKS;
+
+ /* Calculate total number of locks needed in the main array. */
+ numLocks += NumLWLocksForNamedTranches();
+
+ /* Space for the LWLock array. */
+ size = mul_size(numLocks, sizeof(LWLockPadded));
+
+ /* Space for dynamic allocation counter, plus room for alignment. */
+ size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE);
+
+ /* space for named tranches. */
+ size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche)));
+
+ /* space for name of each tranche. */
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1);
+
+ /* Disallow adding any more named tranches. */
+ lock_named_request_allowed = false;
+
+ return size;
+}
+
+/*
+ * Allocate shmem space for the main LWLock array and all tranches and
+ * initialize it. We also register extension LWLock tranches here.
+ */
+void
+CreateLWLocks(void)
+{
+ StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS,
+ "MAX_BACKENDS too big for lwlock.c");
+
+ StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE,
+ "Miscalculated LWLock padding");
+
+ if (!IsUnderPostmaster)
+ {
+ Size spaceLocks = LWLockShmemSize();
+ int *LWLockCounter;
+ char *ptr;
+
+ /* Allocate space */
+ ptr = (char *) ShmemAlloc(spaceLocks);
+
+ /* Leave room for dynamic allocation of tranches */
+ ptr += sizeof(int);
+
+ /* Ensure desired alignment of LWLock array */
+ ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE;
+
+ MainLWLockArray = (LWLockPadded *) ptr;
+
+ /*
+ * Initialize the dynamic-allocation counter for tranches, which is
+ * stored just before the first LWLock.
+ */
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* Initialize all LWLocks */
+ InitializeLWLocks();
+ }
+
+ /* Register named extension LWLock tranches in the current process. */
+ for (int i = 0; i < NamedLWLockTrancheRequests; i++)
+ LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId,
+ NamedLWLockTrancheArray[i].trancheName);
+}
+
+/*
+ * Initialize LWLocks that are fixed and those belonging to named tranches.
+ */
+static void
+InitializeLWLocks(void)
+{
+ int numNamedLocks = NumLWLocksForNamedTranches();
+ int id;
+ int i;
+ int j;
+ LWLockPadded *lock;
+
+ /* Initialize all individual LWLocks in main array */
+ for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++)
+ LWLockInitialize(&lock->lock, id);
+
+ /* Initialize buffer mapping LWLocks in main array */
+ lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING);
+
+ /* Initialize lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER);
+
+ /* Initialize predicate lmgrs' LWLocks in main array */
+ lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET;
+ for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++)
+ LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER);
+
+ /*
+ * Copy the info about any named tranches into shared memory (so that
+ * other processes can see it), and initialize the requested LWLocks.
+ */
+ if (NamedLWLockTrancheRequests > 0)
+ {
+ char *trancheNames;
+
+ NamedLWLockTrancheArray = (NamedLWLockTranche *)
+ &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks];
+
+ trancheNames = (char *) NamedLWLockTrancheArray +
+ (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche));
+ lock = &MainLWLockArray[NUM_FIXED_LWLOCKS];
+
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ NamedLWLockTrancheRequest *request;
+ NamedLWLockTranche *tranche;
+ char *name;
+
+ request = &NamedLWLockTrancheRequestArray[i];
+ tranche = &NamedLWLockTrancheArray[i];
+
+ name = trancheNames;
+ trancheNames += strlen(request->tranche_name) + 1;
+ strcpy(name, request->tranche_name);
+ tranche->trancheId = LWLockNewTrancheId();
+ tranche->trancheName = name;
+
+ for (j = 0; j < request->num_lwlocks; j++, lock++)
+ LWLockInitialize(&lock->lock, tranche->trancheId);
+ }
+ }
+}
+
+/*
+ * InitLWLockAccess - initialize backend-local state needed to hold LWLocks
+ */
+void
+InitLWLockAccess(void)
+{
+#ifdef LWLOCK_STATS
+ init_lwlock_stats();
+#endif
+}
+
+/*
+ * GetNamedLWLockTranche - returns the base address of the LWLocks belonging
+ * to the specified tranche.
+ *
+ * The caller is expected to access the requested number of LWLocks starting
+ * from the base address returned by this function. This works only for
+ * tranches that were requested via the RequestNamedLWLockTranche() API.
+ */
+LWLockPadded *
+GetNamedLWLockTranche(const char *tranche_name)
+{
+ int lock_pos;
+ int i;
+
+ /*
+ * Find the position in MainLWLockArray of the first LWLock belonging to
+ * the requested tranche_name. LWLocks for named tranches are placed in
+ * MainLWLockArray after the fixed locks.
+ */
+ lock_pos = NUM_FIXED_LWLOCKS;
+ for (i = 0; i < NamedLWLockTrancheRequests; i++)
+ {
+ if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name,
+ tranche_name) == 0)
+ return &MainLWLockArray[lock_pos];
+
+ lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks;
+ }
+
+ elog(ERROR, "requested tranche is not registered");
+
+ /* just to keep compiler quiet */
+ return NULL;
+}
+
+/*
+ * Allocate a new tranche ID.
+ */
+int
+LWLockNewTrancheId(void)
+{
+ int result;
+ int *LWLockCounter;
+
+ LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int));
+ SpinLockAcquire(ShmemLock);
+ result = (*LWLockCounter)++;
+ SpinLockRelease(ShmemLock);
+
+ return result;
+}
+
+/*
+ * Register a dynamic tranche name in the lookup table of the current process.
+ *
+ * This routine will save a pointer to the tranche name passed as an argument,
+ * so the name should be allocated in a backend-lifetime context
+ * (shared memory, TopMemoryContext, static constant, or similar).
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+LWLockRegisterTranche(int tranche_id, const char *tranche_name)
+{
+ /* This should only be called for user-defined tranches. */
+ if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED)
+ return;
+
+ /* Convert to array index. */
+ tranche_id -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ /* If necessary, create or enlarge array. */
+ if (tranche_id >= LWLockTrancheNamesAllocated)
+ {
+ int newalloc;
+
+ newalloc = Max(LWLockTrancheNamesAllocated, 8);
+ while (newalloc <= tranche_id)
+ newalloc *= 2;
+
+ if (LWLockTrancheNames == NULL)
+ LWLockTrancheNames = (const char **)
+ MemoryContextAllocZero(TopMemoryContext,
+ newalloc * sizeof(char *));
+ else
+ {
+ LWLockTrancheNames = (const char **)
+ repalloc(LWLockTrancheNames, newalloc * sizeof(char *));
+ memset(LWLockTrancheNames + LWLockTrancheNamesAllocated,
+ 0,
+ (newalloc - LWLockTrancheNamesAllocated) * sizeof(char *));
+ }
+ LWLockTrancheNamesAllocated = newalloc;
+ }
+
+ LWLockTrancheNames[tranche_id] = tranche_name;
+}
+
+/*
+ * RequestNamedLWLockTranche
+ * Request that extra LWLocks be allocated during postmaster
+ * startup.
+ *
+ * This is only useful for extensions if called from the _PG_init hook
+ * of a library that is loaded into the postmaster via
+ * shared_preload_libraries. Once shared memory has been allocated, calls
+ * will be ignored. (We could raise an error, but it seems better to make
+ * it a no-op, so that libraries containing such calls can be reloaded if
+ * needed.)
+ *
+ * The tranche name will be user-visible as a wait event name, so try to
+ * use a name that fits the style for those.
+ */
+void
+RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks)
+{
+ NamedLWLockTrancheRequest *request;
+
+ if (IsUnderPostmaster || !lock_named_request_allowed)
+ return; /* too late */
+
+ if (NamedLWLockTrancheRequestArray == NULL)
+ {
+ NamedLWLockTrancheRequestsAllocated = 16;
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ MemoryContextAlloc(TopMemoryContext,
+ NamedLWLockTrancheRequestsAllocated
+ * sizeof(NamedLWLockTrancheRequest));
+ }
+
+ if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated)
+ {
+ int i = NamedLWLockTrancheRequestsAllocated;
+
+ while (i <= NamedLWLockTrancheRequests)
+ i *= 2;
+
+ NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *)
+ repalloc(NamedLWLockTrancheRequestArray,
+ i * sizeof(NamedLWLockTrancheRequest));
+ NamedLWLockTrancheRequestsAllocated = i;
+ }
+
+ request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests];
+ Assert(strlen(tranche_name) + 1 <= NAMEDATALEN);
+ strlcpy(request->tranche_name, tranche_name, NAMEDATALEN);
+ request->num_lwlocks = num_lwlocks;
+ NamedLWLockTrancheRequests++;
+}
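+
+/*
+ * As a hedged illustration (the extension and tranche names here are made
+ * up), an extension loaded via shared_preload_libraries would typically use
+ * the named-tranche API like this:
+ *
+ *	In its _PG_init():
+ *		RequestNamedLWLockTranche("my_extension", 4);
+ *
+ *	Later, once shared memory exists (e.g. from its shmem startup hook):
+ *		LWLockPadded *locks = GetNamedLWLockTranche("my_extension");
+ *
+ *		LWLockAcquire(&locks[0].lock, LW_EXCLUSIVE);
+ *		... manipulate the extension's shared state ...
+ *		LWLockRelease(&locks[0].lock);
+ */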
+
+/*
+ * LWLockInitialize - initialize a new lwlock; it's initially unlocked
+ */
+void
+LWLockInitialize(LWLock *lock, int tranche_id)
+{
+ pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK);
+#ifdef LOCK_DEBUG
+ pg_atomic_init_u32(&lock->nwaiters, 0);
+#endif
+ lock->tranche = tranche_id;
+ proclist_init(&lock->waiters);
+}
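+
+/*
+ * Sketch of the fully dynamic way for an extension to set up an LWLock
+ * (names are illustrative only): allocate the lock itself in shared memory,
+ * obtain a tranche ID, register a name for it, and initialize the lock.
+ *
+ *		int			tranche_id = LWLockNewTrancheId();
+ *		LWLock	   *lock = ... pointer into extension shared memory ...;
+ *
+ *		LWLockRegisterTranche(tranche_id, "my_extension_dynamic");
+ *		LWLockInitialize(lock, tranche_id);
+ *
+ * Since LWLockRegisterTranche() only affects the current process, any other
+ * backend that wants a readable wait event name must register the name too;
+ * otherwise GetLWTrancheName() falls back to "extension".
+ */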
+
+/*
+ * Report start of wait event for light-weight locks.
+ *
+ * This function is used by all the lightweight lock paths that need to
+ * wait to acquire the lock. The wait event is identified by the lock's
+ * tranche.
+ */
+static inline void
+LWLockReportWaitStart(LWLock *lock)
+{
+ pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche);
+}
+
+/*
+ * Report end of wait event for light-weight locks.
+ */
+static inline void
+LWLockReportWaitEnd(void)
+{
+ pgstat_report_wait_end();
+}
+
+/*
+ * Return the name of an LWLock tranche.
+ */
+static const char *
+GetLWTrancheName(uint16 trancheId)
+{
+ /* Individual LWLock? */
+ if (trancheId < NUM_INDIVIDUAL_LWLOCKS)
+ return IndividualLWLockNames[trancheId];
+
+ /* Built-in tranche? */
+ if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+ return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+ /*
+ * It's an extension tranche, so look in LWLockTrancheNames[]. However,
+ * it's possible that the tranche has never been registered in the current
+ * process, in which case give up and return "extension".
+ */
+ trancheId -= LWTRANCHE_FIRST_USER_DEFINED;
+
+ if (trancheId >= LWLockTrancheNamesAllocated ||
+ LWLockTrancheNames[trancheId] == NULL)
+ return "extension";
+
+ return LWLockTrancheNames[trancheId];
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+ Assert(classId == PG_WAIT_LWLOCK);
+ /* The event IDs are just tranche numbers. */
+ return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the passed
+ * in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's the
+ * caller's job.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+ uint32 old_state;
+
+ AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+ /*
+ * Read once outside the loop, later iterations will get the newer value
+ * via compare & exchange.
+ */
+ old_state = pg_atomic_read_u32(&lock->state);
+
+ /* loop until we've determined whether we could acquire the lock or not */
+ while (true)
+ {
+ uint32 desired_state;
+ bool lock_free;
+
+ desired_state = old_state;
+
+ if (mode == LW_EXCLUSIVE)
+ {
+ lock_free = (old_state & LW_LOCK_MASK) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_EXCLUSIVE;
+ }
+ else
+ {
+ lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+ if (lock_free)
+ desired_state += LW_VAL_SHARED;
+ }
+
+ /*
+ * Attempt to swap in the state we are expecting. If we didn't see the
+ * lock as free, that's just the old value. If we saw it as free,
+ * we'll attempt to mark it acquired. The reason that we always swap
+ * in the value is that this doubles as a memory barrier. We could try
+ * to be smarter and only swap in values if we saw the lock as free,
+ * but benchmarks haven't shown that to be beneficial so far.
+ *
+ * Retry if the value changed since we last looked at it.
+ */
+ if (pg_atomic_compare_exchange_u32(&lock->state,
+ &old_state, desired_state))
+ {
+ if (lock_free)
+ {
+ /* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+ if (mode == LW_EXCLUSIVE)
+ lock->owner = MyProc;
+#endif
+ return false;
+ }
+ else
+ return true; /* somebody else has the lock */
+ }
+ }
+ pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * Time spent holding mutex should be short!
+ */
+static void
+LWLockWaitListLock(LWLock *lock)
+{
+ uint32 old_state;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+ uint32 delays = 0;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ while (true)
+ {
+ /* always try once to acquire lock directly */
+ old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED);
+ if (!(old_state & LW_FLAG_LOCKED))
+ break; /* got lock */
+
+ /* and then spin without atomic operations until lock is released */
+ {
+ SpinDelayStatus delayStatus;
+
+ init_local_spin_delay(&delayStatus);
+
+ while (old_state & LW_FLAG_LOCKED)
+ {
+ perform_spin_delay(&delayStatus);
+ old_state = pg_atomic_read_u32(&lock->state);
+ }
+#ifdef LWLOCK_STATS
+ delays += delayStatus.delays;
+#endif
+ finish_spin_delay(&delayStatus);
+ }
+
+ /*
+ * Retry. The lock might, of course, already have been re-acquired by
+ * the time we attempt to get it again.
+ */
+ }
+
+#ifdef LWLOCK_STATS
+ lwstats->spin_delay_count += delays;
+#endif
+}
+
+/*
+ * Unlock the LWLock's wait list.
+ *
+ * Note that it can be more efficient to manipulate flags and release the
+ * locks in a single atomic operation.
+ */
+static void
+LWLockWaitListUnlock(LWLock *lock)
+{
+ uint32 old_state PG_USED_FOR_ASSERTS_ONLY;
+
+ old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED);
+
+ Assert(old_state & LW_FLAG_LOCKED);
+}
+
+/*
+ * Wake up all the lockers that currently have a chance to acquire the lock.
+ */
+static void
+LWLockWakeup(LWLock *lock)
+{
+ bool new_release_ok;
+ bool wokeup_somebody = false;
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ proclist_init(&wakeup);
+
+ new_release_ok = true;
+
+ /* lock wait list while collecting backends to wake up */
+ LWLockWaitListLock(lock);
+
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE)
+ continue;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ {
+ /*
+ * Prevent additional wakeups until retryer gets to run. Backends
+ * that are just waiting for the lock to become free don't retry
+ * automatically.
+ */
+ new_release_ok = false;
+
+ /*
+ * Don't wakeup (further) exclusive locks.
+ */
+ wokeup_somebody = true;
+ }
+
+ /*
+ * Once we've woken up an exclusive waiter, there's no point in waking
+ * up anybody else.
+ */
+ if (waiter->lwWaitMode == LW_EXCLUSIVE)
+ break;
+ }
+
+ Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS);
+
+ /* unset required flags, and release lock, in one fell swoop */
+ {
+ uint32 old_state;
+ uint32 desired_state;
+
+ old_state = pg_atomic_read_u32(&lock->state);
+ while (true)
+ {
+ desired_state = old_state;
+
+ /* compute desired flags */
+
+ if (new_release_ok)
+ desired_state |= LW_FLAG_RELEASE_OK;
+ else
+ desired_state &= ~LW_FLAG_RELEASE_OK;
+
+ if (proclist_is_empty(&wakeup))
+ desired_state &= ~LW_FLAG_HAS_WAITERS;
+
+ desired_state &= ~LW_FLAG_LOCKED; /* release lock */
+
+ if (pg_atomic_compare_exchange_u32(&lock->state, &old_state,
+ desired_state))
+ break;
+ }
+ }
+
+ /* Awaken any waiters I removed from the queue. */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+ /*
+ * Guarantee that lwWaiting being unset only becomes visible once the
+ * unlink from the wait list has completed. Otherwise the target backend
+ * could be woken up for some other reason and enqueue itself for a new
+ * lock - if that happens before the list unlink happens, the list would
+ * end up being corrupted.
+ *
+ * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+ * another lock.
+ */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+ /*
+ * If we don't have a PGPROC structure, there's no way to wait. This
+ * should never occur, since MyProc should only be null during shared
+ * memory initialization.
+ */
+ if (MyProc == NULL)
+ elog(PANIC, "cannot wait without a PGPROC structure");
+
+ if (MyProc->lwWaiting)
+ elog(PANIC, "queueing for lock while waiting on another one");
+
+ LWLockWaitListLock(lock);
+
+ /* setting the flag is protected by the spinlock */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+ MyProc->lwWaiting = true;
+ MyProc->lwWaitMode = mode;
+
+ /* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+ if (mode == LW_WAIT_UNTIL_FREE)
+ proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+ else
+ proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+ /* Can release the mutex now */
+ LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+ pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+ bool found = false;
+ proclist_mutable_iter iter;
+
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+
+ lwstats->dequeue_self_count++;
+#endif
+
+ LWLockWaitListLock(lock);
+
+ /*
+ * We can't just blindly remove ourselves from the list: somebody else
+ * could already have dequeued us, so iterate to check whether we're there.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ if (iter.cur == MyProc->pgprocno)
+ {
+ found = true;
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ break;
+ }
+ }
+
+ if (proclist_is_empty(&lock->waiters) &&
+ (pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+ {
+ pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+ }
+
+ /* XXX: combine with fetch_and above? */
+ LWLockWaitListUnlock(lock);
+
+ /* clear waiting state again, nice for debugging */
+ if (found)
+ MyProc->lwWaiting = false;
+ else
+ {
+ int extraWaits = 0;
+
+ /*
+ * Somebody else dequeued us and has or will wake us up. Deal with the
+ * superfluous absorption of a wakeup.
+ */
+
+ /*
+ * Reset RELEASE_OK flag if somebody woke us before we removed
+ * ourselves - they'll have set it to false.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+ * get reset at some inconvenient point later. Most of the time this
+ * will immediately return.
+ */
+ for (;;)
+ {
+ PGSemaphoreLock(MyProc->sem);
+ if (!MyProc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(MyProc->sem);
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+}
+
+/*
+ * LWLockAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, sleep until it is. Returns true if the lock
+ * was available immediately, false if we had to sleep.
+ *
+ * Side effect: cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockAcquire(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool result = true;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquire", lock, mode);
+
+#ifdef LWLOCK_STATS
+ /* Count lock acquisition attempts */
+ if (mode == LW_EXCLUSIVE)
+ lwstats->ex_acquire_count++;
+ else
+ lwstats->sh_acquire_count++;
+#endif /* LWLOCK_STATS */
+
+ /*
+ * We can't wait if we haven't got a PGPROC. This should only occur
+ * during bootstrap or shared memory initialization. Put an Assert here
+ * to catch unsafe coding practices.
+ */
+ Assert(!(proc == NULL && IsUnderPostmaster));
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to try to acquire lock after each time we are signaled by
+ * LWLockRelease.
+ *
+ * NOTE: it might seem better to have LWLockRelease actually grant us the
+ * lock, rather than retrying and possibly having to go back to sleep. But
+ * in practice that is no good because it means a process swap for every
+ * lock acquisition when two or more processes are contending for the same
+ * lock. Since LWLocks are normally used to protect not-very-long
+ * sections of computation, a process needs to be able to acquire and
+ * release the same lock many times during a single CPU time slice, even
+ * in the presence of contention. The efficiency of being able to do that
+ * outweighs the inefficiency of sometimes wasting a process dispatch
+ * cycle because the lock is not free when a released waiter finally gets
+ * to run. See pgsql-hackers archives for 29-Dec-01.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ /*
+ * Try to grab the lock the first time; we're not in the waitqueue
+ * yet/anymore.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock");
+ break; /* got the lock */
+ }
+
+ /*
+ * Ok, at this point we couldn't grab the lock on the first try. We
+ * cannot simply queue ourselves to the end of the list and wait to be
+ * woken up because by now the lock could long have been released.
+ * Instead add us to the queue and try to grab the lock again. If we
+ * succeed we need to revert the queuing and be happy, otherwise we
+ * recheck the lock. If we still couldn't grab it, we know that the
+ * other locker will see our queue entries when releasing since they
+ * existed before we checked for the lock.
+ */
+
+ /* add to the queue */
+ LWLockQueueSelf(lock, mode);
+
+ /* we're now guaranteed to be woken up if necessary */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ /* ok, grabbed the lock the second time round, need to undo queueing */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockAcquire", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+ /* Retrying, allow LWLockRelease to release waiters again. */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquire", lock, "awakened");
+
+ /* Now loop back and try to acquire lock again. */
+ result = false;
+ }
+
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode);
+
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ return result;
+}
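+
+/*
+ * For reference, a minimal caller-side sketch (the lock pointer here is
+ * purely illustrative): every successful acquisition is paired with exactly
+ * one LWLockRelease(), and cancel/die interrupts stay held off in between,
+ * as described above.
+ *
+ *		LWLockAcquire(lock, LW_SHARED);
+ *		... read the shared structure the lock protects ...
+ *		LWLockRelease(lock);
+ */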
+
+/*
+ * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode
+ *
+ * If the lock is not available, return false with no side-effects.
+ *
+ * If successful, cancel/die interrupts are held off until lock release.
+ */
+bool
+LWLockConditionalAcquire(LWLock *lock, LWLockMode mode)
+{
+ bool mustwait;
+
+ AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /* Check for the lock */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+
+ LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode);
+ }
+ return !mustwait;
+}
+
+/*
+ * LWLockAcquireOrWait - Acquire lock, or wait until it's free
+ *
+ * The semantics of this function are a bit funky. If the lock is currently
+ * free, it is acquired in the given mode, and the function returns true. If
+ * the lock isn't immediately free, the function waits until it is released
+ * and returns false, but does not acquire the lock.
+ *
+ * This is currently used for WALWriteLock: when a backend flushes the WAL,
+ * holding WALWriteLock, it can flush the commit records of many other
+ * backends as a side-effect. Those other backends need to wait until the
+ * flush finishes, but don't need to acquire the lock anymore. They can just
+ * wake up, observe that their records have already been flushed, and return.
+ */
+bool
+LWLockAcquireOrWait(LWLock *lock, LWLockMode mode)
+{
+ PGPROC *proc = MyProc;
+ bool mustwait;
+ int extraWaits = 0;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE);
+
+ PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode);
+
+ /* Ensure we will have room to remember the lock */
+ if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS)
+ elog(ERROR, "too many LWLocks taken");
+
+ /*
+ * Lock out cancel/die interrupts until we exit the code section protected
+ * by the LWLock. This ensures that interrupts will not interfere with
+ * manipulations of data structures in shared memory.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * NB: We're using nearly the same twice-in-a-row lock acquisition
+ * protocol as LWLockAcquire(). Check its comments for details.
+ */
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ mustwait = LWLockAttemptLock(lock, mode);
+
+ if (mustwait)
+ {
+ /*
+ * Wait until awakened. Like in LWLockAcquire, be prepared for
+ * bogus wakeups.
+ */
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened");
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue");
+
+ /*
+ * Got lock in the second attempt, undo queueing. We need to treat
+ * this as having successfully acquired the lock, otherwise we'd
+ * not necessarily wake up people we've prevented from acquiring
+ * the lock.
+ */
+ LWLockDequeueSelf(lock);
+ }
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ if (mustwait)
+ {
+ /* Failed to get lock, so release interrupt holdoff */
+ RESUME_INTERRUPTS();
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+ }
+ else
+ {
+ LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+ /* Add lock to list of locks held by this backend */
+ held_lwlocks[num_held_lwlocks].lock = lock;
+ held_lwlocks[num_held_lwlocks++].mode = mode;
+ if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+ }
+
+ return !mustwait;
+}
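+
+/*
+ * Simplified sketch of the usage pattern described above (the details are
+ * illustrative only; the real caller is the WAL-flush code holding
+ * WALWriteLock):
+ *
+ *		if (LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
+ *		{
+ *			... we got the lock: perform the flush ourselves ...
+ *			LWLockRelease(WALWriteLock);
+ *		}
+ *		else
+ *		{
+ *			... somebody else held the lock and has now released it;
+ *			... recheck whether our records were flushed as a side-effect
+ *		}
+ */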
+
+/*
+ * Does the lwlock in its current state need to wait for the variable value to
+ * change?
+ *
+ * If we don't need to wait, and it's because the value of the variable has
+ * changed, store the current value in newval.
+ *
+ * *result is set to true if the lock was free, and false otherwise.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+ uint64 *valptr, uint64 oldval, uint64 *newval,
+ bool *result)
+{
+ bool mustwait;
+ uint64 value;
+
+ /*
+ * Test first to see if the lock is free right now.
+ *
+ * XXX: the caller uses a spinlock before this, so we don't need a memory
+ * barrier here as far as the current usage is concerned. But that might
+ * not be safe in general.
+ */
+ mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
+
+ if (!mustwait)
+ {
+ *result = true;
+ return false;
+ }
+
+ *result = false;
+
+ /*
+ * Read value using the lwlock's wait list lock, as we can't generally
+ * rely on atomic 64 bit reads/stores. TODO: On platforms with a way to
+ * do atomic 64 bit reads/writes the spinlock should be optimized away.
+ */
+ LWLockWaitListLock(lock);
+ value = *valptr;
+ LWLockWaitListUnlock(lock);
+
+ if (value != oldval)
+ {
+ mustwait = false;
+ *newval = value;
+ }
+ else
+ {
+ mustwait = true;
+ }
+
+ return mustwait;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar. If the lock is free on exit (immediately or after
+ * waiting), returns true. If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+ PGPROC *proc = MyProc;
+ int extraWaits = 0;
+ bool result = false;
+#ifdef LWLOCK_STATS
+ lwlock_stats *lwstats;
+
+ lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+ PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Lock out cancel/die interrupts while we sleep on the lock. There is no
+ * cleanup mechanism to remove us from the wait queue if we got
+ * interrupted.
+ */
+ HOLD_INTERRUPTS();
+
+ /*
+ * Loop here to check the lock's status after each time we are signaled.
+ */
+ for (;;)
+ {
+ bool mustwait;
+
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ if (!mustwait)
+ break; /* the lock was free or value didn't match */
+
+ /*
+ * Add myself to the wait queue. Note that this is racy: the lock could
+ * be released before we're finished queuing. NB: We're using nearly
+ * the same twice-in-a-row lock acquisition protocol as
+ * LWLockAcquire(). Check its comments for details. The only
+ * difference is that we also have to check the variable's value when
+ * checking the state of the lock.
+ */
+ LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);
+
+ /*
+ * Set RELEASE_OK flag, to make sure we get woken up as soon as the
+ * lock is released.
+ */
+ pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+ /*
+ * We're now guaranteed to be woken up if necessary. Recheck the lock
+ * and variables state.
+ */
+ mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
+ &result);
+
+ /* Ok, no conflict after we queued ourselves. Undo queueing. */
+ if (!mustwait)
+ {
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");
+
+ LWLockDequeueSelf(lock);
+ break;
+ }
+
+ /*
+ * Wait until awakened.
+ *
+ * It is possible that we get awakened for a reason other than being
+ * signaled by LWLockRelease. If so, loop back and wait again. Once
+ * we've gotten the LWLock, re-increment the sema by the number of
+ * additional signals received.
+ */
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");
+
+#ifdef LWLOCK_STATS
+ lwstats->block_count++;
+#endif
+
+ LWLockReportWaitStart(lock);
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE);
+
+ for (;;)
+ {
+ PGSemaphoreLock(proc->sem);
+ if (!proc->lwWaiting)
+ break;
+ extraWaits++;
+ }
+
+#ifdef LOCK_DEBUG
+ {
+ /* not waiting anymore */
+ uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);
+
+ Assert(nwaiters < MAX_BACKENDS);
+ }
+#endif
+
+ if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE);
+ LWLockReportWaitEnd();
+
+ LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");
+
+ /* Now loop back and check the status of the lock again. */
+ }
+
+ /*
+ * Fix the process wait semaphore's count for any absorbed wakeups.
+ */
+ while (extraWaits-- > 0)
+ PGSemaphoreUnlock(proc->sem);
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+
+ return result;
+}
+
+
+/*
+ * LWLockUpdateVar - Update a variable and wake up waiters atomically
+ *
+ * Sets *valptr to 'val', and wakes up all processes waiting for us with
+ * LWLockWaitForVar(). Setting the value and waking up the processes happen
+ * atomically so that any process calling LWLockWaitForVar() on the same lock
+ * is guaranteed to see the new value, and act accordingly.
+ *
+ * The caller must be holding the lock in exclusive mode.
+ */
+void
+LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ proclist_head wakeup;
+ proclist_mutable_iter iter;
+
+ PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);
+
+ proclist_init(&wakeup);
+
+ LWLockWaitListLock(lock);
+
+ Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);
+
+ /* Update the lock's value */
+ *valptr = val;
+
+ /*
+ * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
+ * up. They are always in the front of the queue.
+ */
+ proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+ break;
+
+ proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+ proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+ }
+
+ /* We are done updating shared state of the lock itself. */
+ LWLockWaitListUnlock(lock);
+
+ /*
+ * Awaken any waiters I removed from the queue.
+ */
+ proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+ {
+ PGPROC *waiter = GetPGProcByNumber(iter.cur);
+
+ proclist_delete(&wakeup, iter.cur, lwWaitLink);
+ /* check comment in LWLockWakeup() about this barrier */
+ pg_write_barrier();
+ waiter->lwWaiting = false;
+ PGSemaphoreUnlock(waiter->sem);
+ }
+}
+
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+ LWLockMode mode;
+ uint32 oldstate;
+ bool check_waiters;
+ int i;
+
+ /*
+ * Remove lock from list of locks held. Usually, but not always, it will
+ * be the latest-acquired lock; so search array backwards.
+ */
+ for (i = num_held_lwlocks; --i >= 0;)
+ if (lock == held_lwlocks[i].lock)
+ break;
+
+ if (i < 0)
+ elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+ mode = held_lwlocks[i].mode;
+
+ num_held_lwlocks--;
+ for (; i < num_held_lwlocks; i++)
+ held_lwlocks[i] = held_lwlocks[i + 1];
+
+ PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+ /*
+ * Release my hold on the lock; after that it can immediately be acquired
+ * by others, even if we still have to wake up other waiters.
+ */
+ if (mode == LW_EXCLUSIVE)
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+ else
+ oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+ /* nobody else can have that kind of lock */
+ Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+ if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+ TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+ /*
+ * Wake up waiters only if there are any, the lock is now entirely free,
+ * and we aren't still waiting for previously-woken backends to get
+ * scheduled (that's what LW_FLAG_RELEASE_OK tracks).
+ */
+ if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+ (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+ (oldstate & LW_LOCK_MASK) == 0)
+ check_waiters = true;
+ else
+ check_waiters = false;
+
+ /*
+ * As waking up waiters requires the spinlock to be acquired, only do so
+ * if necessary.
+ */
+ if (check_waiters)
+ {
+ /* XXX: remove before commit? */
+ LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+ LWLockWakeup(lock);
+ }
+
+ /*
+ * Now okay to allow cancel/die interrupts.
+ */
+ RESUME_INTERRUPTS();
+}
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, reset variable
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+ LWLockWaitListLock(lock);
+
+ /*
+ * Set the variable's value before releasing the lock; that prevents a
+ * race condition wherein a new locker acquires the lock but hasn't yet
+ * set the variable's value.
+ */
+ *valptr = val;
+ LWLockWaitListUnlock(lock);
+
+ LWLockRelease(lock);
+}
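+
+/*
+ * Putting the variable-wait primitives together, a hedged sketch of the
+ * intended protocol (the lock and variable names here are illustrative; the
+ * main in-tree user is the WAL-insertion code):
+ *
+ *	Writer, holding the lock exclusively:
+ *		LWLockAcquire(lock, LW_EXCLUSIVE);
+ *		...
+ *		LWLockUpdateVar(lock, &progress, new_value);	-- wakes var-waiters
+ *		...
+ *		LWLockReleaseClearVar(lock, &progress, 0);		-- reset on release
+ *
+ *	Waiter:
+ *		while (!LWLockWaitForVar(lock, &progress, seen, &seen))
+ *		{
+ *			... lock still held, but 'progress' moved past 'seen';
+ *			... act on the value now in 'seen', then wait again
+ *		}
+ *		... the lock is free now
+ */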
+
+
+/*
+ * LWLockReleaseAll - release all currently-held locks
+ *
+ * Used to clean up after ereport(ERROR). An important difference between this
+ * function and retail LWLockRelease calls is that InterruptHoldoffCount is
+ * unchanged by this operation. This is necessary since InterruptHoldoffCount
+ * has been set to an appropriate level earlier in error recovery. We could
+ * decrement it below zero if we allow it to drop for each released lock!
+ */
+void
+LWLockReleaseAll(void)
+{
+ while (num_held_lwlocks > 0)
+ {
+ HOLD_INTERRUPTS(); /* match the upcoming RESUME_INTERRUPTS */
+
+ LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
+ }
+}
+
+
+/*
+ * LWLockHeldByMe - test whether my process holds a lock in any mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *l)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *l, int nlocks, size_t stride)
+{
+ char *held_lock_addr;
+ char *begin;
+ char *end;
+ int i;
+
+ begin = (char *) l;
+ end = begin + nlocks * stride;
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ held_lock_addr = (char *) held_lwlocks[i].lock;
+ if (held_lock_addr >= begin &&
+ held_lock_addr < end &&
+ (held_lock_addr - begin) % stride == 0)
+ return true;
+ }
+ return false;
+}
+
+/*
+ * LWLockHeldByMeInMode - test whether my process holds a lock in given mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
+{
+ int i;
+
+ for (i = 0; i < num_held_lwlocks; i++)
+ {
+ if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
+ return true;
+ }
+ return false;
+}