Diffstat (limited to 'src/backend/storage/lmgr/lwlock.c')
-rw-r--r-- | src/backend/storage/lmgr/lwlock.c | 1977 |
1 file changed, 1977 insertions, 0 deletions
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
new file mode 100644
index 0000000..07eb6f6
--- /dev/null
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -0,0 +1,1977 @@
+/*-------------------------------------------------------------------------
+ *
+ * lwlock.c
+ *	  Lightweight lock manager
+ *
+ * Lightweight locks are intended primarily to provide mutual exclusion of
+ * access to shared-memory data structures.  Therefore, they offer both
+ * exclusive and shared lock modes (to support read/write and read-only
+ * access to a shared object).  There are few other frammishes.  User-level
+ * locking should be done with the full lock manager --- which depends on
+ * LWLocks to protect its shared state.
+ *
+ * In addition to exclusive and shared modes, lightweight locks can be used to
+ * wait until a variable changes value.  Acquiring the lock with LWLockAcquire
+ * does not modify the variable; it remains set to whatever value it held when
+ * the lock was last released, and it can be updated without releasing the
+ * lock by calling LWLockUpdateVar.  LWLockWaitForVar waits for the variable
+ * to be updated, or until the lock is free.  When releasing the lock with
+ * LWLockReleaseClearVar() the value can be set to an appropriate value for a
+ * free lock.  The meaning of the variable is up to the caller; the
+ * lightweight lock code just assigns and compares it.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *	  src/backend/storage/lmgr/lwlock.c
+ *
+ * NOTES:
+ *
+ * This used to be a pretty straightforward reader-writer lock
+ * implementation, in which the internal state was protected by a
+ * spinlock.  Unfortunately, the overhead of taking the spinlock proved to be
+ * too high for workloads/locks that were taken in shared mode very
+ * frequently.  Often we were spinning in the (obviously exclusive) spinlock
+ * while trying to acquire a shared lock that was actually free.
+ *
+ * Thus a new implementation was devised that provides wait-free shared lock
+ * acquisition for locks that aren't exclusively locked.
+ *
+ * The basic idea is to have a single atomic variable 'lockcount' instead of
+ * the formerly separate shared and exclusive counters, and to use atomic
+ * operations to acquire the lock.  That's fairly easy to do for plain
+ * rw-spinlocks, but a lot harder for something like LWLocks that want to wait
+ * in the OS.
+ *
+ * For lock acquisition we use an atomic compare-and-exchange on the lockcount
+ * variable.  For an exclusive lock we swap in a sentinel value
+ * (LW_VAL_EXCLUSIVE); for shared locks we count the number of holders.
+ *
+ * To release the lock we use an atomic decrement.  If the new value is zero
+ * (we get that atomically), we know we can/have to release waiters.
+ *
+ * Obviously it is important that the sentinel value for exclusive locks
+ * doesn't conflict with the maximum number of possible share lockers -
+ * luckily MAX_BACKENDS makes that easily possible.
+ *
+ *
+ * The attentive reader might have noticed that naively doing the above has a
+ * glaring race condition: we try to lock using the atomic operations and
+ * notice that we have to wait.  Unfortunately, by the time we have finished
+ * queuing, the former locker may very well have already finished its work.
+ * That's problematic because we're now stuck waiting inside the OS.
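+ *
+ * As an illustration (hypothetical pseudocode, not code from this file),
+ * the naive approach would be:
+ *
+ *		if (!try_lock_atomically(lock))
+ *			sleep_on_semaphore(lock);
+ *
+ * where the holder can release the lock between the failed attempt and the
+ * sleep, at a point where we are not yet queued anywhere, so the wakeup is
+ * lost.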
+ *
+ * To mitigate those races we use a two-phase attempt at locking:
+ *	 Phase 1: Try to do it atomically, if we succeed, nice
+ *	 Phase 2: Add ourselves to the waitqueue of the lock
+ *	 Phase 3: Try to grab the lock again, if we succeed, remove ourselves from
+ *			  the queue
+ *	 Phase 4: Sleep till wake-up, goto Phase 1
+ *
+ * This protects us against the problem from above, as nobody can release too
+ * quickly before we're queued, since after Phase 2 we're already queued.
+ * -------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "miscadmin.h"
+#include "pg_trace.h"
+#include "pgstat.h"
+#include "postmaster/postmaster.h"
+#include "replication/slot.h"
+#include "storage/ipc.h"
+#include "storage/predicate.h"
+#include "storage/proc.h"
+#include "storage/proclist.h"
+#include "storage/spin.h"
+#include "utils/memutils.h"
+
+#ifdef LWLOCK_STATS
+#include "utils/hsearch.h"
+#endif
+
+
+/* We use the ShmemLock spinlock to protect LWLockCounter */
+extern slock_t *ShmemLock;
+
+#define LW_FLAG_HAS_WAITERS			((uint32) 1 << 30)
+#define LW_FLAG_RELEASE_OK			((uint32) 1 << 29)
+#define LW_FLAG_LOCKED				((uint32) 1 << 28)
+
+#define LW_VAL_EXCLUSIVE			((uint32) 1 << 24)
+#define LW_VAL_SHARED				1
+
+#define LW_LOCK_MASK				((uint32) ((1 << 25)-1))
+/* Must be greater than MAX_BACKENDS - which is 2^23-1, so we're fine. */
+#define LW_SHARED_MASK				((uint32) ((1 << 24)-1))
+
+/*
+ * There are three sorts of LWLock "tranches":
+ *
+ * 1. The individually-named locks defined in lwlocknames.h each have their
+ * own tranche.  The names of these tranches appear in IndividualLWLockNames[]
+ * in lwlocknames.c.
+ *
+ * 2. There are some predefined tranches for built-in groups of locks.
+ * These are listed in enum BuiltinTrancheIds in lwlock.h, and their names
+ * appear in BuiltinTrancheNames[] below.
+ *
+ * 3. Extensions can create new tranches, via either RequestNamedLWLockTranche
+ * or LWLockRegisterTranche.  The names of these that are known in the current
+ * process appear in LWLockTrancheNames[].
+ *
+ * All these names are user-visible as wait event names, so choose with care
+ * ... and do not forget to update the documentation's list of wait events.
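+ *
+ * For example (a sketch with hypothetical names; the APIs are the ones
+ * defined later in this file), a module loaded via shared_preload_libraries
+ * might request a tranche in its _PG_init():
+ *
+ *		RequestNamedLWLockTranche("my_ext", 4);
+ *
+ * and, once shared memory has been set up, fetch its locks with:
+ *
+ *		LWLockPadded *locks = GetNamedLWLockTranche("my_ext");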
+ */ +extern const char *const IndividualLWLockNames[]; /* in lwlocknames.c */ + +static const char *const BuiltinTrancheNames[] = { + /* LWTRANCHE_XACT_BUFFER: */ + "XactBuffer", + /* LWTRANCHE_COMMITTS_BUFFER: */ + "CommitTSBuffer", + /* LWTRANCHE_SUBTRANS_BUFFER: */ + "SubtransBuffer", + /* LWTRANCHE_MULTIXACTOFFSET_BUFFER: */ + "MultiXactOffsetBuffer", + /* LWTRANCHE_MULTIXACTMEMBER_BUFFER: */ + "MultiXactMemberBuffer", + /* LWTRANCHE_NOTIFY_BUFFER: */ + "NotifyBuffer", + /* LWTRANCHE_SERIAL_BUFFER: */ + "SerialBuffer", + /* LWTRANCHE_WAL_INSERT: */ + "WALInsert", + /* LWTRANCHE_BUFFER_CONTENT: */ + "BufferContent", + /* LWTRANCHE_REPLICATION_ORIGIN_STATE: */ + "ReplicationOriginState", + /* LWTRANCHE_REPLICATION_SLOT_IO: */ + "ReplicationSlotIO", + /* LWTRANCHE_LOCK_FASTPATH: */ + "LockFastPath", + /* LWTRANCHE_BUFFER_MAPPING: */ + "BufferMapping", + /* LWTRANCHE_LOCK_MANAGER: */ + "LockManager", + /* LWTRANCHE_PREDICATE_LOCK_MANAGER: */ + "PredicateLockManager", + /* LWTRANCHE_PARALLEL_HASH_JOIN: */ + "ParallelHashJoin", + /* LWTRANCHE_PARALLEL_QUERY_DSA: */ + "ParallelQueryDSA", + /* LWTRANCHE_PER_SESSION_DSA: */ + "PerSessionDSA", + /* LWTRANCHE_PER_SESSION_RECORD_TYPE: */ + "PerSessionRecordType", + /* LWTRANCHE_PER_SESSION_RECORD_TYPMOD: */ + "PerSessionRecordTypmod", + /* LWTRANCHE_SHARED_TUPLESTORE: */ + "SharedTupleStore", + /* LWTRANCHE_SHARED_TIDBITMAP: */ + "SharedTidBitmap", + /* LWTRANCHE_PARALLEL_APPEND: */ + "ParallelAppend", + /* LWTRANCHE_PER_XACT_PREDICATE_LIST: */ + "PerXactPredicateList" +}; + +StaticAssertDecl(lengthof(BuiltinTrancheNames) == + LWTRANCHE_FIRST_USER_DEFINED - NUM_INDIVIDUAL_LWLOCKS, + "missing entries in BuiltinTrancheNames[]"); + +/* + * This is indexed by tranche ID minus LWTRANCHE_FIRST_USER_DEFINED, and + * stores the names of all dynamically-created tranches known to the current + * process. Any unused entries in the array will contain NULL. + */ +static const char **LWLockTrancheNames = NULL; +static int LWLockTrancheNamesAllocated = 0; + +/* + * This points to the main array of LWLocks in shared memory. Backends inherit + * the pointer by fork from the postmaster (except in the EXEC_BACKEND case, + * where we have special measures to pass it down). + */ +LWLockPadded *MainLWLockArray = NULL; + +/* + * We use this structure to keep track of locked LWLocks for release + * during error recovery. Normally, only a few will be held at once, but + * occasionally the number can be much higher; for example, the pg_buffercache + * extension locks all buffer partitions simultaneously. + */ +#define MAX_SIMUL_LWLOCKS 200 + +/* struct representing the LWLocks we're holding */ +typedef struct LWLockHandle +{ + LWLock *lock; + LWLockMode mode; +} LWLockHandle; + +static int num_held_lwlocks = 0; +static LWLockHandle held_lwlocks[MAX_SIMUL_LWLOCKS]; + +/* struct representing the LWLock tranche request for named tranche */ +typedef struct NamedLWLockTrancheRequest +{ + char tranche_name[NAMEDATALEN]; + int num_lwlocks; +} NamedLWLockTrancheRequest; + +static NamedLWLockTrancheRequest *NamedLWLockTrancheRequestArray = NULL; +static int NamedLWLockTrancheRequestsAllocated = 0; + +/* + * NamedLWLockTrancheRequests is both the valid length of the request array, + * and the length of the shared-memory NamedLWLockTrancheArray later on. + * This variable and NamedLWLockTrancheArray are non-static so that + * postmaster.c can copy them to child processes in EXEC_BACKEND builds. 
+ */ +int NamedLWLockTrancheRequests = 0; + +/* points to data in shared memory: */ +NamedLWLockTranche *NamedLWLockTrancheArray = NULL; + +static bool lock_named_request_allowed = true; + +static void InitializeLWLocks(void); +static inline void LWLockReportWaitStart(LWLock *lock); +static inline void LWLockReportWaitEnd(void); +static const char *GetLWTrancheName(uint16 trancheId); + +#define T_NAME(lock) \ + GetLWTrancheName((lock)->tranche) + +#ifdef LWLOCK_STATS +typedef struct lwlock_stats_key +{ + int tranche; + void *instance; +} lwlock_stats_key; + +typedef struct lwlock_stats +{ + lwlock_stats_key key; + int sh_acquire_count; + int ex_acquire_count; + int block_count; + int dequeue_self_count; + int spin_delay_count; +} lwlock_stats; + +static HTAB *lwlock_stats_htab; +static lwlock_stats lwlock_stats_dummy; +#endif + +#ifdef LOCK_DEBUG +bool Trace_lwlocks = false; + +inline static void +PRINT_LWDEBUG(const char *where, LWLock *lock, LWLockMode mode) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + uint32 state = pg_atomic_read_u32(&lock->state); + + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%d: %s(%s %p): excl %u shared %u haswaiters %u waiters %u rOK %d", + MyProcPid, + where, T_NAME(lock), lock, + (state & LW_VAL_EXCLUSIVE) != 0, + state & LW_SHARED_MASK, + (state & LW_FLAG_HAS_WAITERS) != 0, + pg_atomic_read_u32(&lock->nwaiters), + (state & LW_FLAG_RELEASE_OK) != 0))); + } +} + +inline static void +LOG_LWDEBUG(const char *where, LWLock *lock, const char *msg) +{ + /* hide statement & context here, otherwise the log is just too verbose */ + if (Trace_lwlocks) + { + ereport(LOG, + (errhidestmt(true), + errhidecontext(true), + errmsg_internal("%s(%s %p): %s", where, + T_NAME(lock), lock, msg))); + } +} + +#else /* not LOCK_DEBUG */ +#define PRINT_LWDEBUG(a,b,c) ((void)0) +#define LOG_LWDEBUG(a,b,c) ((void)0) +#endif /* LOCK_DEBUG */ + +#ifdef LWLOCK_STATS + +static void init_lwlock_stats(void); +static void print_lwlock_stats(int code, Datum arg); +static lwlock_stats * get_lwlock_stats_entry(LWLock *lock); + +static void +init_lwlock_stats(void) +{ + HASHCTL ctl; + static MemoryContext lwlock_stats_cxt = NULL; + static bool exit_registered = false; + + if (lwlock_stats_cxt != NULL) + MemoryContextDelete(lwlock_stats_cxt); + + /* + * The LWLock stats will be updated within a critical section, which + * requires allocating new hash entries. Allocations within a critical + * section are normally not allowed because running out of memory would + * lead to a PANIC, but LWLOCK_STATS is debugging code that's not normally + * turned on in production, so that's an acceptable risk. The hash entries + * are small, so the risk of running out of memory is minimal in practice. 
+ */ + lwlock_stats_cxt = AllocSetContextCreate(TopMemoryContext, + "LWLock stats", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(lwlock_stats_cxt, true); + + ctl.keysize = sizeof(lwlock_stats_key); + ctl.entrysize = sizeof(lwlock_stats); + ctl.hcxt = lwlock_stats_cxt; + lwlock_stats_htab = hash_create("lwlock stats", 16384, &ctl, + HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + if (!exit_registered) + { + on_shmem_exit(print_lwlock_stats, 0); + exit_registered = true; + } +} + +static void +print_lwlock_stats(int code, Datum arg) +{ + HASH_SEQ_STATUS scan; + lwlock_stats *lwstats; + + hash_seq_init(&scan, lwlock_stats_htab); + + /* Grab an LWLock to keep different backends from mixing reports */ + LWLockAcquire(&MainLWLockArray[0].lock, LW_EXCLUSIVE); + + while ((lwstats = (lwlock_stats *) hash_seq_search(&scan)) != NULL) + { + fprintf(stderr, + "PID %d lwlock %s %p: shacq %u exacq %u blk %u spindelay %u dequeue self %u\n", + MyProcPid, GetLWTrancheName(lwstats->key.tranche), + lwstats->key.instance, lwstats->sh_acquire_count, + lwstats->ex_acquire_count, lwstats->block_count, + lwstats->spin_delay_count, lwstats->dequeue_self_count); + } + + LWLockRelease(&MainLWLockArray[0].lock); +} + +static lwlock_stats * +get_lwlock_stats_entry(LWLock *lock) +{ + lwlock_stats_key key; + lwlock_stats *lwstats; + bool found; + + /* + * During shared memory initialization, the hash table doesn't exist yet. + * Stats of that phase aren't very interesting, so just collect operations + * on all locks in a single dummy entry. + */ + if (lwlock_stats_htab == NULL) + return &lwlock_stats_dummy; + + /* Fetch or create the entry. */ + MemSet(&key, 0, sizeof(key)); + key.tranche = lock->tranche; + key.instance = lock; + lwstats = hash_search(lwlock_stats_htab, &key, HASH_ENTER, &found); + if (!found) + { + lwstats->sh_acquire_count = 0; + lwstats->ex_acquire_count = 0; + lwstats->block_count = 0; + lwstats->dequeue_self_count = 0; + lwstats->spin_delay_count = 0; + } + return lwstats; +} +#endif /* LWLOCK_STATS */ + + +/* + * Compute number of LWLocks required by named tranches. These will be + * allocated in the main array. + */ +static int +NumLWLocksForNamedTranches(void) +{ + int numLocks = 0; + int i; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + numLocks += NamedLWLockTrancheRequestArray[i].num_lwlocks; + + return numLocks; +} + +/* + * Compute shmem space needed for LWLocks and named tranches. + */ +Size +LWLockShmemSize(void) +{ + Size size; + int i; + int numLocks = NUM_FIXED_LWLOCKS; + + /* Calculate total number of locks needed in the main array. */ + numLocks += NumLWLocksForNamedTranches(); + + /* Space for the LWLock array. */ + size = mul_size(numLocks, sizeof(LWLockPadded)); + + /* Space for dynamic allocation counter, plus room for alignment. */ + size = add_size(size, sizeof(int) + LWLOCK_PADDED_SIZE); + + /* space for named tranches. */ + size = add_size(size, mul_size(NamedLWLockTrancheRequests, sizeof(NamedLWLockTranche))); + + /* space for name of each tranche. */ + for (i = 0; i < NamedLWLockTrancheRequests; i++) + size = add_size(size, strlen(NamedLWLockTrancheRequestArray[i].tranche_name) + 1); + + /* Disallow adding any more named tranches. */ + lock_named_request_allowed = false; + + return size; +} + +/* + * Allocate shmem space for the main LWLock array and all tranches and + * initialize it. We also register extension LWLock tranches here. 
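+ *
+ * (Layout sketch, derived from LWLockShmemSize() above: the allocation
+ * holds [dynamic tranche-ID counter (int)][alignment padding]
+ * [MainLWLockArray: NUM_FIXED_LWLOCKS plus the named-tranche locks]
+ * [NamedLWLockTranche array][packed tranche name strings].)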
+ */ +void +CreateLWLocks(void) +{ + StaticAssertStmt(LW_VAL_EXCLUSIVE > (uint32) MAX_BACKENDS, + "MAX_BACKENDS too big for lwlock.c"); + + StaticAssertStmt(sizeof(LWLock) <= LWLOCK_PADDED_SIZE, + "Miscalculated LWLock padding"); + + if (!IsUnderPostmaster) + { + Size spaceLocks = LWLockShmemSize(); + int *LWLockCounter; + char *ptr; + + /* Allocate space */ + ptr = (char *) ShmemAlloc(spaceLocks); + + /* Leave room for dynamic allocation of tranches */ + ptr += sizeof(int); + + /* Ensure desired alignment of LWLock array */ + ptr += LWLOCK_PADDED_SIZE - ((uintptr_t) ptr) % LWLOCK_PADDED_SIZE; + + MainLWLockArray = (LWLockPadded *) ptr; + + /* + * Initialize the dynamic-allocation counter for tranches, which is + * stored just before the first LWLock. + */ + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + *LWLockCounter = LWTRANCHE_FIRST_USER_DEFINED; + + /* Initialize all LWLocks */ + InitializeLWLocks(); + } + + /* Register named extension LWLock tranches in the current process. */ + for (int i = 0; i < NamedLWLockTrancheRequests; i++) + LWLockRegisterTranche(NamedLWLockTrancheArray[i].trancheId, + NamedLWLockTrancheArray[i].trancheName); +} + +/* + * Initialize LWLocks that are fixed and those belonging to named tranches. + */ +static void +InitializeLWLocks(void) +{ + int numNamedLocks = NumLWLocksForNamedTranches(); + int id; + int i; + int j; + LWLockPadded *lock; + + /* Initialize all individual LWLocks in main array */ + for (id = 0, lock = MainLWLockArray; id < NUM_INDIVIDUAL_LWLOCKS; id++, lock++) + LWLockInitialize(&lock->lock, id); + + /* Initialize buffer mapping LWLocks in main array */ + lock = MainLWLockArray + BUFFER_MAPPING_LWLOCK_OFFSET; + for (id = 0; id < NUM_BUFFER_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_BUFFER_MAPPING); + + /* Initialize lmgrs' LWLocks in main array */ + lock = MainLWLockArray + LOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_LOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_LOCK_MANAGER); + + /* Initialize predicate lmgrs' LWLocks in main array */ + lock = MainLWLockArray + PREDICATELOCK_MANAGER_LWLOCK_OFFSET; + for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + + /* + * Copy the info about any named tranches into shared memory (so that + * other processes can see it), and initialize the requested LWLocks. 
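+	 *
+	 * (Sketch of the layout written below: the NamedLWLockTranche array
+	 * starts right after the last named-tranche lock in MainLWLockArray,
+	 * and the NUL-terminated name strings are packed back to back after
+	 * that array.)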
+ */ + if (NamedLWLockTrancheRequests > 0) + { + char *trancheNames; + + NamedLWLockTrancheArray = (NamedLWLockTranche *) + &MainLWLockArray[NUM_FIXED_LWLOCKS + numNamedLocks]; + + trancheNames = (char *) NamedLWLockTrancheArray + + (NamedLWLockTrancheRequests * sizeof(NamedLWLockTranche)); + lock = &MainLWLockArray[NUM_FIXED_LWLOCKS]; + + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + NamedLWLockTrancheRequest *request; + NamedLWLockTranche *tranche; + char *name; + + request = &NamedLWLockTrancheRequestArray[i]; + tranche = &NamedLWLockTrancheArray[i]; + + name = trancheNames; + trancheNames += strlen(request->tranche_name) + 1; + strcpy(name, request->tranche_name); + tranche->trancheId = LWLockNewTrancheId(); + tranche->trancheName = name; + + for (j = 0; j < request->num_lwlocks; j++, lock++) + LWLockInitialize(&lock->lock, tranche->trancheId); + } + } +} + +/* + * InitLWLockAccess - initialize backend-local state needed to hold LWLocks + */ +void +InitLWLockAccess(void) +{ +#ifdef LWLOCK_STATS + init_lwlock_stats(); +#endif +} + +/* + * GetNamedLWLockTranche - returns the base address of LWLock from the + * specified tranche. + * + * Caller needs to retrieve the requested number of LWLocks starting from + * the base lock address returned by this API. This can be used for + * tranches that are requested by using RequestNamedLWLockTranche() API. + */ +LWLockPadded * +GetNamedLWLockTranche(const char *tranche_name) +{ + int lock_pos; + int i; + + /* + * Obtain the position of base address of LWLock belonging to requested + * tranche_name in MainLWLockArray. LWLocks for named tranches are placed + * in MainLWLockArray after fixed locks. + */ + lock_pos = NUM_FIXED_LWLOCKS; + for (i = 0; i < NamedLWLockTrancheRequests; i++) + { + if (strcmp(NamedLWLockTrancheRequestArray[i].tranche_name, + tranche_name) == 0) + return &MainLWLockArray[lock_pos]; + + lock_pos += NamedLWLockTrancheRequestArray[i].num_lwlocks; + } + + elog(ERROR, "requested tranche is not registered"); + + /* just to keep compiler quiet */ + return NULL; +} + +/* + * Allocate a new tranche ID. + */ +int +LWLockNewTrancheId(void) +{ + int result; + int *LWLockCounter; + + LWLockCounter = (int *) ((char *) MainLWLockArray - sizeof(int)); + SpinLockAcquire(ShmemLock); + result = (*LWLockCounter)++; + SpinLockRelease(ShmemLock); + + return result; +} + +/* + * Register a dynamic tranche name in the lookup table of the current process. + * + * This routine will save a pointer to the tranche name passed as an argument, + * so the name should be allocated in a backend-lifetime context + * (shared memory, TopMemoryContext, static constant, or similar). + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +LWLockRegisterTranche(int tranche_id, const char *tranche_name) +{ + /* This should only be called for user-defined tranches. */ + if (tranche_id < LWTRANCHE_FIRST_USER_DEFINED) + return; + + /* Convert to array index. */ + tranche_id -= LWTRANCHE_FIRST_USER_DEFINED; + + /* If necessary, create or enlarge array. 
*/ + if (tranche_id >= LWLockTrancheNamesAllocated) + { + int newalloc; + + newalloc = Max(LWLockTrancheNamesAllocated, 8); + while (newalloc <= tranche_id) + newalloc *= 2; + + if (LWLockTrancheNames == NULL) + LWLockTrancheNames = (const char **) + MemoryContextAllocZero(TopMemoryContext, + newalloc * sizeof(char *)); + else + { + LWLockTrancheNames = (const char **) + repalloc(LWLockTrancheNames, newalloc * sizeof(char *)); + memset(LWLockTrancheNames + LWLockTrancheNamesAllocated, + 0, + (newalloc - LWLockTrancheNamesAllocated) * sizeof(char *)); + } + LWLockTrancheNamesAllocated = newalloc; + } + + LWLockTrancheNames[tranche_id] = tranche_name; +} + +/* + * RequestNamedLWLockTranche + * Request that extra LWLocks be allocated during postmaster + * startup. + * + * This is only useful for extensions if called from the _PG_init hook + * of a library that is loaded into the postmaster via + * shared_preload_libraries. Once shared memory has been allocated, calls + * will be ignored. (We could raise an error, but it seems better to make + * it a no-op, so that libraries containing such calls can be reloaded if + * needed.) + * + * The tranche name will be user-visible as a wait event name, so try to + * use a name that fits the style for those. + */ +void +RequestNamedLWLockTranche(const char *tranche_name, int num_lwlocks) +{ + NamedLWLockTrancheRequest *request; + + if (IsUnderPostmaster || !lock_named_request_allowed) + return; /* too late */ + + if (NamedLWLockTrancheRequestArray == NULL) + { + NamedLWLockTrancheRequestsAllocated = 16; + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + MemoryContextAlloc(TopMemoryContext, + NamedLWLockTrancheRequestsAllocated + * sizeof(NamedLWLockTrancheRequest)); + } + + if (NamedLWLockTrancheRequests >= NamedLWLockTrancheRequestsAllocated) + { + int i = NamedLWLockTrancheRequestsAllocated; + + while (i <= NamedLWLockTrancheRequests) + i *= 2; + + NamedLWLockTrancheRequestArray = (NamedLWLockTrancheRequest *) + repalloc(NamedLWLockTrancheRequestArray, + i * sizeof(NamedLWLockTrancheRequest)); + NamedLWLockTrancheRequestsAllocated = i; + } + + request = &NamedLWLockTrancheRequestArray[NamedLWLockTrancheRequests]; + Assert(strlen(tranche_name) + 1 <= NAMEDATALEN); + strlcpy(request->tranche_name, tranche_name, NAMEDATALEN); + request->num_lwlocks = num_lwlocks; + NamedLWLockTrancheRequests++; +} + +/* + * LWLockInitialize - initialize a new lwlock; it's initially unlocked + */ +void +LWLockInitialize(LWLock *lock, int tranche_id) +{ + pg_atomic_init_u32(&lock->state, LW_FLAG_RELEASE_OK); +#ifdef LOCK_DEBUG + pg_atomic_init_u32(&lock->nwaiters, 0); +#endif + lock->tranche = tranche_id; + proclist_init(&lock->waiters); +} + +/* + * Report start of wait event for light-weight locks. + * + * This function will be used by all the light-weight lock calls which + * needs to wait to acquire the lock. This function distinguishes wait + * event based on tranche and lock id. + */ +static inline void +LWLockReportWaitStart(LWLock *lock) +{ + pgstat_report_wait_start(PG_WAIT_LWLOCK | lock->tranche); +} + +/* + * Report end of wait event for light-weight locks. + */ +static inline void +LWLockReportWaitEnd(void) +{ + pgstat_report_wait_end(); +} + +/* + * Return the name of an LWLock tranche. + */ +static const char * +GetLWTrancheName(uint16 trancheId) +{ + /* Individual LWLock? */ + if (trancheId < NUM_INDIVIDUAL_LWLOCKS) + return IndividualLWLockNames[trancheId]; + + /* Built-in tranche? 
+	 */
+	if (trancheId < LWTRANCHE_FIRST_USER_DEFINED)
+		return BuiltinTrancheNames[trancheId - NUM_INDIVIDUAL_LWLOCKS];
+
+	/*
+	 * It's an extension tranche, so look in LWLockTrancheNames[].  However,
+	 * it's possible that the tranche has never been registered in the
+	 * current process, in which case give up and return "extension".
+	 */
+	trancheId -= LWTRANCHE_FIRST_USER_DEFINED;
+
+	if (trancheId >= LWLockTrancheNamesAllocated ||
+		LWLockTrancheNames[trancheId] == NULL)
+		return "extension";
+
+	return LWLockTrancheNames[trancheId];
+}
+
+/*
+ * Return an identifier for an LWLock based on the wait class and event.
+ */
+const char *
+GetLWLockIdentifier(uint32 classId, uint16 eventId)
+{
+	Assert(classId == PG_WAIT_LWLOCK);
+	/* The event IDs are just tranche numbers. */
+	return GetLWTrancheName(eventId);
+}
+
+/*
+ * Internal function that tries to atomically acquire the lwlock in the
+ * passed-in mode.
+ *
+ * This function will not block waiting for a lock to become free - that's
+ * the caller's job.
+ *
+ * Returns true if the lock isn't free and we need to wait.
+ */
+static bool
+LWLockAttemptLock(LWLock *lock, LWLockMode mode)
+{
+	uint32		old_state;
+
+	AssertArg(mode == LW_EXCLUSIVE || mode == LW_SHARED);
+
+	/*
+	 * Read once outside the loop; later iterations will get the newer value
+	 * via compare & exchange.
+	 */
+	old_state = pg_atomic_read_u32(&lock->state);
+
+	/* loop until we've determined whether we could acquire the lock or not */
+	while (true)
+	{
+		uint32		desired_state;
+		bool		lock_free;
+
+		desired_state = old_state;
+
+		if (mode == LW_EXCLUSIVE)
+		{
+			lock_free = (old_state & LW_LOCK_MASK) == 0;
+			if (lock_free)
+				desired_state += LW_VAL_EXCLUSIVE;
+		}
+		else
+		{
+			lock_free = (old_state & LW_VAL_EXCLUSIVE) == 0;
+			if (lock_free)
+				desired_state += LW_VAL_SHARED;
+		}
+
+		/*
+		 * Attempt to swap in the state we are expecting.  If we didn't see
+		 * the lock as free, that's just the old value.  If we saw it as
+		 * free, we'll attempt to mark it acquired.  The reason that we
+		 * always swap in the value is that this doubles as a memory barrier.
+		 * We could try to be smarter and only swap in values if we saw the
+		 * lock as free, but benchmarks haven't shown it as beneficial so
+		 * far.
+		 *
+		 * Retry if the value changed since we last looked at it.
+		 */
+		if (pg_atomic_compare_exchange_u32(&lock->state,
+										   &old_state, desired_state))
+		{
+			if (lock_free)
+			{
+				/* Great! Got the lock. */
+#ifdef LOCK_DEBUG
+				if (mode == LW_EXCLUSIVE)
+					lock->owner = MyProc;
+#endif
+				return false;
+			}
+			else
+				return true;	/* somebody else has the lock */
+		}
+	}
+	pg_unreachable();
+}
+
+/*
+ * Lock the LWLock's wait list against concurrent activity.
+ *
+ * NB: even though the wait list is locked, non-conflicting lock operations
+ * may still happen concurrently.
+ *
+ * Time spent holding the mutex should be short!
+ */ +static void +LWLockWaitListLock(LWLock *lock) +{ + uint32 old_state; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + uint32 delays = 0; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + while (true) + { + /* always try once to acquire lock directly */ + old_state = pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_LOCKED); + if (!(old_state & LW_FLAG_LOCKED)) + break; /* got lock */ + + /* and then spin without atomic operations until lock is released */ + { + SpinDelayStatus delayStatus; + + init_local_spin_delay(&delayStatus); + + while (old_state & LW_FLAG_LOCKED) + { + perform_spin_delay(&delayStatus); + old_state = pg_atomic_read_u32(&lock->state); + } +#ifdef LWLOCK_STATS + delays += delayStatus.delays; +#endif + finish_spin_delay(&delayStatus); + } + + /* + * Retry. The lock might obviously already be re-acquired by the time + * we're attempting to get it again. + */ + } + +#ifdef LWLOCK_STATS + lwstats->spin_delay_count += delays; +#endif +} + +/* + * Unlock the LWLock's wait list. + * + * Note that it can be more efficient to manipulate flags and release the + * locks in a single atomic operation. + */ +static void +LWLockWaitListUnlock(LWLock *lock) +{ + uint32 old_state PG_USED_FOR_ASSERTS_ONLY; + + old_state = pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_LOCKED); + + Assert(old_state & LW_FLAG_LOCKED); +} + +/* + * Wakeup all the lockers that currently have a chance to acquire the lock. + */ +static void +LWLockWakeup(LWLock *lock) +{ + bool new_release_ok; + bool wokeup_somebody = false; + proclist_head wakeup; + proclist_mutable_iter iter; + + proclist_init(&wakeup); + + new_release_ok = true; + + /* lock wait list while collecting backends to wake up */ + LWLockWaitListLock(lock); + + proclist_foreach_modify(iter, &lock->waiters, lwWaitLink) + { + PGPROC *waiter = GetPGProcByNumber(iter.cur); + + if (wokeup_somebody && waiter->lwWaitMode == LW_EXCLUSIVE) + continue; + + proclist_delete(&lock->waiters, iter.cur, lwWaitLink); + proclist_push_tail(&wakeup, iter.cur, lwWaitLink); + + if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE) + { + /* + * Prevent additional wakeups until retryer gets to run. Backends + * that are just waiting for the lock to become free don't retry + * automatically. + */ + new_release_ok = false; + + /* + * Don't wakeup (further) exclusive locks. + */ + wokeup_somebody = true; + } + + /* + * Once we've woken up an exclusive lock, there's no point in waking + * up anybody else. + */ + if (waiter->lwWaitMode == LW_EXCLUSIVE) + break; + } + + Assert(proclist_is_empty(&wakeup) || pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS); + + /* unset required flags, and release lock, in one fell swoop */ + { + uint32 old_state; + uint32 desired_state; + + old_state = pg_atomic_read_u32(&lock->state); + while (true) + { + desired_state = old_state; + + /* compute desired flags */ + + if (new_release_ok) + desired_state |= LW_FLAG_RELEASE_OK; + else + desired_state &= ~LW_FLAG_RELEASE_OK; + + if (proclist_is_empty(&wakeup)) + desired_state &= ~LW_FLAG_HAS_WAITERS; + + desired_state &= ~LW_FLAG_LOCKED; /* release lock */ + + if (pg_atomic_compare_exchange_u32(&lock->state, &old_state, + desired_state)) + break; + } + } + + /* Awaken any waiters I removed from the queue. 
+	 */
+	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		LOG_LWDEBUG("LWLockRelease", lock, "release waiter");
+		proclist_delete(&wakeup, iter.cur, lwWaitLink);
+
+		/*
+		 * Guarantee that lwWaiting being unset only becomes visible once the
+		 * unlink from the list has completed.  Otherwise the target backend
+		 * could be woken up for another reason and enqueue for a new lock -
+		 * if that happens before the list unlink happens, the list would end
+		 * up being corrupted.
+		 *
+		 * The barrier pairs with the LWLockWaitListLock() when enqueuing for
+		 * another lock.
+		 */
+		pg_write_barrier();
+		waiter->lwWaiting = false;
+		PGSemaphoreUnlock(waiter->sem);
+	}
+}
+
+/*
+ * Add ourselves to the end of the queue.
+ *
+ * NB: Mode can be LW_WAIT_UNTIL_FREE here!
+ */
+static void
+LWLockQueueSelf(LWLock *lock, LWLockMode mode)
+{
+	/*
+	 * If we don't have a PGPROC structure, there's no way to wait.  This
+	 * should never occur, since MyProc should only be null during shared
+	 * memory initialization.
+	 */
+	if (MyProc == NULL)
+		elog(PANIC, "cannot wait without a PGPROC structure");
+
+	if (MyProc->lwWaiting)
+		elog(PANIC, "queueing for lock while waiting on another one");
+
+	LWLockWaitListLock(lock);
+
+	/* setting the flag is protected by the spinlock */
+	pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_HAS_WAITERS);
+
+	MyProc->lwWaiting = true;
+	MyProc->lwWaitMode = mode;
+
+	/* LW_WAIT_UNTIL_FREE waiters are always at the front of the queue */
+	if (mode == LW_WAIT_UNTIL_FREE)
+		proclist_push_head(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+	else
+		proclist_push_tail(&lock->waiters, MyProc->pgprocno, lwWaitLink);
+
+	/* Can release the mutex now */
+	LWLockWaitListUnlock(lock);
+
+#ifdef LOCK_DEBUG
+	pg_atomic_fetch_add_u32(&lock->nwaiters, 1);
+#endif
+
+}
+
+/*
+ * Remove ourselves from the waitlist.
+ *
+ * This is used if we queued ourselves because we thought we needed to sleep
+ * but, after further checking, we discovered that we don't actually need to
+ * do so.
+ */
+static void
+LWLockDequeueSelf(LWLock *lock)
+{
+	bool		found = false;
+	proclist_mutable_iter iter;
+
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+
+	lwstats->dequeue_self_count++;
+#endif
+
+	LWLockWaitListLock(lock);
+
+	/*
+	 * We can't just remove ourselves from the list; we need to iterate over
+	 * all entries, as somebody else could have dequeued us already.
+	 */
+	proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+	{
+		if (iter.cur == MyProc->pgprocno)
+		{
+			found = true;
+			proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+			break;
+		}
+	}
+
+	if (proclist_is_empty(&lock->waiters) &&
+		(pg_atomic_read_u32(&lock->state) & LW_FLAG_HAS_WAITERS) != 0)
+	{
+		pg_atomic_fetch_and_u32(&lock->state, ~LW_FLAG_HAS_WAITERS);
+	}
+
+	/* XXX: combine with fetch_and above? */
+	LWLockWaitListUnlock(lock);
+
+	/* clear waiting state again, nice for debugging */
+	if (found)
+		MyProc->lwWaiting = false;
+	else
+	{
+		int			extraWaits = 0;
+
+		/*
+		 * Somebody else dequeued us and has or will wake us up.  Deal with
+		 * the superfluous absorption of a wakeup.
+		 */
+
+		/*
+		 * Reset RELEASE_OK flag if somebody woke us before we removed
+		 * ourselves - they'll have set it to false.
+		 */
+		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);
+
+		/*
+		 * Now wait for the scheduled wakeup, otherwise our ->lwWaiting would
+		 * get reset at some inconvenient point later.  Most of the time this
+		 * will immediately return.
+ */ + for (;;) + { + PGSemaphoreLock(MyProc->sem); + if (!MyProc->lwWaiting) + break; + extraWaits++; + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(MyProc->sem); + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif +} + +/* + * LWLockAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, sleep until it is. Returns true if the lock + * was available immediately, false if we had to sleep. + * + * Side effect: cancel/die interrupts are held off until lock release. + */ +bool +LWLockAcquire(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool result = true; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquire", lock, mode); + +#ifdef LWLOCK_STATS + /* Count lock acquisition attempts */ + if (mode == LW_EXCLUSIVE) + lwstats->ex_acquire_count++; + else + lwstats->sh_acquire_count++; +#endif /* LWLOCK_STATS */ + + /* + * We can't wait if we haven't got a PGPROC. This should only occur + * during bootstrap or shared memory initialization. Put an Assert here + * to catch unsafe coding practices. + */ + Assert(!(proc == NULL && IsUnderPostmaster)); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * Loop here to try to acquire lock after each time we are signaled by + * LWLockRelease. + * + * NOTE: it might seem better to have LWLockRelease actually grant us the + * lock, rather than retrying and possibly having to go back to sleep. But + * in practice that is no good because it means a process swap for every + * lock acquisition when two or more processes are contending for the same + * lock. Since LWLocks are normally used to protect not-very-long + * sections of computation, a process needs to be able to acquire and + * release the same lock many times during a single CPU time slice, even + * in the presence of contention. The efficiency of being able to do that + * outweighs the inefficiency of sometimes wasting a process dispatch + * cycle because the lock is not free when a released waiter finally gets + * to run. See pgsql-hackers archives for 29-Dec-01. + */ + for (;;) + { + bool mustwait; + + /* + * Try to grab the lock the first time, we're not in the waitqueue + * yet/anymore. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "immediately acquired lock"); + break; /* got the lock */ + } + + /* + * Ok, at this point we couldn't grab the lock on the first try. We + * cannot simply queue ourselves to the end of the list and wait to be + * woken up because by now the lock could long have been released. + * Instead add us to the queue and try to grab the lock again. If we + * succeed we need to revert the queuing and be happy, otherwise we + * recheck the lock. 
If we still couldn't grab it, we know that the + * other locker will see our queue entries when releasing since they + * existed before we checked for the lock. + */ + + /* add to the queue */ + LWLockQueueSelf(lock, mode); + + /* we're now guaranteed to be woken up if necessary */ + mustwait = LWLockAttemptLock(lock, mode); + + /* ok, grabbed the lock the second time round, need to undo queueing */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockAcquire", lock, "acquired, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. + */ + LOG_LWDEBUG("LWLockAcquire", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + + /* Retrying, allow LWLockRelease to release waiters again. */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquire", lock, "awakened"); + + /* Now loop back and try to acquire lock again. */ + result = false; + } + + if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), mode); + + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + return result; +} + +/* + * LWLockConditionalAcquire - acquire a lightweight lock in the specified mode + * + * If the lock is not available, return false with no side-effects. + * + * If successful, cancel/die interrupts are held off until lock release. + */ +bool +LWLockConditionalAcquire(LWLock *lock, LWLockMode mode) +{ + bool mustwait; + + AssertArg(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockConditionalAcquire", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. 
+ */ + HOLD_INTERRUPTS(); + + /* Check for the lock */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* Failed to get lock, so release interrupt holdoff */ + RESUME_INTERRUPTS(); + + LOG_LWDEBUG("LWLockConditionalAcquire", lock, "failed"); + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_FAIL(T_NAME(lock), mode); + } + else + { + /* Add lock to list of locks held by this backend */ + held_lwlocks[num_held_lwlocks].lock = lock; + held_lwlocks[num_held_lwlocks++].mode = mode; + if (TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_CONDACQUIRE(T_NAME(lock), mode); + } + return !mustwait; +} + +/* + * LWLockAcquireOrWait - Acquire lock, or wait until it's free + * + * The semantics of this function are a bit funky. If the lock is currently + * free, it is acquired in the given mode, and the function returns true. If + * the lock isn't immediately free, the function waits until it is released + * and returns false, but does not acquire the lock. + * + * This is currently used for WALWriteLock: when a backend flushes the WAL, + * holding WALWriteLock, it can flush the commit records of many other + * backends as a side-effect. Those other backends need to wait until the + * flush finishes, but don't need to acquire the lock anymore. They can just + * wake up, observe that their records have already been flushed, and return. + */ +bool +LWLockAcquireOrWait(LWLock *lock, LWLockMode mode) +{ + PGPROC *proc = MyProc; + bool mustwait; + int extraWaits = 0; +#ifdef LWLOCK_STATS + lwlock_stats *lwstats; + + lwstats = get_lwlock_stats_entry(lock); +#endif + + Assert(mode == LW_SHARED || mode == LW_EXCLUSIVE); + + PRINT_LWDEBUG("LWLockAcquireOrWait", lock, mode); + + /* Ensure we will have room to remember the lock */ + if (num_held_lwlocks >= MAX_SIMUL_LWLOCKS) + elog(ERROR, "too many LWLocks taken"); + + /* + * Lock out cancel/die interrupts until we exit the code section protected + * by the LWLock. This ensures that interrupts will not interfere with + * manipulations of data structures in shared memory. + */ + HOLD_INTERRUPTS(); + + /* + * NB: We're using nearly the same twice-in-a-row lock acquisition + * protocol as LWLockAcquire(). Check its comments for details. + */ + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + mustwait = LWLockAttemptLock(lock, mode); + + if (mustwait) + { + /* + * Wait until awakened. Like in LWLockAcquire, be prepared for + * bogus wakeups. + */ + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), mode); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), mode); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "awakened"); + } + else + { + LOG_LWDEBUG("LWLockAcquireOrWait", lock, "acquired, undoing queue"); + + /* + * Got lock in the second attempt, undo queueing. 
We need to treat
+			 * this as having successfully acquired the lock; otherwise we'd
+			 * not necessarily wake up people we've prevented from acquiring
+			 * the lock.
+			 */
+			LWLockDequeueSelf(lock);
+		}
+	}
+
+	/*
+	 * Fix the process wait semaphore's count for any absorbed wakeups.
+	 */
+	while (extraWaits-- > 0)
+		PGSemaphoreUnlock(proc->sem);
+
+	if (mustwait)
+	{
+		/* Failed to get lock, so release interrupt holdoff */
+		RESUME_INTERRUPTS();
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "failed");
+		if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_FAIL(T_NAME(lock), mode);
+	}
+	else
+	{
+		LOG_LWDEBUG("LWLockAcquireOrWait", lock, "succeeded");
+		/* Add lock to list of locks held by this backend */
+		held_lwlocks[num_held_lwlocks].lock = lock;
+		held_lwlocks[num_held_lwlocks++].mode = mode;
+		if (TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT_ENABLED())
+			TRACE_POSTGRESQL_LWLOCK_ACQUIRE_OR_WAIT(T_NAME(lock), mode);
+	}
+
+	return !mustwait;
+}
+
+/*
+ * Does the lwlock in its current state need to wait for the variable value
+ * to change?
+ *
+ * If we don't need to wait, and it's because the value of the variable has
+ * changed, store the current value in newval.
+ *
+ * *result is set to true if the lock was free, and false otherwise.
+ */
+static bool
+LWLockConflictsWithVar(LWLock *lock,
+					   uint64 *valptr, uint64 oldval, uint64 *newval,
+					   bool *result)
+{
+	bool		mustwait;
+	uint64		value;
+
+	/*
+	 * Test first to see if the lock is free right now.
+	 *
+	 * XXX: the caller uses a spinlock before this, so we don't need a memory
+	 * barrier here as far as the current usage is concerned.  But that might
+	 * not be safe in general.
+	 */
+	mustwait = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;
+
+	if (!mustwait)
+	{
+		*result = true;
+		return false;
+	}
+
+	*result = false;
+
+	/*
+	 * Read value using the lwlock's wait list lock, as we can't generally
+	 * rely on atomic 64 bit reads/stores.  TODO: On platforms with a way to
+	 * do atomic 64 bit reads/writes the spinlock should be optimized away.
+	 */
+	LWLockWaitListLock(lock);
+	value = *valptr;
+	LWLockWaitListUnlock(lock);
+
+	if (value != oldval)
+	{
+		mustwait = false;
+		*newval = value;
+	}
+	else
+	{
+		mustwait = true;
+	}
+
+	return mustwait;
+}
+
+/*
+ * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
+ *
+ * If the lock is held and *valptr equals oldval, waits until the lock is
+ * either freed, or the lock holder updates *valptr by calling
+ * LWLockUpdateVar.  If the lock is free on exit (immediately or after
+ * waiting), returns true.  If the lock is still held, but *valptr no longer
+ * matches oldval, returns false and sets *newval to the current value in
+ * *valptr.
+ *
+ * Note: this function ignores shared lock holders; if the lock is held
+ * in shared mode, returns 'true'.
+ */
+bool
+LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
+{
+	PGPROC	   *proc = MyProc;
+	int			extraWaits = 0;
+	bool		result = false;
+#ifdef LWLOCK_STATS
+	lwlock_stats *lwstats;
+
+	lwstats = get_lwlock_stats_entry(lock);
+#endif
+
+	PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);
+
+	/*
+	 * Lock out cancel/die interrupts while we sleep on the lock.  There is
+	 * no cleanup mechanism to remove us from the wait queue if we got
+	 * interrupted.
+	 */
+	HOLD_INTERRUPTS();
+
+	/*
+	 * Loop here to check the lock's status after each time we are signaled.
+ */ + for (;;) + { + bool mustwait; + + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + if (!mustwait) + break; /* the lock was free or value didn't match */ + + /* + * Add myself to wait queue. Note that this is racy, somebody else + * could wakeup before we're finished queuing. NB: We're using nearly + * the same twice-in-a-row lock acquisition protocol as + * LWLockAcquire(). Check its comments for details. The only + * difference is that we also have to check the variable's values when + * checking the state of the lock. + */ + LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE); + + /* + * Set RELEASE_OK flag, to make sure we get woken up as soon as the + * lock is released. + */ + pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK); + + /* + * We're now guaranteed to be woken up if necessary. Recheck the lock + * and variables state. + */ + mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval, + &result); + + /* Ok, no conflict after we queued ourselves. Undo queueing. */ + if (!mustwait) + { + LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue"); + + LWLockDequeueSelf(lock); + break; + } + + /* + * Wait until awakened. + * + * It is possible that we get awakened for a reason other than being + * signaled by LWLockRelease. If so, loop back and wait again. Once + * we've gotten the LWLock, re-increment the sema by the number of + * additional signals received. + */ + LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting"); + +#ifdef LWLOCK_STATS + lwstats->block_count++; +#endif + + LWLockReportWaitStart(lock); + if (TRACE_POSTGRESQL_LWLOCK_WAIT_START_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), LW_EXCLUSIVE); + + for (;;) + { + PGSemaphoreLock(proc->sem); + if (!proc->lwWaiting) + break; + extraWaits++; + } + +#ifdef LOCK_DEBUG + { + /* not waiting anymore */ + uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1); + + Assert(nwaiters < MAX_BACKENDS); + } +#endif + + if (TRACE_POSTGRESQL_LWLOCK_WAIT_DONE_ENABLED()) + TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), LW_EXCLUSIVE); + LWLockReportWaitEnd(); + + LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened"); + + /* Now loop back and check the status of the lock again. */ + } + + /* + * Fix the process wait semaphore's count for any absorbed wakeups. + */ + while (extraWaits-- > 0) + PGSemaphoreUnlock(proc->sem); + + /* + * Now okay to allow cancel/die interrupts. + */ + RESUME_INTERRUPTS(); + + return result; +} + + +/* + * LWLockUpdateVar - Update a variable and wake up waiters atomically + * + * Sets *valptr to 'val', and wakes up all processes waiting for us with + * LWLockWaitForVar(). Setting the value and waking up the processes happen + * atomically so that any process calling LWLockWaitForVar() on the same lock + * is guaranteed to see the new value, and act accordingly. + * + * The caller must be holding the lock in exclusive mode. + */ +void +LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val) +{ + proclist_head wakeup; + proclist_mutable_iter iter; + + PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE); + + proclist_init(&wakeup); + + LWLockWaitListLock(lock); + + Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE); + + /* Update the lock's value */ + *valptr = val; + + /* + * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken + * up. They are always in the front of the queue. 
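+	 *
+	 * (For context: the WAL-insertion locks in xlog.c are the canonical
+	 * users of this machinery; a holder advances its insertingAt position
+	 * with LWLockUpdateVar so that LWLockWaitForVar callers can proceed
+	 * without the lock being fully released.)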
+	 */
+	proclist_foreach_modify(iter, &lock->waiters, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
+			break;
+
+		proclist_delete(&lock->waiters, iter.cur, lwWaitLink);
+		proclist_push_tail(&wakeup, iter.cur, lwWaitLink);
+	}
+
+	/* We are done updating shared state of the lock itself. */
+	LWLockWaitListUnlock(lock);
+
+	/*
+	 * Awaken any waiters I removed from the queue.
+	 */
+	proclist_foreach_modify(iter, &wakeup, lwWaitLink)
+	{
+		PGPROC	   *waiter = GetPGProcByNumber(iter.cur);
+
+		proclist_delete(&wakeup, iter.cur, lwWaitLink);
+		/* check comment in LWLockWakeup() about this barrier */
+		pg_write_barrier();
+		waiter->lwWaiting = false;
+		PGSemaphoreUnlock(waiter->sem);
+	}
+}
+
+
+/*
+ * LWLockRelease - release a previously acquired lock
+ */
+void
+LWLockRelease(LWLock *lock)
+{
+	LWLockMode	mode;
+	uint32		oldstate;
+	bool		check_waiters;
+	int			i;
+
+	/*
+	 * Remove lock from list of locks held.  Usually, but not always, it will
+	 * be the latest-acquired lock; so search array backwards.
+	 */
+	for (i = num_held_lwlocks; --i >= 0;)
+		if (lock == held_lwlocks[i].lock)
+			break;
+
+	if (i < 0)
+		elog(ERROR, "lock %s is not held", T_NAME(lock));
+
+	mode = held_lwlocks[i].mode;
+
+	num_held_lwlocks--;
+	for (; i < num_held_lwlocks; i++)
+		held_lwlocks[i] = held_lwlocks[i + 1];
+
+	PRINT_LWDEBUG("LWLockRelease", lock, mode);
+
+	/*
+	 * Release my hold on the lock; after that it can immediately be acquired
+	 * by others, even if we still have to wake up other waiters.
+	 */
+	if (mode == LW_EXCLUSIVE)
+		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_EXCLUSIVE);
+	else
+		oldstate = pg_atomic_sub_fetch_u32(&lock->state, LW_VAL_SHARED);
+
+	/* nobody else can have that kind of lock */
+	Assert(!(oldstate & LW_VAL_EXCLUSIVE));
+
+	if (TRACE_POSTGRESQL_LWLOCK_RELEASE_ENABLED())
+		TRACE_POSTGRESQL_LWLOCK_RELEASE(T_NAME(lock));
+
+	/*
+	 * If we're still waiting for backends to get scheduled, don't wake them
+	 * up again.
+	 */
+	if ((oldstate & (LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK)) ==
+		(LW_FLAG_HAS_WAITERS | LW_FLAG_RELEASE_OK) &&
+		(oldstate & LW_LOCK_MASK) == 0)
+		check_waiters = true;
+	else
+		check_waiters = false;
+
+	/*
+	 * As waking up waiters requires the spinlock to be acquired, only do so
+	 * if necessary.
+	 */
+	if (check_waiters)
+	{
+		/* XXX: remove before commit? */
+		LOG_LWDEBUG("LWLockRelease", lock, "releasing waiters");
+		LWLockWakeup(lock);
+	}
+
+	/*
+	 * Now okay to allow cancel/die interrupts.
+	 */
+	RESUME_INTERRUPTS();
+}
+
+/*
+ * LWLockReleaseClearVar - release a previously acquired lock, reset variable
+ */
+void
+LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val)
+{
+	LWLockWaitListLock(lock);
+
+	/*
+	 * Set the variable's value before releasing the lock.  That prevents a
+	 * race condition wherein a new locker acquires the lock, but hasn't yet
+	 * set the variable's value.
+	 */
+	*valptr = val;
+	LWLockWaitListUnlock(lock);
+
+	LWLockRelease(lock);
+}
+
+
+/*
+ * LWLockReleaseAll - release all currently-held locks
+ *
+ * Used to clean up after ereport(ERROR).  An important difference between
+ * this function and retail LWLockRelease calls is that InterruptHoldoffCount
+ * is unchanged by this operation.  This is necessary since
+ * InterruptHoldoffCount has been set to an appropriate level earlier in
+ * error recovery.  We could decrement it below zero if we allowed it to drop
+ * for each released lock!
+ */
+void
+LWLockReleaseAll(void)
+{
+	while (num_held_lwlocks > 0)
+	{
+		HOLD_INTERRUPTS();		/* match the upcoming RESUME_INTERRUPTS */
+
+		LWLockRelease(held_lwlocks[num_held_lwlocks - 1].lock);
+	}
+}
+
+
+/*
+ * LWLockHeldByMe - test whether my process holds a lock in any mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMe(LWLock *l)
+{
+	int			i;
+
+	for (i = 0; i < num_held_lwlocks; i++)
+	{
+		if (held_lwlocks[i].lock == l)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * LWLockAnyHeldByMe - test whether my process holds any of an array of locks
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockAnyHeldByMe(LWLock *l, int nlocks, size_t stride)
+{
+	char	   *held_lock_addr;
+	char	   *begin;
+	char	   *end;
+	int			i;
+
+	begin = (char *) l;
+	end = begin + nlocks * stride;
+	for (i = 0; i < num_held_lwlocks; i++)
+	{
+		held_lock_addr = (char *) held_lwlocks[i].lock;
+		if (held_lock_addr >= begin &&
+			held_lock_addr < end &&
+			(held_lock_addr - begin) % stride == 0)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * LWLockHeldByMeInMode - test whether my process holds a lock in given mode
+ *
+ * This is meant as debug support only.
+ */
+bool
+LWLockHeldByMeInMode(LWLock *l, LWLockMode mode)
+{
+	int			i;
+
+	for (i = 0; i < num_held_lwlocks; i++)
+	{
+		if (held_lwlocks[i].lock == l && held_lwlocks[i].mode == mode)
+			return true;
+	}
+	return false;
+}
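As a closing illustration, here is a minimal usage sketch (not part of the patch): a hypothetical extension that requests a named tranche and uses its first lock to protect a shared counter. RequestNamedLWLockTranche, GetNamedLWLockTranche, LWLockAcquire, and LWLockRelease are the APIs defined in the file above; the identifiers my_ext, MyExtState, and my_ext_bump are invented for the example, and the ShmemInitStruct() setup of the shared struct is elided.

	#include "postgres.h"
	#include "fmgr.h"
	#include "storage/lwlock.h"
	#include "storage/shmem.h"

	PG_MODULE_MAGIC;

	/* Hypothetical shared state guarded by lock 0 of our named tranche. */
	typedef struct MyExtState
	{
		uint64		counter;
	} MyExtState;

	static LWLockPadded *my_ext_locks = NULL;	/* base of our tranche */
	static MyExtState *my_ext_state = NULL;		/* assumed set up via
												 * ShmemInitStruct() elsewhere */

	/* Runs in the postmaster when loaded via shared_preload_libraries. */
	void
	_PG_init(void)
	{
		/* Ask for one LWLock to be allocated in the main array at startup. */
		RequestNamedLWLockTranche("my_ext", 1);
	}

	static void
	my_ext_bump(void)
	{
		/*
		 * Look up our tranche lazily; GetNamedLWLockTranche errors out if it
		 * was never requested.
		 */
		if (my_ext_locks == NULL)
			my_ext_locks = GetNamedLWLockTranche("my_ext");

		/* Standard discipline: acquire, touch shared state, release. */
		LWLockAcquire(&my_ext_locks[0].lock, LW_EXCLUSIVE);
		my_ext_state->counter++;
		LWLockRelease(&my_ext_locks[0].lock);
	}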