Diffstat
-rw-r--r-- | src/backend/storage/freespace/README      |  16
-rw-r--r-- | src/backend/storage/freespace/freespace.c | 106
-rw-r--r-- | src/backend/storage/ipc/dsm_impl.c        |   2
-rw-r--r-- | src/backend/storage/ipc/latch.c           |  70
-rw-r--r-- | src/backend/storage/lmgr/lmgr.c           |  38
-rw-r--r-- | src/backend/storage/smgr/smgr.c           |   5
6 files changed, 187 insertions, 50 deletions
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b..dc2a63a 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,9 +169,7 @@ Recovery
 --------
 
 The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption. As a result when
-we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
-rather than MarkBufferDirty().
+self-correcting measures to repair possible corruption.
 
 First of all, whenever a value is set on an FSM page, the root node of the
 page is compared against the new value after bubbling up the change is
@@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are
 immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
 to propagate the new free-space info into the upper pages of the FSM tree.
 
+As a result when we write to the FSM we treat that as a hint and thus use
+MarkBufferDirtyHint() rather than MarkBufferDirty(). Every read here uses
+RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification
+failures. We'd operate correctly without the full page images that
+MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot
+knowledge to RBM_ZERO_ON_ERROR.
+
+Relation extension is not WAL-logged. Hence, after WAL replay, an on-disk FSM
+slot may indicate free space in PageIsNew() blocks that never reached disk.
+We detect this case by comparing against the actual relation size, and we mark
+the block as full in that case.
+
 TODO
 ----
 
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index d41ae37..01adf28 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
 static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
                              BlockNumber start, BlockNumber end,
                              bool *eof);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
 
 
 /******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
  * amount of free space available on that page and then try again (see
  * RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
  * extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
  */
 BlockNumber
 GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
      * Otherwise, search as usual.
      */
     if (search_slot != -1)
-        return fsm_get_heap_blk(addr, search_slot);
-    else
-        return fsm_search(rel, search_cat);
+    {
+        BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+        /*
+         * Check that the blknum is actually in the relation. Don't try to
+         * update the FSM in that case, just fall back to the other case
+         */
+        if (fsm_does_block_exist(rel, blknum))
+            return blknum;
+    }
+    return fsm_search(rel, search_cat);
 }
 
 /*
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
         fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
 
         /*
-         * Truncation of a relation is WAL-logged at a higher-level, and we
-         * will be called at WAL replay. But if checksums are enabled, we need
-         * to still write a WAL record to protect against a torn page, if the
-         * page is flushed to disk before the truncation WAL record. We cannot
-         * use MarkBufferDirtyHint here, because that will not dirty the page
-         * during recovery.
+         * This change is non-critical, because fsm_does_block_exist() would
+         * stop us from returning a truncated-away block. However, since this
+         * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+         * of that many fsm_does_block_exist() rejections. Use a full
+         * MarkBufferDirty(), not MarkBufferDirtyHint().
         */
         MarkBufferDirty(buf);
+
+        /*
+         * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+         * differing from the rest of the file in this respect. This is
+         * optional; see README mention of full page images. XXX consider
+         * XLogSaveBufferForHint() for even closer similarity.
+         *
+         * A higher-level operation calls us at WAL replay. If we crash
+         * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+         * not changed, and our fork remains valid. If we crash after that
+         * flush, redo will return here.
+         */
         if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
             log_newpage_buffer(buf, false);
 
@@ -721,8 +744,15 @@ fsm_search(Relation rel, uint8 min_cat)
                                     (addr.level == FSM_BOTTOM_LEVEL),
                                     false);
             if (slot == -1)
+            {
                 max_avail = fsm_get_max_avail(BufferGetPage(buf));
-            UnlockReleaseBuffer(buf);
+                UnlockReleaseBuffer(buf);
+            }
+            else
+            {
+                /* Keep the pin for possible update below */
+                LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+            }
         }
         else
             slot = -1;
@@ -734,8 +764,37 @@ fsm_search(Relation rel, uint8 min_cat)
              * bottom.
              */
            if (addr.level == FSM_BOTTOM_LEVEL)
-                return fsm_get_heap_blk(addr, slot);
-
+            {
+                BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+                Page        page;
+
+                if (fsm_does_block_exist(rel, blkno))
+                {
+                    ReleaseBuffer(buf);
+                    return blkno;
+                }
+
+                /*
+                 * Block is past the end of the relation. Update FSM, and
+                 * restart from root. The usual "advancenext" behavior is
+                 * pessimal for this rare scenario, since every later slot is
+                 * unusable in the same way. We could zero all affected slots
+                 * on the same FSM page, but don't bet on the benefits of that
+                 * optimization justifying its compiled code bulk.
+                 */
+                page = BufferGetPage(buf);
+                LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+                fsm_set_avail(page, slot, 0);
+                MarkBufferDirtyHint(buf, false);
+                UnlockReleaseBuffer(buf);
+                if (restarts++ > 10000) /* same rationale as below */
+                    return InvalidBlockNumber;
+                addr = FSM_ROOT_ADDRESS;
+            }
+            else
+            {
+                ReleaseBuffer(buf);
+            }
             addr = fsm_get_child(addr, slot);
         }
         else if (addr.level == FSM_ROOT_LEVEL)
@@ -903,3 +962,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
 
     return max_avail;
 }
+
+
+/*
+ * Check whether a block number is past the end of the relation. This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+    SMgrRelation smgr = RelationGetSmgr(rel);
+
+    /*
+     * If below the cached nblocks, the block surely exists. Otherwise, we
+     * face a trade-off. We opt to compare to a fresh nblocks, incurring
+     * lseek() overhead. The alternative would be to assume the block does
+     * not exist, but that would cause FSM to set zero space available for
+     * blocks that main fork extension just recorded.
+     */
+    return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+             blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+            blknumber < RelationGetNumberOfBlocks(rel));
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 340048a..77a195b 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -880,7 +880,7 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
          * transferring data to the kernel.
          */
         char       *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
-        uint32      remaining = request_size;
+        Size        remaining = request_size;
         bool        success = true;
 
         /*
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 8bdb549..c865281 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -529,48 +529,54 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
     WaitEvent   event;
     WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
 
-    if (wakeEvents & WL_TIMEOUT)
-        Assert(timeout >= 0);
-    else
-        timeout = -1;
+    PG_TRY();
+    {
+        if (wakeEvents & WL_TIMEOUT)
+            Assert(timeout >= 0);
+        else
+            timeout = -1;
 
-    if (wakeEvents & WL_LATCH_SET)
-        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
-                          latch, NULL);
+        if (wakeEvents & WL_LATCH_SET)
+            AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+                              latch, NULL);
 
-    /* Postmaster-managed callers must handle postmaster death somehow. */
-    Assert(!IsUnderPostmaster ||
-           (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
-           (wakeEvents & WL_POSTMASTER_DEATH));
+        /* Postmaster-managed callers must handle postmaster death somehow. */
+        Assert(!IsUnderPostmaster ||
+               (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+               (wakeEvents & WL_POSTMASTER_DEATH));
 
-    if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
-        AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
-                          NULL, NULL);
+        if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
+            AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+                              NULL, NULL);
 
-    if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
-        AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
-                          NULL, NULL);
+        if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
+            AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+                              NULL, NULL);
 
-    if (wakeEvents & WL_SOCKET_MASK)
-    {
-        int         ev;
+        if (wakeEvents & WL_SOCKET_MASK)
+        {
+            int         ev;
 
-        ev = wakeEvents & WL_SOCKET_MASK;
-        AddWaitEventToSet(set, ev, sock, NULL, NULL);
-    }
+            ev = wakeEvents & WL_SOCKET_MASK;
+            AddWaitEventToSet(set, ev, sock, NULL, NULL);
+        }
 
-    rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+        rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
 
-    if (rc == 0)
-        ret |= WL_TIMEOUT;
-    else
+        if (rc == 0)
+            ret |= WL_TIMEOUT;
+        else
+        {
+            ret |= event.events & (WL_LATCH_SET |
+                                   WL_POSTMASTER_DEATH |
+                                   WL_SOCKET_MASK);
+        }
+    }
+    PG_FINALLY();
     {
-        ret |= event.events & (WL_LATCH_SET |
-                               WL_POSTMASTER_DEATH |
-                               WL_SOCKET_MASK);
+        FreeWaitEventSet(set);
     }
-
-    FreeWaitEventSet(set);
+    PG_END_TRY();
 
     return ret;
 }
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 1543da6..2acba39 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -1020,6 +1020,44 @@ LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
 }
 
 /*
+ *		ConditionalLockDatabaseObject
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+                              LOCKMODE lockmode)
+{
+    LOCKTAG     tag;
+    LOCALLOCK  *locallock;
+    LockAcquireResult res;
+
+    SET_LOCKTAG_OBJECT(tag,
+                       MyDatabaseId,
+                       classid,
+                       objid,
+                       objsubid);
+
+    res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+    if (res == LOCKACQUIRE_NOT_AVAIL)
+        return false;
+
+    /*
+     * Now that we have the lock, check for invalidation messages; see notes
+     * in LockRelationOid.
+     */
+    if (res != LOCKACQUIRE_ALREADY_CLEAR)
+    {
+        AcceptInvalidationMessages();
+        MarkLockClear(locallock);
+    }
+
+    return true;
+}
+
+/*
  *		UnlockDatabaseObject
  */
 void
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 631733b..8ded269 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -604,8 +604,9 @@ BlockNumber
 smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
 {
     /*
-     * For now, we only use cached values in recovery due to lack of a shared
-     * invalidation mechanism for changes in file size.
+     * For now, this function uses cached values only in recovery due to lack
+     * of a shared invalidation mechanism for changes in file size. Code
+     * elsewhere reads smgr_cached_nblocks and copes with stale data.
     */
     if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
         return reln->smgr_cached_nblocks[forknum];
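
Note on the dsm_impl.c hunk: dsm_impl_mmap() counts down the bytes still to
zero-fill, and a 32-bit counter silently truncates any request_size of 4GB or
more, so the loop would write the wrong amount. A standalone sketch of that
truncation; the 6GB figure, the program, and its assumption of a 64-bit build
are purely illustrative and not part of the patch:

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
    /* 6 GB request, representable in size_t on a 64-bit build */
    size_t      request_size = (size_t) 6 * 1024 * 1024 * 1024;
    uint32_t    remaining32 = (uint32_t) request_size;  /* wraps modulo 2^32 -> 2 GB */
    size_t      remaining = request_size;               /* keeps the full value */

    printf("uint32 remaining: %" PRIu32 " bytes\n", remaining32);
    printf("Size   remaining: %zu bytes\n", remaining);
    return 0;
}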
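Note on the latch.c hunk: wrapping the body of WaitLatchOrSocket() in
PG_TRY()/PG_FINALLY() ensures FreeWaitEventSet() runs even when something
inside the block raises an error, instead of leaking the WaitEventSet (and,
on epoll builds, its file descriptor). A minimal sketch of that cleanup
pattern, using the same pre-PG-17 CreateWaitEventSet() signature as the hunk;
wait_on_latch_sketch() is an invented name, not part of the patch:

#include "postgres.h"

#include "storage/latch.h"

/* Illustrative only: wait for a latch without leaking the WaitEventSet. */
static int
wait_on_latch_sketch(Latch *latch, long timeout, uint32 wait_event_info)
{
    WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 1);
    WaitEvent   event;
    int         rc = 0;

    PG_TRY();
    {
        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, latch, NULL);
        rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
    }
    PG_FINALLY();
    {
        /* Runs on both normal exit and error exit, before any re-throw. */
        FreeWaitEventSet(set);
    }
    PG_END_TRY();

    return rc;
}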
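Note on the lmgr.c hunk: ConditionalLockDatabaseObject() is the non-blocking
counterpart of LockDatabaseObject(), in the same spirit as the existing
ConditionalLockRelation()/ConditionalLockRelationOid(). A hypothetical caller
could use it to skip, rather than wait on, an object someone else has locked.
In the sketch below, maybe_process_function(), the choice of pg_proc, and
AccessExclusiveLock are all assumptions for illustration, and the prototype is
assumed to be exported via storage/lmgr.h (a header change this diff does not
show):

#include "postgres.h"

#include "catalog/pg_proc.h"
#include "storage/lmgr.h"

/* Illustrative only: act on a function's catalog entry, or skip it. */
static bool
maybe_process_function(Oid funcid)
{
    if (!ConditionalLockDatabaseObject(ProcedureRelationId, funcid, 0,
                                       AccessExclusiveLock))
        return false;           /* lock not available; caller retries later */

    /* ... work that requires the object lock goes here ... */

    UnlockDatabaseObject(ProcedureRelationId, funcid, 0, AccessExclusiveLock);
    return true;
}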