author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-14 19:16:20 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-05-14 19:16:20 +0000
commit     323bcca5249c707b68d9f6d921d86fd750bcf33e (patch)
tree       07b4722c510482f5ee2fdcc3d381fc77747b0178 /src/backend/storage
parent     Adding debian version 16.2-2. (diff)
download   postgresql-16-323bcca5249c707b68d9f6d921d86fd750bcf33e.tar.xz
           postgresql-16-323bcca5249c707b68d9f6d921d86fd750bcf33e.zip
Merging upstream version 16.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage')
-rw-r--r--  src/backend/storage/freespace/README        16
-rw-r--r--  src/backend/storage/freespace/freespace.c  106
-rw-r--r--  src/backend/storage/ipc/dsm_impl.c           2
-rw-r--r--  src/backend/storage/ipc/latch.c             70
-rw-r--r--  src/backend/storage/lmgr/lmgr.c             38
-rw-r--r--  src/backend/storage/smgr/smgr.c              5
6 files changed, 187 insertions, 50 deletions
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b..dc2a63a 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,9 +169,7 @@ Recovery
--------
The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption. As a result when
-we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
-rather than MarkBufferDirty().
+self-correcting measures to repair possible corruption.
First of all, whenever a value is set on an FSM page, the root node of the
page is compared against the new value after bubbling up the change is done.
@@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are
immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
to propagate the new free-space info into the upper pages of the FSM tree.
+As a result when we write to the FSM we treat that as a hint and thus use
+MarkBufferDirtyHint() rather than MarkBufferDirty(). Every read here uses
+RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification
+failures. We'd operate correctly without the full page images that
+MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot
+knowledge to RBM_ZERO_ON_ERROR.
+
+Relation extension is not WAL-logged. Hence, after WAL replay, an on-disk FSM
+slot may indicate free space in PageIsNew() blocks that never reached disk.
+We detect this case by comparing against the actual relation size, and we mark
+the block as full in that case.
+
TODO
----
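
The README paragraphs above describe the FSM's hint-style write protocol. A
minimal sketch of that pattern under stated assumptions (the usual backend
headers plus freespace.c internals; fsm_blkno, slot, and new_cat are
placeholder variables, and fsm_set_avail() is the file-local helper the text
names):

	Buffer		buf;
	Page		page;

	/*
	 * RBM_ZERO_ON_ERROR: a page that fails checksum or other verification
	 * comes back all-zeros instead of raising an ERROR; the self-correcting
	 * measures described above tolerate the lost slot information.
	 */
	buf = ReadBufferExtended(rel, FSM_FORKNUM, fsm_blkno,
							 RBM_ZERO_ON_ERROR, NULL);
	LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
	page = BufferGetPage(buf);

	fsm_set_avail(page, slot, new_cat); /* set the leaf, bubble up to root */

	/*
	 * Hint semantics: no WAL is required for correctness, but when checksums
	 * or wal_log_hints are enabled this may emit a full-page image, which
	 * reduces the chance of later losing these slots to RBM_ZERO_ON_ERROR.
	 */
	MarkBufferDirtyHint(buf, false);
	UnlockReleaseBuffer(buf);
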
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index fb9440f..e27e8ca 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof_p);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
/******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
* amount of free space available on that page and then try again (see
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
* extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
*/
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
* Otherwise, search as usual.
*/
if (search_slot != -1)
- return fsm_get_heap_blk(addr, search_slot);
- else
- return fsm_search(rel, search_cat);
+ {
+ BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+ /*
+ * Check that the blknum is actually in the relation. Don't try to
+ * update the FSM in that case, just fall back to the other case
+ */
+ if (fsm_does_block_exist(rel, blknum))
+ return blknum;
+ }
+ return fsm_search(rel, search_cat);
}
/*
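
The hunk above makes RecordAndGetPageWithFreeSpace() validate the slot's
block before returning it. For orientation, a condensed sketch of the typical
caller loop (variable names are placeholders, not a quote of hio.c):

	BlockNumber targetBlock = GetPageWithFreeSpace(rel, spaceNeeded);

	while (targetBlock != InvalidBlockNumber)
	{
		/* ... try to place the tuple on targetBlock; on success, stop ... */

		/*
		 * The page was too full after all: report its actual free space and
		 * ask for another candidate.  With the check above, any block
		 * returned is known to lie within the relation.
		 */
		targetBlock = RecordAndGetPageWithFreeSpace(rel, targetBlock,
													pageFreeSpace,
													spaceNeeded);
	}

	if (targetBlock == InvalidBlockNumber)
	{
		/* the FSM found nothing usable; extend the relation */
	}
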
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
/*
- * Truncation of a relation is WAL-logged at a higher-level, and we
- * will be called at WAL replay. But if checksums are enabled, we need
- * to still write a WAL record to protect against a torn page, if the
- * page is flushed to disk before the truncation WAL record. We cannot
- * use MarkBufferDirtyHint here, because that will not dirty the page
- * during recovery.
+ * This change is non-critical, because fsm_does_block_exist() would
+ * stop us from returning a truncated-away block. However, since this
+ * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+ * of that many fsm_does_block_exist() rejections. Use a full
+ * MarkBufferDirty(), not MarkBufferDirtyHint().
*/
MarkBufferDirty(buf);
+
+ /*
+ * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+ * differing from the rest of the file in this respect. This is
+ * optional; see README mention of full page images. XXX consider
+ * XLogSaveBufferForHint() for even closer similarity.
+ *
+ * A higher-level operation calls us at WAL replay. If we crash
+ * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+ * not changed, and our fork remains valid. If we crash after that
+ * flush, redo will return here.
+ */
if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
log_newpage_buffer(buf, false);
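
The guard on log_newpage_buffer() reproduces by hand the conditions under
which MarkBufferDirtyHint() would have WAL-logged a full-page image. An
annotated sketch of the same test (XLogHintBitIsNeeded() is, to my
understanding, a macro over the data-checksums and wal_log_hints checks):

	if (!InRecovery &&				/* never emit WAL during replay */
		RelationNeedsWAL(rel) &&	/* skip temporary/unlogged relations */
		XLogHintBitIsNeeded())		/* checksums or wal_log_hints enabled */
		log_newpage_buffer(buf, false); /* FPI protects against torn pages */
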
@@ -681,8 +704,15 @@ fsm_search(Relation rel, uint8 min_cat)
(addr.level == FSM_BOTTOM_LEVEL),
false);
if (slot == -1)
+ {
max_avail = fsm_get_max_avail(BufferGetPage(buf));
- UnlockReleaseBuffer(buf);
+ UnlockReleaseBuffer(buf);
+ }
+ else
+ {
+ /* Keep the pin for possible update below */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
}
else
slot = -1;
@@ -694,8 +724,37 @@ fsm_search(Relation rel, uint8 min_cat)
* bottom.
*/
if (addr.level == FSM_BOTTOM_LEVEL)
- return fsm_get_heap_blk(addr, slot);
-
+ {
+ BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+ Page page;
+
+ if (fsm_does_block_exist(rel, blkno))
+ {
+ ReleaseBuffer(buf);
+ return blkno;
+ }
+
+ /*
+ * Block is past the end of the relation. Update FSM, and
+ * restart from root. The usual "advancenext" behavior is
+ * pessimal for this rare scenario, since every later slot is
+ * unusable in the same way. We could zero all affected slots
+ * on the same FSM page, but don't bet on the benefits of that
+ * optimization justifying its compiled code bulk.
+ */
+ page = BufferGetPage(buf);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ fsm_set_avail(page, slot, 0);
+ MarkBufferDirtyHint(buf, false);
+ UnlockReleaseBuffer(buf);
+ if (restarts++ > 10000) /* same rationale as below */
+ return InvalidBlockNumber;
+ addr = FSM_ROOT_ADDRESS;
+ }
+ else
+ {
+ ReleaseBuffer(buf);
+ }
addr = fsm_get_child(addr, slot);
}
else if (addr.level == FSM_ROOT_LEVEL)
@@ -863,3 +922,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
return max_avail;
}
+
+
+/*
+ * Check whether a block number is past the end of the relation. This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+ SMgrRelation smgr = RelationGetSmgr(rel);
+
+ /*
+ * If below the cached nblocks, the block surely exists. Otherwise, we
+ * face a trade-off. We opt to compare to a fresh nblocks, incurring
+ * lseek() overhead. The alternative would be to assume the block does
+ * not exist, but that would cause FSM to set zero space available for
+ * blocks that main fork extension just recorded.
+ */
+ return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+ blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ blknumber < RelationGetNumberOfBlocks(rel));
+}
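
Note the cost model behind fsm_does_block_exist(): the cached-nblocks test is
free, while the fallback goes to the kernel. A condensed sketch of the
fallback's call chain as I understand it (not a verbatim quote of the
sources):

	RelationGetNumberOfBlocks(rel)
		-> RelationGetNumberOfBlocksInFork(rel, MAIN_FORKNUM)
			-> smgrnblocks(RelationGetSmgr(rel), MAIN_FORKNUM)
				-> mdnblocks(): roughly lseek(fd, 0, SEEK_END) / BLCKSZ

This probe cost also fits the restart-from-root choice in fsm_search() above:
slots past the end of the relation would keep failing this check, so the code
zeroes the bogus slot and starts over rather than advancing slot by slot.
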
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 6399fa2..0ac0035 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -873,7 +873,7 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
* transferring data to the kernel.
*/
char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
- uint32 remaining = request_size;
+ Size remaining = request_size;
bool success = true;
/*
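
The one-line dsm_impl.c change fixes an integer-width hazard: request_size is
a Size (64 bits on LP64 builds), and assigning it to a uint32 silently
truncates segments of 4 GB or more, cutting the zero-fill loop short. A
self-contained demonstration of the truncation (illustrative C, not
PostgreSQL code):

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		size_t		request_size = (size_t) 5 * 1024 * 1024 * 1024; /* 5 GB */
		uint32_t	old_remaining = request_size;	/* truncates to 1 GB */
		size_t		new_remaining = request_size;	/* keeps full width */

		printf("uint32 remaining: %u\n", old_remaining);	/* 1073741824 */
		printf("Size remaining:   %zu\n", new_remaining);	/* 5368709120 */
		return 0;
	}
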
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index cdb95c1..7673280 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -545,48 +545,54 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
WaitEvent event;
WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
- if (wakeEvents & WL_TIMEOUT)
- Assert(timeout >= 0);
- else
- timeout = -1;
+ PG_TRY();
+ {
+ if (wakeEvents & WL_TIMEOUT)
+ Assert(timeout >= 0);
+ else
+ timeout = -1;
- if (wakeEvents & WL_LATCH_SET)
- AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
- latch, NULL);
+ if (wakeEvents & WL_LATCH_SET)
+ AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+ latch, NULL);
- /* Postmaster-managed callers must handle postmaster death somehow. */
- Assert(!IsUnderPostmaster ||
- (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
- (wakeEvents & WL_POSTMASTER_DEATH));
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
- if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
- AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
- NULL, NULL);
+ if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
- if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
- AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
- NULL, NULL);
+ if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
- if (wakeEvents & WL_SOCKET_MASK)
- {
- int ev;
+ if (wakeEvents & WL_SOCKET_MASK)
+ {
+ int ev;
- ev = wakeEvents & WL_SOCKET_MASK;
- AddWaitEventToSet(set, ev, sock, NULL, NULL);
- }
+ ev = wakeEvents & WL_SOCKET_MASK;
+ AddWaitEventToSet(set, ev, sock, NULL, NULL);
+ }
- rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+ rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
- if (rc == 0)
- ret |= WL_TIMEOUT;
- else
+ if (rc == 0)
+ ret |= WL_TIMEOUT;
+ else
+ {
+ ret |= event.events & (WL_LATCH_SET |
+ WL_POSTMASTER_DEATH |
+ WL_SOCKET_MASK);
+ }
+ }
+ PG_FINALLY();
{
- ret |= event.events & (WL_LATCH_SET |
- WL_POSTMASTER_DEATH |
- WL_SOCKET_MASK);
+ FreeWaitEventSet(set);
}
-
- FreeWaitEventSet(set);
+ PG_END_TRY();
return ret;
}
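
The latch.c rewrite wraps the wait in PG_TRY()/PG_FINALLY() so the
WaitEventSet cannot leak if AddWaitEventToSet() or WaitEventSetWait() throws.
Reduced to its skeleton, the cleanup idiom looks like this:

	WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);

	PG_TRY();
	{
		/* anything here may ereport(ERROR) and longjmp out of the block */
	}
	PG_FINALLY();
	{
		/* runs on both the normal path and the error path */
		FreeWaitEventSet(set);
	}
	PG_END_TRY();				/* a pending error is re-thrown here */
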
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index ee9b89a..2da91eb 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -1020,6 +1020,44 @@ LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
}
/*
+ * ConditionalLockDatabaseObject
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
* UnlockDatabaseObject
*/
void
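
A hedged usage sketch for the new ConditionalLockDatabaseObject(); the
pg_subscription classid and the subid variable below are illustrative, not
taken from this patch:

	if (ConditionalLockDatabaseObject(SubscriptionRelationId, subid, 0,
									  AccessShareLock))
	{
		/* lock acquired without blocking; operate on the object */

		UnlockDatabaseObject(SubscriptionRelationId, subid, 0,
							 AccessShareLock);
	}
	else
	{
		/* a conflicting lock is held; skip this object and retry later */
	}
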
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 5d0f3d5..e4a4f66 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -633,8 +633,9 @@ BlockNumber
smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
{
/*
- * For now, we only use cached values in recovery due to lack of a shared
- * invalidation mechanism for changes in file size.
+ * For now, this function uses cached values only in recovery due to lack
+ * of a shared invalidation mechanism for changes in file size. Code
+ * elsewhere reads smgr_cached_nblocks and copes with stale data.
*/
if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
return reln->smgr_cached_nblocks[forknum];
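
The added comment sentence points at code that reads smgr_cached_nblocks
directly and copes with staleness; fsm_does_block_exist() earlier in this
patch is exactly such a caller. Condensed from that function:

	/*
	 * Below the cached size, the block certainly exists.  At or above it,
	 * the cache may be stale, so fall back to an authoritative,
	 * lseek-backed answer from RelationGetNumberOfBlocks().
	 */
	if (BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
		blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM])
		return true;
	return blknumber < RelationGetNumberOfBlocks(rel);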