summaryrefslogtreecommitdiffstats
path: root/src/backend/storage/freespace
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-14 19:16:24 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-14 19:16:24 +0000
commit2a0f262beff32ba86bcb58f3273214e5d0517c09 (patch)
tree24c0ad10dab36bbd5c22743d3c88c4e0ccd5bc65 /src/backend/storage/freespace
parentReleasing progress-linux version 16.2-2~progress7.99u1. (diff)
downloadpostgresql-16-2a0f262beff32ba86bcb58f3273214e5d0517c09.tar.xz
postgresql-16-2a0f262beff32ba86bcb58f3273214e5d0517c09.zip
Merging upstream version 16.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'src/backend/storage/freespace')
-rw-r--r--src/backend/storage/freespace/README16
-rw-r--r--src/backend/storage/freespace/freespace.c106
2 files changed, 107 insertions, 15 deletions
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b..dc2a63a 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,9 +169,7 @@ Recovery
--------
The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption. As a result when
-we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
-rather than MarkBufferDirty().
+self-correcting measures to repair possible corruption.
First of all, whenever a value is set on an FSM page, the root node of the
page is compared against the new value after bubbling up the change is
@@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are
immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
to propagate the new free-space info into the upper pages of the FSM tree.
+As a result when we write to the FSM we treat that as a hint and thus use
+MarkBufferDirtyHint() rather than MarkBufferDirty(). Every read here uses
+RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification
+failures. We'd operate correctly without the full page images that
+MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot
+knowledge to RBM_ZERO_ON_ERROR.
+
+Relation extension is not WAL-logged. Hence, after WAL replay, an on-disk FSM
+slot may indicate free space in PageIsNew() blocks that never reached disk.
+We detect this case by comparing against the actual relation size, and we mark
+the block as full in that case.
+
TODO
----
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index fb9440f..e27e8ca 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof_p);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
/******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
* amount of free space available on that page and then try again (see
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
* extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
*/
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
* Otherwise, search as usual.
*/
if (search_slot != -1)
- return fsm_get_heap_blk(addr, search_slot);
- else
- return fsm_search(rel, search_cat);
+ {
+ BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+ /*
+ * Check that the blknum is actually in the relation. Don't try to
+ * update the FSM in that case, just fall back to the other case
+ */
+ if (fsm_does_block_exist(rel, blknum))
+ return blknum;
+ }
+ return fsm_search(rel, search_cat);
}
/*
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
/*
- * Truncation of a relation is WAL-logged at a higher-level, and we
- * will be called at WAL replay. But if checksums are enabled, we need
- * to still write a WAL record to protect against a torn page, if the
- * page is flushed to disk before the truncation WAL record. We cannot
- * use MarkBufferDirtyHint here, because that will not dirty the page
- * during recovery.
+ * This change is non-critical, because fsm_does_block_exist() would
+ * stop us from returning a truncated-away block. However, since this
+ * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+ * of that many fsm_does_block_exist() rejections. Use a full
+ * MarkBufferDirty(), not MarkBufferDirtyHint().
*/
MarkBufferDirty(buf);
+
+ /*
+ * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+ * differing from the rest of the file in this respect. This is
+ * optional; see README mention of full page images. XXX consider
+ * XLogSaveBufferForHint() for even closer similarity.
+ *
+ * A higher-level operation calls us at WAL replay. If we crash
+ * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+ * not changed, and our fork remains valid. If we crash after that
+ * flush, redo will return here.
+ */
if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
log_newpage_buffer(buf, false);
@@ -681,8 +704,15 @@ fsm_search(Relation rel, uint8 min_cat)
(addr.level == FSM_BOTTOM_LEVEL),
false);
if (slot == -1)
+ {
max_avail = fsm_get_max_avail(BufferGetPage(buf));
- UnlockReleaseBuffer(buf);
+ UnlockReleaseBuffer(buf);
+ }
+ else
+ {
+ /* Keep the pin for possible update below */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
}
else
slot = -1;
@@ -694,8 +724,37 @@ fsm_search(Relation rel, uint8 min_cat)
* bottom.
*/
if (addr.level == FSM_BOTTOM_LEVEL)
- return fsm_get_heap_blk(addr, slot);
-
+ {
+ BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+ Page page;
+
+ if (fsm_does_block_exist(rel, blkno))
+ {
+ ReleaseBuffer(buf);
+ return blkno;
+ }
+
+ /*
+ * Block is past the end of the relation. Update FSM, and
+ * restart from root. The usual "advancenext" behavior is
+ * pessimal for this rare scenario, since every later slot is
+ * unusable in the same way. We could zero all affected slots
+ * on the same FSM page, but don't bet on the benefits of that
+ * optimization justifying its compiled code bulk.
+ */
+ page = BufferGetPage(buf);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ fsm_set_avail(page, slot, 0);
+ MarkBufferDirtyHint(buf, false);
+ UnlockReleaseBuffer(buf);
+ if (restarts++ > 10000) /* same rationale as below */
+ return InvalidBlockNumber;
+ addr = FSM_ROOT_ADDRESS;
+ }
+ else
+ {
+ ReleaseBuffer(buf);
+ }
addr = fsm_get_child(addr, slot);
}
else if (addr.level == FSM_ROOT_LEVEL)
@@ -863,3 +922,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
return max_avail;
}
+
+
+/*
+ * Check whether a block number is past the end of the relation. This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+ SMgrRelation smgr = RelationGetSmgr(rel);
+
+ /*
+ * If below the cached nblocks, the block surely exists. Otherwise, we
+ * face a trade-off. We opt to compare to a fresh nblocks, incurring
+ * lseek() overhead. The alternative would be to assume the block does
+ * not exist, but that would cause FSM to set zero space available for
+ * blocks that main fork extension just recorded.
+ */
+ return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+ blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ blknumber < RelationGetNumberOfBlocks(rel));
+}