-rw-r--r--   src/backend/storage/freespace/README       |  16
-rw-r--r--   src/backend/storage/freespace/freespace.c  | 106
-rw-r--r--   src/backend/storage/ipc/dsm_impl.c         |   2
-rw-r--r--   src/backend/storage/ipc/latch.c            |  70
-rw-r--r--   src/backend/storage/lmgr/lmgr.c            |  38
-rw-r--r--   src/backend/storage/smgr/smgr.c             |   5
6 files changed, 187 insertions, 50 deletions
diff --git a/src/backend/storage/freespace/README b/src/backend/storage/freespace/README
index e7ff23b..dc2a63a 100644
--- a/src/backend/storage/freespace/README
+++ b/src/backend/storage/freespace/README
@@ -169,9 +169,7 @@ Recovery
--------
The FSM is not explicitly WAL-logged. Instead, we rely on a bunch of
-self-correcting measures to repair possible corruption. As a result when
-we write to the FSM we treat that as a hint and thus use MarkBufferDirtyHint()
-rather than MarkBufferDirty().
+self-correcting measures to repair possible corruption.
First of all, whenever a value is set on an FSM page, the root node of the
page is compared against the new value after bubbling up the change is
@@ -188,6 +186,18 @@ goes through fsm_set_avail(), so that the upper nodes on those pages are
immediately updated. Periodically, VACUUM calls FreeSpaceMapVacuum[Range]
to propagate the new free-space info into the upper pages of the FSM tree.
+As a result, when we write to the FSM we treat that as a hint and thus use
+MarkBufferDirtyHint() rather than MarkBufferDirty(). Every read here uses
+RBM_ZERO_ON_ERROR to bypass checksum mismatches and other verification
+failures. We'd operate correctly without the full page images that
+MarkBufferDirtyHint() provides, but they do decrease the chance of losing slot
+knowledge to RBM_ZERO_ON_ERROR.
+
+Relation extension is not WAL-logged. Hence, after WAL replay, an on-disk FSM
+slot may indicate free space in PageIsNew() blocks that never reached disk.
+We detect this by comparing against the actual relation size and mark such a
+block as full.
+
TODO
----
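For readers less familiar with the buffer-access routines the README leans on, the read/write discipline it describes looks roughly like the sketch below, loosely modeled on fsm_readbuf() and fsm_set_and_search() in freespace.c. It is illustrative only, not code from this patch, and omits the search path and page initialization.

    /* Sketch: read an FSM page, tolerating verification failures. */
    static Buffer
    fsm_readbuf_sketch(Relation rel, BlockNumber fsm_blkno)
    {
        /* A corrupt or torn page is zeroed rather than raising an error. */
        return ReadBufferExtended(rel, FSM_FORKNUM, fsm_blkno,
                                  RBM_ZERO_ON_ERROR, NULL);
    }

    /* Sketch: update one slot, dirtying the buffer only as a hint. */
    static void
    fsm_set_slot_sketch(Buffer buf, int slot, uint8 new_cat)
    {
        LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
        if (fsm_set_avail(BufferGetPage(buf), slot, new_cat))
            MarkBufferDirtyHint(buf, false);    /* hint, not WAL-critical */
        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
    }

If a page does get zeroed by RBM_ZERO_ON_ERROR, the self-correcting measures above repopulate it over time, at worst losing some free-space knowledge until the next VACUUM.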
diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c
index d41ae37..01adf28 100644
--- a/src/backend/storage/freespace/freespace.c
+++ b/src/backend/storage/freespace/freespace.c
@@ -112,6 +112,7 @@ static BlockNumber fsm_search(Relation rel, uint8 min_cat);
static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
BlockNumber start, BlockNumber end,
bool *eof);
+static bool fsm_does_block_exist(Relation rel, BlockNumber blknumber);
/******** Public API ********/
@@ -128,6 +129,9 @@ static uint8 fsm_vacuum_page(Relation rel, FSMAddress addr,
* amount of free space available on that page and then try again (see
* RecordAndGetPageWithFreeSpace). If InvalidBlockNumber is returned,
* extend the relation.
+ *
+ * This can trigger FSM updates if any FSM entry is found to point to a block
+ * past the end of the relation.
*/
BlockNumber
GetPageWithFreeSpace(Relation rel, Size spaceNeeded)
@@ -166,9 +170,17 @@ RecordAndGetPageWithFreeSpace(Relation rel, BlockNumber oldPage,
* Otherwise, search as usual.
*/
if (search_slot != -1)
- return fsm_get_heap_blk(addr, search_slot);
- else
- return fsm_search(rel, search_cat);
+ {
+ BlockNumber blknum = fsm_get_heap_blk(addr, search_slot);
+
+ /*
+ * Check that the blknum is actually in the relation. Don't try to
+ * update the FSM in that case, just fall back to the other case
+ */
+ if (fsm_does_block_exist(rel, blknum))
+ return blknum;
+ }
+ return fsm_search(rel, search_cat);
}
/*
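For context, the contract documented above is consumed by a retry loop of roughly the following shape, loosely modeled on the heap code in hio.c. The function name and the omitted extension step are illustrative, not part of this patch.

    static Buffer
    get_buffer_with_space_sketch(Relation rel, Size len)
    {
        BlockNumber targetBlock = GetPageWithFreeSpace(rel, len);

        while (targetBlock != InvalidBlockNumber)
        {
            Buffer      buffer = ReadBuffer(rel, targetBlock);
            Size        freespace;

            LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
            freespace = PageGetHeapFreeSpace(BufferGetPage(buffer));
            if (freespace >= len)
                return buffer;      /* caller inserts while holding the lock */

            /* Page was fuller than advertised; record that and try another. */
            UnlockReleaseBuffer(buffer);
            targetBlock = RecordAndGetPageWithFreeSpace(rel, targetBlock,
                                                        freespace, len);
        }

        /* No tracked page has room; the real code extends the relation. */
        return InvalidBuffer;
    }

With the new fsm_does_block_exist() check, the block number returned into such a loop refers to a block that existed when the FSM was consulted, so the ReadBuffer() call no longer risks a read-past-end-of-relation failure right after crash recovery.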
@@ -297,14 +309,25 @@ FreeSpaceMapPrepareTruncateRel(Relation rel, BlockNumber nblocks)
fsm_truncate_avail(BufferGetPage(buf), first_removed_slot);
/*
- * Truncation of a relation is WAL-logged at a higher-level, and we
- * will be called at WAL replay. But if checksums are enabled, we need
- * to still write a WAL record to protect against a torn page, if the
- * page is flushed to disk before the truncation WAL record. We cannot
- * use MarkBufferDirtyHint here, because that will not dirty the page
- * during recovery.
+ * This change is non-critical, because fsm_does_block_exist() would
+ * stop us from returning a truncated-away block. However, since this
+ * may remove up to SlotsPerFSMPage slots, it's nice to avoid the cost
+ * of that many fsm_does_block_exist() rejections. Use a full
+ * MarkBufferDirty(), not MarkBufferDirtyHint().
*/
MarkBufferDirty(buf);
+
+ /*
+ * WAL-log like MarkBufferDirtyHint() might have done, just to avoid
+ * differing from the rest of the file in this respect. This is
+ * optional; see README mention of full page images. XXX consider
+ * XLogSaveBufferForHint() for even closer similarity.
+ *
+ * A higher-level operation calls us at WAL replay. If we crash
+ * before the XLOG_SMGR_TRUNCATE flushes to disk, main fork length has
+ * not changed, and our fork remains valid. If we crash after that
+ * flush, redo will return here.
+ */
if (!InRecovery && RelationNeedsWAL(rel) && XLogHintBitIsNeeded())
log_newpage_buffer(buf, false);
@@ -721,8 +744,15 @@ fsm_search(Relation rel, uint8 min_cat)
(addr.level == FSM_BOTTOM_LEVEL),
false);
if (slot == -1)
+ {
max_avail = fsm_get_max_avail(BufferGetPage(buf));
- UnlockReleaseBuffer(buf);
+ UnlockReleaseBuffer(buf);
+ }
+ else
+ {
+ /* Keep the pin for possible update below */
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ }
}
else
slot = -1;
@@ -734,8 +764,37 @@ fsm_search(Relation rel, uint8 min_cat)
* bottom.
*/
if (addr.level == FSM_BOTTOM_LEVEL)
- return fsm_get_heap_blk(addr, slot);
-
+ {
+ BlockNumber blkno = fsm_get_heap_blk(addr, slot);
+ Page page;
+
+ if (fsm_does_block_exist(rel, blkno))
+ {
+ ReleaseBuffer(buf);
+ return blkno;
+ }
+
+ /*
+ * Block is past the end of the relation. Update FSM, and
+ * restart from root. The usual "advancenext" behavior is
+ * pessimal for this rare scenario, since every later slot is
+ * unusable in the same way. We could zero all affected slots
+ * on the same FSM page, but don't bet on the benefits of that
+ * optimization justifying its compiled code bulk.
+ */
+ page = BufferGetPage(buf);
+ LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+ fsm_set_avail(page, slot, 0);
+ MarkBufferDirtyHint(buf, false);
+ UnlockReleaseBuffer(buf);
+ if (restarts++ > 10000) /* same rationale as below */
+ return InvalidBlockNumber;
+ addr = FSM_ROOT_ADDRESS;
+ }
+ else
+ {
+ ReleaseBuffer(buf);
+ }
addr = fsm_get_child(addr, slot);
}
else if (addr.level == FSM_ROOT_LEVEL)
@@ -903,3 +962,26 @@ fsm_vacuum_page(Relation rel, FSMAddress addr,
return max_avail;
}
+
+
+/*
+ * Check whether a block number is past the end of the relation. This can
+ * happen after WAL replay, if the FSM reached disk but newly-extended pages
+ * it refers to did not.
+ */
+static bool
+fsm_does_block_exist(Relation rel, BlockNumber blknumber)
+{
+ SMgrRelation smgr = RelationGetSmgr(rel);
+
+ /*
+ * If below the cached nblocks, the block surely exists. Otherwise, we
+ * face a trade-off. We opt to compare to a fresh nblocks, incurring
+ * lseek() overhead. The alternative would be to assume the block does
+ * not exist, but that would cause FSM to set zero space available for
+ * blocks that main fork extension just recorded.
+ */
+ return ((BlockNumberIsValid(smgr->smgr_cached_nblocks[MAIN_FORKNUM]) &&
+ blknumber < smgr->smgr_cached_nblocks[MAIN_FORKNUM]) ||
+ blknumber < RelationGetNumberOfBlocks(rel));
+}
diff --git a/src/backend/storage/ipc/dsm_impl.c b/src/backend/storage/ipc/dsm_impl.c
index 340048a..77a195b 100644
--- a/src/backend/storage/ipc/dsm_impl.c
+++ b/src/backend/storage/ipc/dsm_impl.c
@@ -880,7 +880,7 @@ dsm_impl_mmap(dsm_op op, dsm_handle handle, Size request_size,
* transferring data to the kernel.
*/
char *zbuffer = (char *) palloc0(ZBUFFER_SIZE);
- uint32 remaining = request_size;
+ Size remaining = request_size;
bool success = true;
/*
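The one-line type change above matters once request_size can exceed 4 GB. A standalone illustration of the truncation the old uint32 declaration allowed, assuming a 64-bit build where size_t is wider than 32 bits (plain C, not PostgreSQL code):

    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        size_t      request_size = (size_t) 5 * 1024 * 1024 * 1024;    /* 5 GB */
        uint32_t    truncated = (uint32_t) request_size;    /* old declaration */
        size_t      widened = request_size;                 /* new declaration */

        /* Prints "1073741824 vs 5368709120"; the zero-fill loop would stop early. */
        printf("%u vs %zu\n", truncated, widened);
        return 0;
    }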
diff --git a/src/backend/storage/ipc/latch.c b/src/backend/storage/ipc/latch.c
index 8bdb549..c865281 100644
--- a/src/backend/storage/ipc/latch.c
+++ b/src/backend/storage/ipc/latch.c
@@ -529,48 +529,54 @@ WaitLatchOrSocket(Latch *latch, int wakeEvents, pgsocket sock,
WaitEvent event;
WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 3);
- if (wakeEvents & WL_TIMEOUT)
- Assert(timeout >= 0);
- else
- timeout = -1;
+ PG_TRY();
+ {
+ if (wakeEvents & WL_TIMEOUT)
+ Assert(timeout >= 0);
+ else
+ timeout = -1;
- if (wakeEvents & WL_LATCH_SET)
- AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
- latch, NULL);
+ if (wakeEvents & WL_LATCH_SET)
+ AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET,
+ latch, NULL);
- /* Postmaster-managed callers must handle postmaster death somehow. */
- Assert(!IsUnderPostmaster ||
- (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
- (wakeEvents & WL_POSTMASTER_DEATH));
+ /* Postmaster-managed callers must handle postmaster death somehow. */
+ Assert(!IsUnderPostmaster ||
+ (wakeEvents & WL_EXIT_ON_PM_DEATH) ||
+ (wakeEvents & WL_POSTMASTER_DEATH));
- if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
- AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
- NULL, NULL);
+ if ((wakeEvents & WL_POSTMASTER_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_POSTMASTER_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
- if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
- AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
- NULL, NULL);
+ if ((wakeEvents & WL_EXIT_ON_PM_DEATH) && IsUnderPostmaster)
+ AddWaitEventToSet(set, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+ NULL, NULL);
- if (wakeEvents & WL_SOCKET_MASK)
- {
- int ev;
+ if (wakeEvents & WL_SOCKET_MASK)
+ {
+ int ev;
- ev = wakeEvents & WL_SOCKET_MASK;
- AddWaitEventToSet(set, ev, sock, NULL, NULL);
- }
+ ev = wakeEvents & WL_SOCKET_MASK;
+ AddWaitEventToSet(set, ev, sock, NULL, NULL);
+ }
- rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
+ rc = WaitEventSetWait(set, timeout, &event, 1, wait_event_info);
- if (rc == 0)
- ret |= WL_TIMEOUT;
- else
+ if (rc == 0)
+ ret |= WL_TIMEOUT;
+ else
+ {
+ ret |= event.events & (WL_LATCH_SET |
+ WL_POSTMASTER_DEATH |
+ WL_SOCKET_MASK);
+ }
+ }
+ PG_FINALLY();
{
- ret |= event.events & (WL_LATCH_SET |
- WL_POSTMASTER_DEATH |
- WL_SOCKET_MASK);
+ FreeWaitEventSet(set);
}
-
- FreeWaitEventSet(set);
+ PG_END_TRY();
return ret;
}
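Reduced to its essentials, the change wraps the body in the backend's PG_TRY()/PG_FINALLY() idiom so the WaitEventSet cannot leak if anything inside raises an error. A minimal sketch of that idiom; the single-event set and the PG_WAIT_EXTENSION classification are illustrative choices, not taken from the patch:

    int         ret = 0;
    WaitEvent   event;
    WaitEventSet *set = CreateWaitEventSet(CurrentMemoryContext, 1);

    PG_TRY();
    {
        /* May elog(ERROR), e.g. if the set has no room for the event. */
        AddWaitEventToSet(set, WL_LATCH_SET, PGINVALID_SOCKET, MyLatch, NULL);

        if (WaitEventSetWait(set, -1 /* no timeout */, &event, 1,
                             PG_WAIT_EXTENSION) > 0)
            ret = event.events;
    }
    PG_FINALLY();
    {
        /* Runs on both normal and error exit, so the set is always freed. */
        FreeWaitEventSet(set);
    }
    PG_END_TRY();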
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 1543da6..2acba39 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -1020,6 +1020,44 @@ LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
}
/*
+ * ConditionalLockDatabaseObject
+ *
+ * As above, but only lock if we can get the lock without blocking.
+ * Returns true iff the lock was acquired.
+ */
+bool
+ConditionalLockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
+ LOCKMODE lockmode)
+{
+ LOCKTAG tag;
+ LOCALLOCK *locallock;
+ LockAcquireResult res;
+
+ SET_LOCKTAG_OBJECT(tag,
+ MyDatabaseId,
+ classid,
+ objid,
+ objsubid);
+
+ res = LockAcquireExtended(&tag, lockmode, false, true, true, &locallock);
+
+ if (res == LOCKACQUIRE_NOT_AVAIL)
+ return false;
+
+ /*
+ * Now that we have the lock, check for invalidation messages; see notes
+ * in LockRelationOid.
+ */
+ if (res != LOCKACQUIRE_ALREADY_CLEAR)
+ {
+ AcceptInvalidationMessages();
+ MarkLockClear(locallock);
+ }
+
+ return true;
+}
+
+/*
* UnlockDatabaseObject
*/
void
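A hedged usage sketch for the new function; the catalog (pg_proc), the funcoid parameter, and the lock mode are illustrative, not taken from the patch:

    /* Try to lock the pg_proc entry for funcoid without blocking. */
    static bool
    try_lock_function_sketch(Oid funcoid)
    {
        if (!ConditionalLockDatabaseObject(ProcedureRelationId, funcoid, 0,
                                           AccessExclusiveLock))
            return false;       /* a conflicting lock is held; skip for now */

        /* ... operate on the function here ... */

        UnlockDatabaseObject(ProcedureRelationId, funcoid, 0,
                             AccessExclusiveLock);
        return true;
    }

As in LockDatabaseObject(), invalidation messages are accepted once the lock is marked clear, so the caller sees an up-to-date view of the object.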
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 631733b..8ded269 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -604,8 +604,9 @@ BlockNumber
smgrnblocks_cached(SMgrRelation reln, ForkNumber forknum)
{
/*
- * For now, we only use cached values in recovery due to lack of a shared
- * invalidation mechanism for changes in file size.
+ * For now, this function uses cached values only in recovery due to lack
+ * of a shared invalidation mechanism for changes in file size. Code
+ * elsewhere reads smgr_cached_nblocks and copes with stale data.
*/
if (InRecovery && reln->smgr_cached_nblocks[forknum] != InvalidBlockNumber)
return reln->smgr_cached_nblocks[forknum];
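The division of labor the revised comment describes can be seen from a consumer's side. A sketch (the helper name is made up) that prefers the no-I/O cached answer and pays for an lseek()-backed smgrnblocks() only when no cached value is available:

    static BlockNumber
    get_nblocks_cheaply_sketch(SMgrRelation reln, ForkNumber forknum)
    {
        BlockNumber nblocks = smgrnblocks_cached(reln, forknum);

        if (nblocks != InvalidBlockNumber)
            return nblocks;         /* served from smgr_cached_nblocks */

        return smgrnblocks(reln, forknum);  /* asks the kernel for the size */
    }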