Diffstat (limited to 'src/backend/access/hash/hash.c')
-rw-r--r--   src/backend/access/hash/hash.c   920
1 file changed, 920 insertions, 0 deletions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 0000000..fc5d97f
--- /dev/null
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,920 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c
+ * Implementation of Margo Seltzer's Hashing package for postgres.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/hash/hash.c
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "optimizer/plancat.h"
+#include "pgstat.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/rel.h"
+
+/* Working state for hashbuild and its callback */
+typedef struct
+{
+ HSpool *spool; /* NULL if not using spooling */
+ double indtuples; /* # tuples accepted into index */
+ Relation heapRel; /* heap relation descriptor */
+} HashBuildState;
+
+static void hashbuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+
+
+/*
+ * Hash handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+hashhandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = HTMaxStrategyNumber;
+ amroutine->amsupport = HASHNProcs;
+ amroutine->amoptsprocnum = HASHOPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = false;
+ amroutine->amcanbackward = true;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = false;
+ amroutine->amoptionalkey = false;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = false;
+ amroutine->amstorage = false;
+ amroutine->amclusterable = false;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = false;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amsummarizing = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL;
+ amroutine->amkeytype = INT4OID;
+
+ amroutine->ambuild = hashbuild;
+ amroutine->ambuildempty = hashbuildempty;
+ amroutine->aminsert = hashinsert;
+ amroutine->ambulkdelete = hashbulkdelete;
+ amroutine->amvacuumcleanup = hashvacuumcleanup;
+ amroutine->amcanreturn = NULL;
+ amroutine->amcostestimate = hashcostestimate;
+ amroutine->amoptions = hashoptions;
+ amroutine->amproperty = NULL;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = hashvalidate;
+ amroutine->amadjustmembers = hashadjustmembers;
+ amroutine->ambeginscan = hashbeginscan;
+ amroutine->amrescan = hashrescan;
+ amroutine->amgettuple = hashgettuple;
+ amroutine->amgetbitmap = hashgetbitmap;
+ amroutine->amendscan = hashendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
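+
+/*
+ * Illustrative usage sketch, assuming a hypothetical table "accounts":
+ * the handler above is what the system reaches when a hash index is
+ * created and queried with ordinary SQL, e.g.
+ *
+ *		CREATE INDEX accounts_id_hash ON accounts USING hash (id);
+ *		SELECT * FROM accounts WHERE id = 42;	-- only equality is supported
+ */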
+
+/*
+ * hashbuild() -- build a new hash index.
+ */
+IndexBuildResult *
+hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ BlockNumber relpages;
+ double reltuples;
+ double allvisfrac;
+ uint32 num_buckets;
+ long sort_threshold;
+ HashBuildState buildstate;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /* Estimate the number of rows currently present in the table */
+ estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
+
+ /* Initialize the hash index metadata page and initial buckets */
+ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
+
+ /*
+ * If we just insert the tuples into the index in scan order, then
+ * (assuming their hash codes are pretty random) there will be no locality
+ * of access to the index, and if the index is bigger than available RAM
+ * then we'll thrash horribly. To prevent that scenario, we can sort the
+ * tuples by (expected) bucket number. However, such a sort is useless
+ * overhead when the index does fit in RAM. We choose to sort if the
+ * initial index size exceeds maintenance_work_mem, or the number of
+ * buffers usable for the index, whichever is less. (Limiting by the
+ * number of buffers should reduce thrashing between PG buffers and kernel
+ * buffers, which seems useful even if no physical I/O results. Limiting
+ * by maintenance_work_mem is useful to allow easy testing of the sort
+ * code path, and may be useful to DBAs as an additional control knob.)
+ *
+ * NOTE: this test will need adjustment if a bucket is ever different from
+ * one page. Also, "initial index size" accounting does not include the
+ * metapage, nor the first bitmap page.
+ */
+ sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
+ if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
+ sort_threshold = Min(sort_threshold, NBuffers);
+ else
+ sort_threshold = Min(sort_threshold, NLocBuffer);
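+
+ /*
+ * Worked example (assuming the default BLCKSZ of 8kB and the default
+ * maintenance_work_mem of 64MB, i.e. 65536kB): the starting threshold is
+ * 65536 * 1024 / 8192 = 8192 bucket pages, which is then capped by
+ * NBuffers (or by NLocBuffer for a temporary index).
+ */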
+
+ if (num_buckets >= (uint32) sort_threshold)
+ buildstate.spool = _h_spoolinit(heap, index, num_buckets);
+ else
+ buildstate.spool = NULL;
+
+ /* prepare to build the index */
+ buildstate.indtuples = 0;
+ buildstate.heapRel = heap;
+
+ /* do the heap scan */
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ hashbuildCallback,
+ (void *) &buildstate, NULL);
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
+ buildstate.indtuples);
+
+ if (buildstate.spool)
+ {
+ /* sort the tuples and insert them into the index */
+ _h_indexbuild(buildstate.spool, buildstate.heapRel);
+ _h_spooldestroy(buildstate.spool);
+ }
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+ return result;
+}
+
+/*
+ * hashbuildempty() -- build an empty hash index in the initialization fork
+ */
+void
+hashbuildempty(Relation index)
+{
+ _hash_init(index, 0, INIT_FORKNUM);
+}
+
+/*
+ * Per-tuple callback for table_index_build_scan
+ */
+static void
+hashbuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ HashBuildState *buildstate = (HashBuildState *) state;
+ Datum index_values[1];
+ bool index_isnull[1];
+ IndexTuple itup;
+
+ /* convert data to a hash key; on failure, do not insert anything */
+ if (!_hash_convert_tuple(index,
+ values, isnull,
+ index_values, index_isnull))
+ return;
+
+ /* Either spool the tuple for sorting, or just put it into the index */
+ if (buildstate->spool)
+ _h_spool(buildstate->spool, tid, index_values, index_isnull);
+ else
+ {
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_form_tuple(RelationGetDescr(index),
+ index_values, index_isnull);
+ itup->t_tid = *tid;
+ _hash_doinsert(index, itup, buildstate->heapRel, false);
+ pfree(itup);
+ }
+
+ buildstate->indtuples += 1;
+}
+
+/*
+ * hashinsert() -- insert an index tuple into a hash table.
+ *
+ * Hash on the heap tuple's key, form an index tuple with hash code.
+ * Find the appropriate location for the new tuple, and put it there.
+ */
+bool
+hashinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ Datum index_values[1];
+ bool index_isnull[1];
+ IndexTuple itup;
+
+ /* convert data to a hash key; on failure, do not insert anything */
+ if (!_hash_convert_tuple(rel,
+ values, isnull,
+ index_values, index_isnull))
+ return false;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
+ itup->t_tid = *ht_ctid;
+
+ _hash_doinsert(rel, itup, heapRel, false);
+
+ pfree(itup);
+
+ return false;
+}
+
+
+/*
+ * hashgettuple() -- Get the next tuple in the scan.
+ */
+bool
+hashgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool res;
+
+ /* Hash indexes are always lossy since we store only the hash code */
+ scan->xs_recheck = true;
+
+ /*
+ * If we've already initialized this scan, we can just advance it in the
+ * appropriate direction. If we haven't done so yet, we call a routine to
+ * get the first item in the scan.
+ */
+ if (!HashScanPosIsValid(so->currPos))
+ res = _hash_first(scan, dir);
+ else
+ {
+ /*
+ * Check to see if we should kill the previously-fetched tuple.
+ */
+ if (scan->kill_prior_tuple)
+ {
+ /*
+ * Yes, so remember it for later. (We'll deal with all such tuples
+ * at once right after leaving the index page or at end of scan.)
+ * If the caller reverses the indexscan direction, it is quite
+ * possible that the same item might get entered multiple times.
+ * But we don't detect that; instead, we just forget any excess
+ * entries.
+ */
+ if (so->killedItems == NULL)
+ so->killedItems = (int *)
+ palloc(MaxIndexTuplesPerPage * sizeof(int));
+
+ if (so->numKilled < MaxIndexTuplesPerPage)
+ so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+ }
+
+ /*
+ * Now continue the scan.
+ */
+ res = _hash_next(scan, dir);
+ }
+
+ return res;
+}
+
+
+/*
+ * hashgetbitmap() -- get all tuples at once
+ */
+int64
+hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool res;
+ int64 ntids = 0;
+ HashScanPosItem *currItem;
+
+ res = _hash_first(scan, ForwardScanDirection);
+
+ while (res)
+ {
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+
+ /*
+ * _hash_first and _hash_next eliminate dead index entries
+ * whenever scan->ignore_killed_tuples is true. Therefore, there's
+ * nothing to do here except add the results to the TIDBitmap.
+ */
+ tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
+ ntids++;
+
+ res = _hash_next(scan, ForwardScanDirection);
+ }
+
+ return ntids;
+}
+
+
+/*
+ * hashbeginscan() -- start a scan on a hash index
+ */
+IndexScanDesc
+hashbeginscan(Relation rel, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ HashScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
+ HashScanPosInvalidate(so->currPos);
+ so->hashso_bucket_buf = InvalidBuffer;
+ so->hashso_split_bucket_buf = InvalidBuffer;
+
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
+
+ so->killedItems = NULL;
+ so->numKilled = 0;
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+/*
+ * hashrescan() -- rescan an index relation
+ */
+void
+hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+
+ if (HashScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+ }
+
+ _hash_dropscanbuf(rel, so);
+
+ /* set position invalid (this will cause _hash_first call) */
+ HashScanPosInvalidate(so->currPos);
+
+ /* Update scan key, if a new one is given */
+ if (scankey && scan->numberOfKeys > 0)
+ {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
+}
+
+/*
+ * hashendscan() -- close down a scan
+ */
+void
+hashendscan(IndexScanDesc scan)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+
+ if (HashScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+ }
+
+ _hash_dropscanbuf(rel, so);
+
+ if (so->killedItems != NULL)
+ pfree(so->killedItems);
+ pfree(so);
+ scan->opaque = NULL;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * This function also deletes the tuples that were moved by a split to
+ * another bucket.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation rel = info->index;
+ double tuples_removed;
+ double num_index_tuples;
+ double orig_ntuples;
+ Bucket orig_maxbucket;
+ Bucket cur_maxbucket;
+ Bucket cur_bucket;
+ Buffer metabuf = InvalidBuffer;
+ HashMetaPage metap;
+ HashMetaPage cachedmetap;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
+
+ /*
+ * We need a copy of the metapage so that we can use its hashm_spares[]
+ * values to compute bucket page addresses, but a cached copy should be
+ * good enough. (If not, we'll detect that further down and refresh the
+ * cache as necessary.)
+ */
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(cachedmetap != NULL);
+
+ orig_maxbucket = cachedmetap->hashm_maxbucket;
+ orig_ntuples = cachedmetap->hashm_ntuples;
+
+ /* Scan the buckets that we know exist */
+ cur_bucket = 0;
+ cur_maxbucket = orig_maxbucket;
+
+loop_top:
+ while (cur_bucket <= cur_maxbucket)
+ {
+ BlockNumber bucket_blkno;
+ BlockNumber blkno;
+ Buffer bucket_buf;
+ Buffer buf;
+ HashPageOpaque bucket_opaque;
+ Page page;
+ bool split_cleanup = false;
+
+ /* Get address of bucket's start page */
+ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
+
+ blkno = bucket_blkno;
+
+ /*
+ * We need to acquire a cleanup lock on the primary bucket page to wait
+ * out concurrent scans before deleting the dead tuples.
+ */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+ LockBufferForCleanup(buf);
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
+
+ page = BufferGetPage(buf);
+ bucket_opaque = HashPageGetOpaque(page);
+
+ /*
+ * If the bucket contains tuples that were moved by a split, then we
+ * need to delete them. We can't delete such tuples while the split
+ * operation on the bucket is unfinished, as they are still needed by
+ * scans.
+ */
+ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
+ H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
+ {
+ split_cleanup = true;
+
+ /*
+ * This bucket might have been split since we last held a lock on
+ * the metapage. If so, hashm_maxbucket, hashm_highmask and
+ * hashm_lowmask might be old enough to cause us to fail to remove
+ * tuples left behind by the most recent split. To prevent that,
+ * now that the primary page of the target bucket has been locked
+ * (and thus can't be further split), check whether we need to
+ * update our cached metapage data.
+ */
+ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
+ if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+ {
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ }
+ }
+
+ bucket_buf = buf;
+
+ hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
+ cachedmetap->hashm_maxbucket,
+ cachedmetap->hashm_highmask,
+ cachedmetap->hashm_lowmask, &tuples_removed,
+ &num_index_tuples, split_cleanup,
+ callback, callback_state);
+
+ _hash_dropbuf(rel, bucket_buf);
+
+ /* Advance to next bucket */
+ cur_bucket++;
+ }
+
+ if (BufferIsInvalid(metabuf))
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
+ /* Write-lock metapage and check for split since we started */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+ if (cur_maxbucket != metap->hashm_maxbucket)
+ {
+ /* There's been a split, so process the additional bucket(s) */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ cur_maxbucket = cachedmetap->hashm_maxbucket;
+ goto loop_top;
+ }
+
+ /* Okay, we're really done. Update tuple count in metapage. */
+ START_CRIT_SECTION();
+
+ if (orig_maxbucket == metap->hashm_maxbucket &&
+ orig_ntuples == metap->hashm_ntuples)
+ {
+ /*
+ * No one has split or inserted anything since start of scan, so
+ * believe our count as gospel.
+ */
+ metap->hashm_ntuples = num_index_tuples;
+ }
+ else
+ {
+ /*
+ * Otherwise, our count is untrustworthy since we may have
+ * double-scanned tuples in split buckets. Proceed by dead-reckoning.
+ * (Note: we still return estimated_count = false, because using this
+ * count is better than not updating reltuples at all.)
+ */
+ if (metap->hashm_ntuples > tuples_removed)
+ metap->hashm_ntuples -= tuples_removed;
+ else
+ metap->hashm_ntuples = 0;
+ num_index_tuples = metap->hashm_ntuples;
+ }
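+
+ /*
+ * For example (illustrative numbers only): if the metapage recorded
+ * 10000 tuples at the start and this pass removed 500, we set
+ * hashm_ntuples to 9500 rather than trusting num_index_tuples, which
+ * may have double-counted tuples seen both in a bucket and in the
+ * bucket it was split into.
+ */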
+
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_update_meta_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.ntuples = metap->hashm_ntuples;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
+
+ XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _hash_relbuf(rel, metabuf);
+
+ /* return statistics */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ stats->estimated_count = false;
+ stats->num_index_tuples = num_index_tuples;
+ stats->tuples_removed += tuples_removed;
+ /* hashvacuumcleanup will fill in num_pages */
+
+ return stats;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ Relation rel = info->index;
+ BlockNumber num_pages;
+
+ /* If hashbulkdelete wasn't called, return NULL signifying no change */
+ /* Note: this covers the analyze_only case too */
+ if (stats == NULL)
+ return NULL;
+
+ /* update statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+ stats->num_pages = num_pages;
+
+ return stats;
+}
+
+/*
+ * Helper function to perform deletion of index entries from a bucket.
+ *
+ * This function expects that the caller has acquired a cleanup lock on the
+ * primary bucket page, and will return with a write lock again held on the
+ * primary bucket page. The lock won't necessarily be held continuously,
+ * though, because we'll release it when visiting overflow pages.
+ *
+ * There can't be any concurrent scans in progress when we first enter this
+ * function because of the cleanup lock we hold on the primary bucket page,
+ * but as soon as we release that lock, there might be. If those scans got
+ * ahead of our cleanup scan, they might see a tuple before we kill it and
+ * wake up only after VACUUM has completed and the TID has been recycled for
+ * an unrelated tuple. To avoid that calamity, we prevent scans from passing
+ * our cleanup scan by locking the next page in the bucket chain before
+ * releasing the lock on the previous page. (This type of lock chaining is not
+ * ideal, so we might want to look for a better solution at some point.)
+ *
+ * We need to retain a pin on the primary bucket to ensure that no concurrent
+ * split can start.
+ */
+void
+hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
+ BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool split_cleanup,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ BlockNumber blkno;
+ Buffer buf;
+ Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
+ bool bucket_dirty = false;
+
+ blkno = bucket_blkno;
+ buf = bucket_buf;
+
+ if (split_cleanup)
+ new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
+ lowmask, maxbucket);
+
+ /* Scan each page in bucket */
+ for (;;)
+ {
+ HashPageOpaque opaque;
+ OffsetNumber offno;
+ OffsetNumber maxoffno;
+ Buffer next_buf;
+ Page page;
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ bool retain_pin = false;
+ bool clear_dead_marking = false;
+
+ vacuum_delay_point();
+
+ page = BufferGetPage(buf);
+ opaque = HashPageGetOpaque(page);
+
+ /* Scan each tuple in page */
+ maxoffno = PageGetMaxOffsetNumber(page);
+ for (offno = FirstOffsetNumber;
+ offno <= maxoffno;
+ offno = OffsetNumberNext(offno))
+ {
+ ItemPointer htup;
+ IndexTuple itup;
+ Bucket bucket;
+ bool kill_tuple = false;
+
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, offno));
+ htup = &(itup->t_tid);
+
+ /*
+ * To remove the dead tuples, we strictly want to rely on the results
+ * of the callback function; see btvacuumpage for the detailed reason.
+ */
+ if (callback && callback(htup, callback_state))
+ {
+ kill_tuple = true;
+ if (tuples_removed)
+ *tuples_removed += 1;
+ }
+ else if (split_cleanup)
+ {
+ /* delete the tuples that were moved by a split */
+ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+ maxbucket,
+ highmask,
+ lowmask);
+ /* mark the item for deletion */
+ if (bucket != cur_bucket)
+ {
+ /*
+ * We expect tuples to belong either to the current bucket or
+ * to new_bucket. This is ensured because we don't allow
+ * further splits of a bucket that contains garbage. See
+ * comments in _hash_expandtable.
+ */
+ Assert(bucket == new_bucket);
+ kill_tuple = true;
+ }
+ }
+
+ if (kill_tuple)
+ {
+ /* mark the item for deletion */
+ deletable[ndeletable++] = offno;
+ }
+ else
+ {
+ /* we're keeping it, so count it */
+ if (num_index_tuples)
+ *num_index_tuples += 1;
+ }
+ }
+
+ /* retain the pin on primary bucket page till end of bucket scan */
+ if (blkno == bucket_blkno)
+ retain_pin = true;
+ else
+ retain_pin = false;
+
+ blkno = opaque->hasho_nextblkno;
+
+ /*
+ * Apply deletions, advance to next page and write page if needed.
+ */
+ if (ndeletable > 0)
+ {
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+ bucket_dirty = true;
+
+ /*
+ * If vacuum removed any DEAD tuples from the index page, mark the
+ * page as clean by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
+ */
+ if (tuples_removed && *tuples_removed > 0 &&
+ H_HAS_DEAD_TUPLES(opaque))
+ {
+ opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+ clear_dead_marking = true;
+ }
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_delete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.clear_dead_marking = clear_dead_marking;
+ xlrec.is_primary_bucket_page = (buf == bucket_buf);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
+
+ /*
+ * The bucket buffer needs to be registered to ensure that we can
+ * acquire a cleanup lock on it during replay.
+ */
+ if (!xlrec.is_primary_bucket_page)
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
+
+ XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(1, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
+ PageSetLSN(BufferGetPage(buf), recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /* bail out if there are no more pages to scan. */
+ if (!BlockNumberIsValid(blkno))
+ break;
+
+ next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+ LH_OVERFLOW_PAGE,
+ bstrategy);
+
+ /*
+ * Release the lock on the previous page only after acquiring the
+ * lock on the next page.
+ */
+ if (retain_pin)
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, buf);
+
+ buf = next_buf;
+ }
+
+ /*
+ * Lock the bucket page to clear the garbage flag and squeeze the bucket.
+ * If the current buffer is the same as the bucket buffer, then we already
+ * hold a lock on the bucket page.
+ */
+ if (buf != bucket_buf)
+ {
+ _hash_relbuf(rel, buf);
+ LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
+ }
+
+ /*
+ * Clear the garbage flag from the bucket after deleting the tuples that
+ * were moved by a split. We purposefully clear the flag before squeezing
+ * the bucket, so that after a restart, vacuum won't try again to delete
+ * the moved-by-split tuples.
+ */
+ if (split_cleanup)
+ {
+ HashPageOpaque bucket_opaque;
+ Page page;
+
+ page = BufferGetPage(bucket_buf);
+ bucket_opaque = HashPageGetOpaque(page);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+ MarkBufferDirty(bucket_buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+ * If we have deleted anything, try to compact free space. To squeeze
+ * the bucket we must hold a cleanup lock, else we could disturb the
+ * ordering of tuples for a scan that started before us.
+ */
+ if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
+ _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
+ bstrategy);
+ else
+ LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
+}