Diffstat (limited to 'src/backend/access/hash/hash.c')
-rw-r--r--   src/backend/access/hash/hash.c   920
1 file changed, 920 insertions, 0 deletions
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
new file mode 100644
index 0000000..fc5d97f
--- /dev/null
+++ b/src/backend/access/hash/hash.c
@@ -0,0 +1,920 @@
+/*-------------------------------------------------------------------------
+ *
+ * hash.c
+ * Implementation of Margo Seltzer's Hashing package for postgres.
+ *
+ * Portions Copyright (c) 1996-2023, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ * src/backend/access/hash/hash.c
+ *
+ * NOTES
+ * This file contains only the public interface routines.
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "access/hash.h"
+#include "access/hash_xlog.h"
+#include "access/relscan.h"
+#include "access/tableam.h"
+#include "access/xloginsert.h"
+#include "catalog/index.h"
+#include "commands/progress.h"
+#include "commands/vacuum.h"
+#include "miscadmin.h"
+#include "optimizer/plancat.h"
+#include "pgstat.h"
+#include "utils/builtins.h"
+#include "utils/index_selfuncs.h"
+#include "utils/rel.h"
+
+/* Working state for hashbuild and its callback */
+typedef struct
+{
+ HSpool *spool; /* NULL if not using spooling */
+ double indtuples; /* # tuples accepted into index */
+ Relation heapRel; /* heap relation descriptor */
+} HashBuildState;
+
+static void hashbuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state);
+
+
+/*
+ * Hash handler function: return IndexAmRoutine with access method parameters
+ * and callbacks.
+ */
+Datum
+hashhandler(PG_FUNCTION_ARGS)
+{
+ IndexAmRoutine *amroutine = makeNode(IndexAmRoutine);
+
+ amroutine->amstrategies = HTMaxStrategyNumber;
+ amroutine->amsupport = HASHNProcs;
+ amroutine->amoptsprocnum = HASHOPTIONS_PROC;
+ amroutine->amcanorder = false;
+ amroutine->amcanorderbyop = false;
+ amroutine->amcanbackward = true;
+ amroutine->amcanunique = false;
+ amroutine->amcanmulticol = false;
+ amroutine->amoptionalkey = false;
+ amroutine->amsearcharray = false;
+ amroutine->amsearchnulls = false;
+ amroutine->amstorage = false;
+ amroutine->amclusterable = false;
+ amroutine->ampredlocks = true;
+ amroutine->amcanparallel = false;
+ amroutine->amcaninclude = false;
+ amroutine->amusemaintenanceworkmem = false;
+ amroutine->amsummarizing = false;
+ amroutine->amparallelvacuumoptions =
+ VACUUM_OPTION_PARALLEL_BULKDEL;
+ amroutine->amkeytype = INT4OID;
+
+ amroutine->ambuild = hashbuild;
+ amroutine->ambuildempty = hashbuildempty;
+ amroutine->aminsert = hashinsert;
+ amroutine->ambulkdelete = hashbulkdelete;
+ amroutine->amvacuumcleanup = hashvacuumcleanup;
+ amroutine->amcanreturn = NULL;
+ amroutine->amcostestimate = hashcostestimate;
+ amroutine->amoptions = hashoptions;
+ amroutine->amproperty = NULL;
+ amroutine->ambuildphasename = NULL;
+ amroutine->amvalidate = hashvalidate;
+ amroutine->amadjustmembers = hashadjustmembers;
+ amroutine->ambeginscan = hashbeginscan;
+ amroutine->amrescan = hashrescan;
+ amroutine->amgettuple = hashgettuple;
+ amroutine->amgetbitmap = hashgetbitmap;
+ amroutine->amendscan = hashendscan;
+ amroutine->ammarkpos = NULL;
+ amroutine->amrestrpos = NULL;
+ amroutine->amestimateparallelscan = NULL;
+ amroutine->aminitparallelscan = NULL;
+ amroutine->amparallelrescan = NULL;
+
+ PG_RETURN_POINTER(amroutine);
+}
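+
+/*
+ * Illustrative usage sketch, assuming a hypothetical table "accounts":
+ * the handler above is what the system reaches when a hash index is
+ * created and queried with ordinary SQL, e.g.
+ *
+ *		CREATE INDEX accounts_id_hash ON accounts USING hash (id);
+ *		SELECT * FROM accounts WHERE id = 42;	-- only equality is supported
+ */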
+
+/*
+ * hashbuild() -- build a new hash index.
+ */
+IndexBuildResult *
+hashbuild(Relation heap, Relation index, IndexInfo *indexInfo)
+{
+ IndexBuildResult *result;
+ BlockNumber relpages;
+ double reltuples;
+ double allvisfrac;
+ uint32 num_buckets;
+ long sort_threshold;
+ HashBuildState buildstate;
+
+ /*
+ * We expect to be called exactly once for any index relation. If that's
+ * not the case, big trouble's what we have.
+ */
+ if (RelationGetNumberOfBlocks(index) != 0)
+ elog(ERROR, "index \"%s\" already contains data",
+ RelationGetRelationName(index));
+
+ /* Estimate the number of rows currently present in the table */
+ estimate_rel_size(heap, NULL, &relpages, &reltuples, &allvisfrac);
+
+ /* Initialize the hash index metadata page and initial buckets */
+ num_buckets = _hash_init(index, reltuples, MAIN_FORKNUM);
+
+ /*
+ * If we just insert the tuples into the index in scan order, then
+ * (assuming their hash codes are pretty random) there will be no locality
+ * of access to the index, and if the index is bigger than available RAM
+ * then we'll thrash horribly. To prevent that scenario, we can sort the
+ * tuples by (expected) bucket number. However, such a sort is useless
+ * overhead when the index does fit in RAM. We choose to sort if the
+ * initial index size exceeds maintenance_work_mem, or the number of
+ * buffers usable for the index, whichever is less. (Limiting by the
+ * number of buffers should reduce thrashing between PG buffers and kernel
+ * buffers, which seems useful even if no physical I/O results. Limiting
+ * by maintenance_work_mem is useful to allow easy testing of the sort
+ * code path, and may be useful to DBAs as an additional control knob.)
+ *
+ * NOTE: this test will need adjustment if a bucket is ever different from
+ * one page. Also, "initial index size" accounting does not include the
+ * metapage, nor the first bitmap page.
+ */
+ sort_threshold = (maintenance_work_mem * 1024L) / BLCKSZ;
+ if (index->rd_rel->relpersistence != RELPERSISTENCE_TEMP)
+ sort_threshold = Min(sort_threshold, NBuffers);
+ else
+ sort_threshold = Min(sort_threshold, NLocBuffer);
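+
+ /*
+ * Worked example (assuming the default BLCKSZ of 8kB and the default
+ * maintenance_work_mem of 64MB, i.e. 65536kB): the starting threshold is
+ * 65536 * 1024 / 8192 = 8192 bucket pages, which is then capped by
+ * NBuffers (or by NLocBuffer for a temporary index).
+ */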
+
+ if (num_buckets >= (uint32) sort_threshold)
+ buildstate.spool = _h_spoolinit(heap, index, num_buckets);
+ else
+ buildstate.spool = NULL;
+
+ /* prepare to build the index */
+ buildstate.indtuples = 0;
+ buildstate.heapRel = heap;
+
+ /* do the heap scan */
+ reltuples = table_index_build_scan(heap, index, indexInfo, true, true,
+ hashbuildCallback,
+ (void *) &buildstate, NULL);
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_TOTAL,
+ buildstate.indtuples);
+
+ if (buildstate.spool)
+ {
+ /* sort the tuples and insert them into the index */
+ _h_indexbuild(buildstate.spool, buildstate.heapRel);
+ _h_spooldestroy(buildstate.spool);
+ }
+
+ /*
+ * Return statistics
+ */
+ result = (IndexBuildResult *) palloc(sizeof(IndexBuildResult));
+
+ result->heap_tuples = reltuples;
+ result->index_tuples = buildstate.indtuples;
+
+ return result;
+}
+
+/*
+ * hashbuildempty() -- build an empty hash index in the initialization fork
+ */
+void
+hashbuildempty(Relation index)
+{
+ _hash_init(index, 0, INIT_FORKNUM);
+}
+
+/*
+ * Per-tuple callback for table_index_build_scan
+ */
+static void
+hashbuildCallback(Relation index,
+ ItemPointer tid,
+ Datum *values,
+ bool *isnull,
+ bool tupleIsAlive,
+ void *state)
+{
+ HashBuildState *buildstate = (HashBuildState *) state;
+ Datum index_values[1];
+ bool index_isnull[1];
+ IndexTuple itup;
+
+ /* convert data to a hash key; on failure, do not insert anything */
+ if (!_hash_convert_tuple(index,
+ values, isnull,
+ index_values, index_isnull))
+ return;
+
+ /* Either spool the tuple for sorting, or just put it into the index */
+ if (buildstate->spool)
+ _h_spool(buildstate->spool, tid, index_values, index_isnull);
+ else
+ {
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_form_tuple(RelationGetDescr(index),
+ index_values, index_isnull);
+ itup->t_tid = *tid;
+ _hash_doinsert(index, itup, buildstate->heapRel, false);
+ pfree(itup);
+ }
+
+ buildstate->indtuples += 1;
+}
+
+/*
+ * hashinsert() -- insert an index tuple into a hash table.
+ *
+ * Hash on the heap tuple's key, form an index tuple with hash code.
+ * Find the appropriate location for the new tuple, and put it there.
+ */
+bool
+hashinsert(Relation rel, Datum *values, bool *isnull,
+ ItemPointer ht_ctid, Relation heapRel,
+ IndexUniqueCheck checkUnique,
+ bool indexUnchanged,
+ IndexInfo *indexInfo)
+{
+ Datum index_values[1];
+ bool index_isnull[1];
+ IndexTuple itup;
+
+ /* convert data to a hash key; on failure, do not insert anything */
+ if (!_hash_convert_tuple(rel,
+ values, isnull,
+ index_values, index_isnull))
+ return false;
+
+ /* form an index tuple and point it at the heap tuple */
+ itup = index_form_tuple(RelationGetDescr(rel), index_values, index_isnull);
+ itup->t_tid = *ht_ctid;
+
+ _hash_doinsert(rel, itup, heapRel, false);
+
+ pfree(itup);
+
+ return false;
+}
+
+
+/*
+ * hashgettuple() -- Get the next tuple in the scan.
+ */
+bool
+hashgettuple(IndexScanDesc scan, ScanDirection dir)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool res;
+
+ /* Hash indexes are always lossy since we store only the hash code */
+ scan->xs_recheck = true;
+
+ /*
+ * If we've already initialized this scan, we can just advance it in the
+ * appropriate direction. If we haven't done so yet, we call a routine to
+ * get the first item in the scan.
+ */
+ if (!HashScanPosIsValid(so->currPos))
+ res = _hash_first(scan, dir);
+ else
+ {
+ /*
+ * Check to see if we should kill the previously-fetched tuple.
+ */
+ if (scan->kill_prior_tuple)
+ {
+ /*
+ * Yes, so remember it for later. (We'll deal with all such tuples
+ * at once right after leaving the index page or at end of scan.)
+ * If the caller reverses the indexscan direction, it is quite
+ * possible that the same item might get entered multiple times.
+ * But we don't detect that; instead, we just forget any excess
+ * entries.
+ */
+ if (so->killedItems == NULL)
+ so->killedItems = (int *)
+ palloc(MaxIndexTuplesPerPage * sizeof(int));
+
+ if (so->numKilled < MaxIndexTuplesPerPage)
+ so->killedItems[so->numKilled++] = so->currPos.itemIndex;
+ }
+
+ /*
+ * Now continue the scan.
+ */
+ res = _hash_next(scan, dir);
+ }
+
+ return res;
+}
+
+
+/*
+ * hashgetbitmap() -- get all tuples at once
+ */
+int64
+hashgetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ bool res;
+ int64 ntids = 0;
+ HashScanPosItem *currItem;
+
+ res = _hash_first(scan, ForwardScanDirection);
+
+ while (res)
+ {
+ currItem = &so->currPos.items[so->currPos.itemIndex];
+
+ /*
+ * _hash_first and _hash_next eliminate dead index entries
+ * whenever scan->ignore_killed_tuples is true. Therefore, there's
+ * nothing to do here except add the results to the TIDBitmap.
+ */
+ tbm_add_tuples(tbm, &(currItem->heapTid), 1, true);
+ ntids++;
+
+ res = _hash_next(scan, ForwardScanDirection);
+ }
+
+ return ntids;
+}
+
+
+/*
+ * hashbeginscan() -- start a scan on a hash index
+ */
+IndexScanDesc
+hashbeginscan(Relation rel, int nkeys, int norderbys)
+{
+ IndexScanDesc scan;
+ HashScanOpaque so;
+
+ /* no order by operators allowed */
+ Assert(norderbys == 0);
+
+ scan = RelationGetIndexScan(rel, nkeys, norderbys);
+
+ so = (HashScanOpaque) palloc(sizeof(HashScanOpaqueData));
+ HashScanPosInvalidate(so->currPos);
+ so->hashso_bucket_buf = InvalidBuffer;
+ so->hashso_split_bucket_buf = InvalidBuffer;
+
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
+
+ so->killedItems = NULL;
+ so->numKilled = 0;
+
+ scan->opaque = so;
+
+ return scan;
+}
+
+/*
+ * hashrescan() -- rescan an index relation
+ */
+void
+hashrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
+ ScanKey orderbys, int norderbys)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+
+ if (HashScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+ }
+
+ _hash_dropscanbuf(rel, so);
+
+ /* set position invalid (this will cause _hash_first call) */
+ HashScanPosInvalidate(so->currPos);
+
+ /* Update scan key, if a new one is given */
+ if (scankey && scan->numberOfKeys > 0)
+ {
+ memmove(scan->keyData,
+ scankey,
+ scan->numberOfKeys * sizeof(ScanKeyData));
+ }
+
+ so->hashso_buc_populated = false;
+ so->hashso_buc_split = false;
+}
+
+/*
+ * hashendscan() -- close down a scan
+ */
+void
+hashendscan(IndexScanDesc scan)
+{
+ HashScanOpaque so = (HashScanOpaque) scan->opaque;
+ Relation rel = scan->indexRelation;
+
+ if (HashScanPosIsValid(so->currPos))
+ {
+ /* Before leaving current page, deal with any killed items */
+ if (so->numKilled > 0)
+ _hash_kill_items(scan);
+ }
+
+ _hash_dropscanbuf(rel, so);
+
+ if (so->killedItems != NULL)
+ pfree(so->killedItems);
+ pfree(so);
+ scan->opaque = NULL;
+}
+
+/*
+ * Bulk deletion of all index entries pointing to a set of heap tuples.
+ * The set of target tuples is specified via a callback routine that tells
+ * whether any given heap tuple (identified by ItemPointer) is being deleted.
+ *
+ * This function also deletes the tuples that were moved by a split to
+ * another bucket.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+hashbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ Relation rel = info->index;
+ double tuples_removed;
+ double num_index_tuples;
+ double orig_ntuples;
+ Bucket orig_maxbucket;
+ Bucket cur_maxbucket;
+ Bucket cur_bucket;
+ Buffer metabuf = InvalidBuffer;
+ HashMetaPage metap;
+ HashMetaPage cachedmetap;
+
+ tuples_removed = 0;
+ num_index_tuples = 0;
+
+ /*
+ * We need a copy of the metapage so that we can use its hashm_spares[]
+ * values to compute bucket page addresses, but a cached copy should be
+ * good enough. (If not, we'll detect that further down and refresh the
+ * cache as necessary.)
+ */
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, false);
+ Assert(cachedmetap != NULL);
+
+ orig_maxbucket = cachedmetap->hashm_maxbucket;
+ orig_ntuples = cachedmetap->hashm_ntuples;
+
+ /* Scan the buckets that we know exist */
+ cur_bucket = 0;
+ cur_maxbucket = orig_maxbucket;
+
+loop_top:
+ while (cur_bucket <= cur_maxbucket)
+ {
+ BlockNumber bucket_blkno;
+ BlockNumber blkno;
+ Buffer bucket_buf;
+ Buffer buf;
+ HashPageOpaque bucket_opaque;
+ Page page;
+ bool split_cleanup = false;
+
+ /* Get address of bucket's start page */
+ bucket_blkno = BUCKET_TO_BLKNO(cachedmetap, cur_bucket);
+
+ blkno = bucket_blkno;
+
+ /*
+ * We need to acquire a cleanup lock on the primary bucket page to wait
+ * out concurrent scans before deleting the dead tuples.
+ */
+ buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL, info->strategy);
+ LockBufferForCleanup(buf);
+ _hash_checkpage(rel, buf, LH_BUCKET_PAGE);
+
+ page = BufferGetPage(buf);
+ bucket_opaque = HashPageGetOpaque(page);
+
+ /*
+ * If the bucket contains tuples that were moved by a split, then we
+ * need to delete them. We can't delete such tuples while the split
+ * operation on the bucket is unfinished, as they are still needed by
+ * scans.
+ */
+ if (!H_BUCKET_BEING_SPLIT(bucket_opaque) &&
+ H_NEEDS_SPLIT_CLEANUP(bucket_opaque))
+ {
+ split_cleanup = true;
+
+ /*
+ * This bucket might have been split since we last held a lock on
+ * the metapage. If so, hashm_maxbucket, hashm_highmask and
+ * hashm_lowmask might be old enough to cause us to fail to remove
+ * tuples left behind by the most recent split. To prevent that,
+ * now that the primary page of the target bucket has been locked
+ * (and thus can't be further split), check whether we need to
+ * update our cached metapage data.
+ */
+ Assert(bucket_opaque->hasho_prevblkno != InvalidBlockNumber);
+ if (bucket_opaque->hasho_prevblkno > cachedmetap->hashm_maxbucket)
+ {
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ }
+ }
+
+ bucket_buf = buf;
+
+ hashbucketcleanup(rel, cur_bucket, bucket_buf, blkno, info->strategy,
+ cachedmetap->hashm_maxbucket,
+ cachedmetap->hashm_highmask,
+ cachedmetap->hashm_lowmask, &tuples_removed,
+ &num_index_tuples, split_cleanup,
+ callback, callback_state);
+
+ _hash_dropbuf(rel, bucket_buf);
+
+ /* Advance to next bucket */
+ cur_bucket++;
+ }
+
+ if (BufferIsInvalid(metabuf))
+ metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+
+ /* Write-lock metapage and check for split since we started */
+ LockBuffer(metabuf, BUFFER_LOCK_EXCLUSIVE);
+ metap = HashPageGetMeta(BufferGetPage(metabuf));
+
+ if (cur_maxbucket != metap->hashm_maxbucket)
+ {
+ /* There's been a split, so process the additional bucket(s) */
+ LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
+ cachedmetap = _hash_getcachedmetap(rel, &metabuf, true);
+ Assert(cachedmetap != NULL);
+ cur_maxbucket = cachedmetap->hashm_maxbucket;
+ goto loop_top;
+ }
+
+ /* Okay, we're really done. Update tuple count in metapage. */
+ START_CRIT_SECTION();
+
+ if (orig_maxbucket == metap->hashm_maxbucket &&
+ orig_ntuples == metap->hashm_ntuples)
+ {
+ /*
+ * No one has split or inserted anything since start of scan, so
+ * believe our count as gospel.
+ */
+ metap->hashm_ntuples = num_index_tuples;
+ }
+ else
+ {
+ /*
+ * Otherwise, our count is untrustworthy since we may have
+ * double-scanned tuples in split buckets. Proceed by dead-reckoning.
+ * (Note: we still return estimated_count = false, because using this
+ * count is better than not updating reltuples at all.)
+ */
+ if (metap->hashm_ntuples > tuples_removed)
+ metap->hashm_ntuples -= tuples_removed;
+ else
+ metap->hashm_ntuples = 0;
+ num_index_tuples = metap->hashm_ntuples;
+ }
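+
+ /*
+ * For example (illustrative numbers only): if the metapage recorded
+ * 10000 tuples at the start and this pass removed 500, we set
+ * hashm_ntuples to 9500 rather than trusting num_index_tuples, which
+ * may have double-counted tuples seen both in a bucket and in the
+ * bucket it was split into.
+ */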
+
+ MarkBufferDirty(metabuf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_update_meta_page xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.ntuples = metap->hashm_ntuples;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashUpdateMetaPage);
+
+ XLogRegisterBuffer(0, metabuf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_UPDATE_META_PAGE);
+ PageSetLSN(BufferGetPage(metabuf), recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ _hash_relbuf(rel, metabuf);
+
+ /* return statistics */
+ if (stats == NULL)
+ stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
+ stats->estimated_count = false;
+ stats->num_index_tuples = num_index_tuples;
+ stats->tuples_removed += tuples_removed;
+ /* hashvacuumcleanup will fill in num_pages */
+
+ return stats;
+}
+
+/*
+ * Post-VACUUM cleanup.
+ *
+ * Result: a palloc'd struct containing statistical info for VACUUM displays.
+ */
+IndexBulkDeleteResult *
+hashvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
+{
+ Relation rel = info->index;
+ BlockNumber num_pages;
+
+ /* If hashbulkdelete wasn't called, return NULL signifying no change */
+ /* Note: this covers the analyze_only case too */
+ if (stats == NULL)
+ return NULL;
+
+ /* update statistics */
+ num_pages = RelationGetNumberOfBlocks(rel);
+ stats->num_pages = num_pages;
+
+ return stats;
+}
+
+/*
+ * Helper function to perform deletion of index entries from a bucket.
+ *
+ * This function expects that the caller has acquired a cleanup lock on the
+ * primary bucket page, and will return with a write lock again held on the
+ * primary bucket page. The lock won't necessarily be held continuously,
+ * though, because we'll release it when visiting overflow pages.
+ *
+ * There can't be any concurrent scans in progress when we first enter this
+ * function because of the cleanup lock we hold on the primary bucket page,
+ * but as soon as we release that lock, there might be. If those scans got
+ * ahead of our cleanup scan, they might see a tuple before we kill it and
+ * wake up only after VACUUM has completed and the TID has been recycled for
+ * an unrelated tuple. To avoid that calamity, we prevent scans from passing
+ * our cleanup scan by locking the next page in the bucket chain before
+ * releasing the lock on the previous page. (This type of lock chaining is not
+ * ideal, so we might want to look for a better solution at some point.)
+ *
+ * We need to retain a pin on the primary bucket to ensure that no concurrent
+ * split can start.
+ */
+void
+hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf,
+ BlockNumber bucket_blkno, BufferAccessStrategy bstrategy,
+ uint32 maxbucket, uint32 highmask, uint32 lowmask,
+ double *tuples_removed, double *num_index_tuples,
+ bool split_cleanup,
+ IndexBulkDeleteCallback callback, void *callback_state)
+{
+ BlockNumber blkno;
+ Buffer buf;
+ Bucket new_bucket PG_USED_FOR_ASSERTS_ONLY = InvalidBucket;
+ bool bucket_dirty = false;
+
+ blkno = bucket_blkno;
+ buf = bucket_buf;
+
+ if (split_cleanup)
+ new_bucket = _hash_get_newbucket_from_oldbucket(rel, cur_bucket,
+ lowmask, maxbucket);
+
+ /* Scan each page in bucket */
+ for (;;)
+ {
+ HashPageOpaque opaque;
+ OffsetNumber offno;
+ OffsetNumber maxoffno;
+ Buffer next_buf;
+ Page page;
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+ bool retain_pin = false;
+ bool clear_dead_marking = false;
+
+ vacuum_delay_point();
+
+ page = BufferGetPage(buf);
+ opaque = HashPageGetOpaque(page);
+
+ /* Scan each tuple in page */
+ maxoffno = PageGetMaxOffsetNumber(page);
+ for (offno = FirstOffsetNumber;
+ offno <= maxoffno;
+ offno = OffsetNumberNext(offno))
+ {
+ ItemPointer htup;
+ IndexTuple itup;
+ Bucket bucket;
+ bool kill_tuple = false;
+
+ itup = (IndexTuple) PageGetItem(page,
+ PageGetItemId(page, offno));
+ htup = &(itup->t_tid);
+
+ /*
+ * To remove the dead tuples, we strictly want to rely on the results
+ * of the callback function; see btvacuumpage for the detailed reason.
+ */
+ if (callback && callback(htup, callback_state))
+ {
+ kill_tuple = true;
+ if (tuples_removed)
+ *tuples_removed += 1;
+ }
+ else if (split_cleanup)
+ {
+ /* delete the tuples that were moved by a split */
+ bucket = _hash_hashkey2bucket(_hash_get_indextuple_hashkey(itup),
+ maxbucket,
+ highmask,
+ lowmask);
+ /* mark the item for deletion */
+ if (bucket != cur_bucket)
+ {
+ /*
+ * We expect tuples to belong either to the current bucket or
+ * to new_bucket. This is ensured because we don't allow
+ * further splits of a bucket that contains garbage. See
+ * comments in _hash_expandtable.
+ */
+ Assert(bucket == new_bucket);
+ kill_tuple = true;
+ }
+ }
+
+ if (kill_tuple)
+ {
+ /* mark the item for deletion */
+ deletable[ndeletable++] = offno;
+ }
+ else
+ {
+ /* we're keeping it, so count it */
+ if (num_index_tuples)
+ *num_index_tuples += 1;
+ }
+ }
+
+ /* retain the pin on primary bucket page till end of bucket scan */
+ if (blkno == bucket_blkno)
+ retain_pin = true;
+ else
+ retain_pin = false;
+
+ blkno = opaque->hasho_nextblkno;
+
+ /*
+ * Apply deletions, advance to next page and write page if needed.
+ */
+ if (ndeletable > 0)
+ {
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ PageIndexMultiDelete(page, deletable, ndeletable);
+ bucket_dirty = true;
+
+ /*
+ * If vacuum removed any DEAD tuples from the index page, mark the
+ * page as clean by clearing the LH_PAGE_HAS_DEAD_TUPLES flag.
+ */
+ if (tuples_removed && *tuples_removed > 0 &&
+ H_HAS_DEAD_TUPLES(opaque))
+ {
+ opaque->hasho_flag &= ~LH_PAGE_HAS_DEAD_TUPLES;
+ clear_dead_marking = true;
+ }
+
+ MarkBufferDirty(buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_hash_delete xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.clear_dead_marking = clear_dead_marking;
+ xlrec.is_primary_bucket_page = (buf == bucket_buf);
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfHashDelete);
+
+ /*
+ * The bucket buffer needs to be registered to ensure that we can
+ * acquire a cleanup lock on it during replay.
+ */
+ if (!xlrec.is_primary_bucket_page)
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD | REGBUF_NO_IMAGE);
+
+ XLogRegisterBuffer(1, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(1, (char *) deletable,
+ ndeletable * sizeof(OffsetNumber));
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_DELETE);
+ PageSetLSN(BufferGetPage(buf), recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /* bail out if there are no more pages to scan. */
+ if (!BlockNumberIsValid(blkno))
+ break;
+
+ next_buf = _hash_getbuf_with_strategy(rel, blkno, HASH_WRITE,
+ LH_OVERFLOW_PAGE,
+ bstrategy);
+
+ /*
+ * Release the lock on the previous page only after acquiring the
+ * lock on the next page.
+ */
+ if (retain_pin)
+ LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+ else
+ _hash_relbuf(rel, buf);
+
+ buf = next_buf;
+ }
+
+ /*
+ * Lock the bucket page to clear the garbage flag and squeeze the bucket.
+ * If the current buffer is the same as the bucket buffer, then we already
+ * hold a lock on the bucket page.
+ */
+ if (buf != bucket_buf)
+ {
+ _hash_relbuf(rel, buf);
+ LockBuffer(bucket_buf, BUFFER_LOCK_EXCLUSIVE);
+ }
+
+ /*
+ * Clear the garbage flag from the bucket after deleting the tuples that
+ * were moved by a split. We purposefully clear the flag before squeezing
+ * the bucket, so that after a restart, vacuum won't try again to delete
+ * the moved-by-split tuples.
+ */
+ if (split_cleanup)
+ {
+ HashPageOpaque bucket_opaque;
+ Page page;
+
+ page = BufferGetPage(bucket_buf);
+ bucket_opaque = HashPageGetOpaque(page);
+
+ /* No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ bucket_opaque->hasho_flag &= ~LH_BUCKET_NEEDS_SPLIT_CLEANUP;
+ MarkBufferDirty(bucket_buf);
+
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ XLogBeginInsert();
+ XLogRegisterBuffer(0, bucket_buf, REGBUF_STANDARD);
+
+ recptr = XLogInsert(RM_HASH_ID, XLOG_HASH_SPLIT_CLEANUP);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+ }
+
+ /*
+ * If we have deleted anything, try to compact free space. To squeeze
+ * the bucket we must hold a cleanup lock, else we could disturb the
+ * ordering of tuples for a scan that started before us.
+ */
+ if (bucket_dirty && IsBufferCleanupOK(bucket_buf))
+ _hash_squeezebucket(rel, cur_bucket, bucket_blkno, bucket_buf,
+ bstrategy);
+ else
+ LockBuffer(bucket_buf, BUFFER_LOCK_UNLOCK);
+}